diff --git a/.github/bot-pr-format-base.sh b/.github/bot-pr-format-base.sh index e72539a9d61..10be68353b9 100644 --- a/.github/bot-pr-format-base.sh +++ b/.github/bot-pr-format-base.sh @@ -3,8 +3,8 @@ source .github/bot-pr-base.sh EXTENSION_REGEX='\.(cuh?|hpp|hpp\.inc?|cpp)$' -FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp)/' -FORMAT_REGEX='^(common|examples|test)/' +FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp|common/unified)/' +FORMAT_REGEX='^(common/cuda_hip|examples|test)/' echo "Retrieving PR file list" PR_FILES=$(bot_get_all_changed_files ${PR_URL}) diff --git a/common/cuda_hip/components/reduction.hpp.inc b/common/cuda_hip/components/reduction.hpp.inc index 9b4ed4cc8c7..3853fca6d44 100644 --- a/common/cuda_hip/components/reduction.hpp.inc +++ b/common/cuda_hip/components/reduction.hpp.inc @@ -208,14 +208,15 @@ __device__ void reduce_array(size_type size, * * Computes a reduction using the add operation (+) on an array * `source` of any size. Has to be called a second time on `result` to reduce - * an array larger than `default_block_size`. + * an array larger than `default_reduce_block_size`. */ template -__global__ __launch_bounds__(default_block_size) void reduce_add_array( +__global__ __launch_bounds__(default_reduce_block_size) void reduce_add_array( size_type size, const ValueType* __restrict__ source, ValueType* __restrict__ result) { - __shared__ UninitializedArray block_sum; + __shared__ UninitializedArray + block_sum; reduce_array(size, source, static_cast(block_sum), [](const ValueType& x, const ValueType& y) { return x + y; }); diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp index 6b3a698768c..0e25671c58a 100644 --- a/common/unified/base/kernel_launch.hpp +++ b/common/unified/base/kernel_launch.hpp @@ -170,14 +170,13 @@ namespace GKO_DEVICE_NAMESPACE { template struct matrix_accessor { ValueType* data; - size_type stride; + int64 stride; /** * @internal * Returns a reference to the element at position (row, col). */ - GKO_INLINE GKO_ATTRIBUTES ValueType& operator()(size_type row, - size_type col) + GKO_INLINE GKO_ATTRIBUTES ValueType& operator()(int64 row, int64 col) { return data[row * stride + col]; } @@ -187,7 +186,7 @@ struct matrix_accessor { * Returns a reference to the element at position idx in the underlying * storage. */ - GKO_INLINE GKO_ATTRIBUTES ValueType& operator[](size_type idx) + GKO_INLINE GKO_ATTRIBUTES ValueType& operator[](int64 idx) { return data[idx]; } @@ -223,7 +222,8 @@ struct to_device_type_impl*&> { using type = matrix_accessor>; static type map_to_device(matrix::Dense* mtx) { - return {as_device_type(mtx->get_values()), mtx->get_stride()}; + return {as_device_type(mtx->get_values()), + static_cast(mtx->get_stride())}; } }; @@ -232,7 +232,8 @@ struct to_device_type_impl*&> { using type = matrix_accessor>; static type map_to_device(const matrix::Dense* mtx) { - return {as_device_type(mtx->get_const_values()), mtx->get_stride()}; + return {as_device_type(mtx->get_const_values()), + static_cast(mtx->get_stride())}; } }; @@ -267,8 +268,6 @@ typename to_device_type_impl::type map_to_device(T&& param) } // namespace gko -// these files include this file again to make inclusion work from both sides, -// this does not lead to issues due to the header guards. 
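// Illustrative usage sketch (hypothetical caller, not part of this change):
// run_kernel passes every argument through map_to_device, so a Dense matrix
// arrives in the device lambda as a matrix_accessor with an int64 stride and
// is indexed as x(row, col) with int64 row/column indices, matching the
// backend-agnostic kernels in common/unified (e.g. dense_kernels.cpp below).
//
//     template <typename ValueType>
//     void scale_columns(std::shared_ptr<const DefaultExecutor> exec,
//                        const matrix::Dense<ValueType>* alpha,
//                        matrix::Dense<ValueType>* x)
//     {
//         run_kernel(
//             exec,
//             [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) {
//                 // row and col are int64, x is a matrix_accessor
//                 x(row, col) *= alpha[col];
//             },
//             x->get_size(), alpha->get_const_values(), x);
//     }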
#if defined(GKO_COMPILING_CUDA) #include "cuda/base/kernel_launch.cuh" #elif defined(GKO_COMPILING_HIP) diff --git a/common/unified/base/kernel_launch_reduction.hpp b/common/unified/base/kernel_launch_reduction.hpp new file mode 100644 index 00000000000..9eb65216416 --- /dev/null +++ b/common/unified/base/kernel_launch_reduction.hpp @@ -0,0 +1,51 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#define GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ + + +#include "common/unified/base/kernel_launch.hpp" + + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/base/kernel_launch_reduction.cuh" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/kernel_launch_reduction.hip.hpp" +#elif defined(GKO_COMPILING_DPCPP) +#include "dpcpp/base/kernel_launch_reduction.dp.hpp" +#elif defined(GKO_COMPILING_OMP) +#include "omp/base/kernel_launch_reduction.hpp" +#endif + + +#endif // GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ diff --git a/common/unified/base/kernel_launch_solver.hpp b/common/unified/base/kernel_launch_solver.hpp index 6c8a1296b83..716bd94a093 100644 --- a/common/unified/base/kernel_launch_solver.hpp +++ b/common/unified/base/kernel_launch_solver.hpp @@ -63,7 +63,7 @@ struct default_stride_dense_wrapper { template struct device_unpack_solver_impl { using type = T; - static GKO_INLINE GKO_ATTRIBUTES type unpack(T param, size_type) + static GKO_INLINE GKO_ATTRIBUTES type unpack(T param, int64) { return param; } @@ -72,8 +72,8 @@ struct device_unpack_solver_impl { template struct device_unpack_solver_impl> { using type = matrix_accessor; - static GKO_INLINE GKO_ATTRIBUTES type unpack( - default_stride_dense_wrapper param, size_type default_stride) + static GKO_INLINE GKO_ATTRIBUTES type + unpack(default_stride_dense_wrapper param, int64 default_stride) { return {param.data, default_stride}; } diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp index a06d8e1eef2..a3e90576ced 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.cpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" namespace gko { @@ -220,6 +221,60 @@ void sub_scaled_diag(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); +template +void compute_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return x(i, j) * y(i, j); + }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(), + x->get_size(), x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); + + +template +void compute_conj_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return conj(x(i, j)) * y(i, j); + }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(), + x->get_size(), x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); + + +template +void compute_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex{}, + result->get_values(), x->get_size(), x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); + + template void symm_permute(std::shared_ptr exec, const Array* permutation_indices, diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh index d55faed5053..5179a5cc27d 100644 --- a/cuda/base/kernel_launch.cuh +++ b/cuda/base/kernel_launch.cuh @@ -51,9 +51,9 @@ constexpr int default_block_size = 512; template __global__ __launch_bounds__(default_block_size) void generic_kernel_1d( - size_type size, KernelFunction fn, KernelArgs... args) + int64 size, KernelFunction fn, KernelArgs... args) { - auto tidx = thread::get_thread_id_flat(); + auto tidx = thread::get_thread_id_flat(); if (tidx >= size) { return; } @@ -63,9 +63,9 @@ __global__ __launch_bounds__(default_block_size) void generic_kernel_1d( template __global__ __launch_bounds__(default_block_size) void generic_kernel_2d( - size_type rows, size_type cols, KernelFunction fn, KernelArgs... args) + int64 rows, int64 cols, KernelFunction fn, KernelArgs... 
args) { - auto tidx = thread::get_thread_id_flat(); + auto tidx = thread::get_thread_id_flat(); auto col = tidx % cols; auto row = tidx / cols; if (row >= rows) { @@ -82,7 +82,7 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, gko::cuda::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size, block_size); - generic_kernel_1d<<>>(size, fn, + generic_kernel_1d<<>>(static_cast(size), fn, map_to_device(args)...); } @@ -93,8 +93,9 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, gko::cuda::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size[0] * size[1], block_size); - generic_kernel_2d<<>>(size[0], size[1], fn, - map_to_device(args)...); + generic_kernel_2d<<>>(static_cast(size[0]), + static_cast(size[1]), + fn, map_to_device(args)...); } diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh new file mode 100644 index 00000000000..30f7fa1ba96 --- /dev/null +++ b/cuda/base/kernel_launch_reduction.cuh @@ -0,0 +1,529 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" +#endif + + +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/device_guard.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { + + +template +__global__ __launch_bounds__( + default_block_size) void generic_kernel_reduction_1d(int64 size, + KernelFunction fn, + ReductionOp op, + FinalizeOp finalize, + ValueType identity, + ValueType* storage, + KernelArgs... 
args) +{ + __shared__ + UninitializedArray + warp_partial; + static_assert(default_block_size / config::warp_size <= config::warp_size, + "needs third reduction level"); + auto tidx = thread::get_thread_id_flat(); + auto grid_size = thread::get_thread_num_flat(); + auto warp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (int64 i = tidx; i < size; i += grid_size) { + partial = op(partial, fn(i, args...)); + } + partial = reduce(warp, partial, op); + if (warp.thread_rank() == 0) { + warp_partial[threadIdx.x / config::warp_size] = partial; + } + __syncthreads(); + if (threadIdx.x < config::warp_size) { + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? warp_partial[threadIdx.x] + : identity, + op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } + } +} + + +template +__global__ __launch_bounds__( + default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols, + KernelFunction fn, + ReductionOp op, + FinalizeOp finalize, + ValueType identity, + ValueType* storage, + KernelArgs... args) +{ + __shared__ + UninitializedArray + warp_partial; + static_assert(default_block_size / config::warp_size <= config::warp_size, + "needs third reduction level"); + auto tidx = thread::get_thread_id_flat(); + auto grid_size = thread::get_thread_num_flat(); + auto warp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (int64 i = tidx; i < rows * cols; i += grid_size) { + const auto row = i / cols; + const auto col = i % cols; + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(warp, partial, op); + if (warp.thread_rank() == 0) { + warp_partial[threadIdx.x / config::warp_size] = partial; + } + __syncthreads(); + if (threadIdx.x < config::warp_size) { + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? warp_partial[threadIdx.x] + : identity, + op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } + } +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + KernelArgs&&... args) +{ + constexpr int oversubscription = 16; + gko::cuda::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto num_blocks = std::min( + ceildiv(size, block_size), exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + generic_kernel_reduction_1d<<>>( + static_cast(size), fn, op, + [] __device__(auto v) { return v; }, as_cuda_type(identity), + as_cuda_type(partial.get_data()), map_to_device(args)...); + generic_kernel_reduction_1d<<<1, block_size>>>( + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_cuda_type(identity), as_cuda_type(result), + as_cuda_type(partial.get_const_data())); + } else { + generic_kernel_reduction_1d<<<1, block_size>>>( + static_cast(size), fn, op, finalize, as_cuda_type(identity), + as_cuda_type(result), map_to_device(args)...); + } +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, KernelArgs&&... 
args) +{ + constexpr int oversubscription = 16; + gko::cuda::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = + std::min(ceildiv(rows * cols, block_size), + exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + generic_kernel_reduction_2d<<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_cuda_type(identity), as_cuda_type(partial.get_data()), + map_to_device(args)...); + generic_kernel_reduction_1d<<<1, block_size>>>( + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_cuda_type(identity), as_cuda_type(result), + as_cuda_type(partial.get_const_data())); + } else { + generic_kernel_reduction_2d<<<1, block_size>>>( + rows, cols, fn, op, finalize, as_cuda_type(identity), + as_cuda_type(result), map_to_device(args)...); + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, KernelArgs... args) +{ + const auto idx = thread::get_subwarp_id_flat(); + const auto row = idx % rows; + const auto col_block = idx / rows; + if (col_block >= col_blocks) { + return; + } + const auto cols_per_part = + ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; + const auto begin = cols_per_part * col_block; + const auto end = min(begin + cols_per_part, cols); + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (auto col = begin + subwarp.thread_rank(); col < end; + col += subwarp_size) { + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(subwarp, partial, op); + if (subwarp.thread_rank() == 0) { + result[(row + col_block * rows) * result_stride] = finalize(partial); + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... 
args) +{ + constexpr auto warp_size = config::warp_size; + constexpr auto warps_per_block = default_block_size / warp_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = warps_per_block * subwarp_size; + __shared__ UninitializedArray block_partial; + const auto subwarp_id = thread::get_subwarp_id_flat(); + const auto local_warp_id = threadIdx.x / warp_size; + const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; + const auto subwarp_num = + thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto subwarp_rank = warp_rank % subwarp_size; + const auto col = static_cast(subwarp_rank); + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = subwarp_id; row < rows; row += subwarp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } // store the result to shared memory + if (local_subwarp_id == 0) { + block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; + } + block.sync(); + // in a single thread: accumulate the results + if (local_warp_id == 0) { + partial = identity; + // accumulate the partial results within a thread + if (shared_storage >= warp_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + } else if (warp_rank < shared_storage) { + partial = op(partial, block_partial[warp_rank]); + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } + if (warp_rank < cols) { + result[warp_rank + blockIdx.x * cols] = finalize(partial); + } + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... 
args) +{ + constexpr auto warp_size = config::warp_size; + __shared__ UninitializedArray block_partial; + const auto warp_id = thread::get_subwarp_id_flat(); + const auto warp_num = thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = warp_id; row < rows; row += warp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[threadIdx.x] = partial; + block.sync(); + // in a single warp: accumulate the results + if (threadIdx.x < warp_size) { + partial = identity; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < default_block_size; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + if (col < cols) { + result[col + blockIdx.x * cols] = finalize(partial); + } + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType identity, const ValueType* input, + int64 result_stride, ValueType* result) +{ + const auto idx = thread::get_thread_id_flat(); + if (idx >= num_results) { + return; + } + auto partial = identity; + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, input[idx + block * num_results]); + } + result[idx * result_stride] = finalize(partial); +} + + +namespace { + + +template +void run_generic_kernel_row_reduction(syn::value_list, + int64 rows, int64 cols, int64 col_blocks, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, + KernelArgs... args) +{ + const auto num_blocks = + ceildiv(rows * col_blocks * subwarp_size, default_block_size); + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, finalize, as_cuda_type(identity), + as_cuda_type(result), result_stride, args...); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, + run_generic_kernel_row_reduction) + + +template +void run_generic_col_reduction_small(syn::value_list, + int64 max_blocks, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + MappedKernelArgs... 
args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = std::min( + ceildiv(rows * subwarp_size, default_block_size), max_blocks); + if (num_blocks <= 1) { + generic_kernel_col_reduction_2d_small + <<<1, default_block_size>>>(rows, cols, fn, op, finalize, + as_cuda_type(identity), + as_cuda_type(result), args...); + } else { + Array tmp_storage{exec, + static_cast(num_blocks * cols)}; + generic_kernel_col_reduction_2d_small + <<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), + args...); + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, num_blocks, op, finalize, as_cuda_type(identity), + as_cuda_type(tmp_storage.get_const_data()), 1, + as_cuda_type(result)); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; + gko::cuda::device_guard guard{exec->get_device_id()}; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); + Array partial{exec, + static_cast(col_blocks * rows)}; + const auto num_blocks = + ceildiv(rows * col_blocks * config::warp_size, default_block_size); + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, + [] __device__(auto v) { return v; }, as_cuda_type(identity), + as_cuda_type(partial.get_data()), 1, map_to_device(args)...); + const auto num_finalize_blocks = ceildiv(rows, default_block_size); + generic_kernel_reduction_finalize_2d<<>>( + rows, col_blocks, op, finalize, as_cuda_type(identity), + as_cuda_type(partial.get_const_data()), + static_cast(result_stride), as_cuda_type(result)); + } else { + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, + finalize, identity, result, static_cast(result_stride), + map_to_device(args)...); + } +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + KernelArgs&&... 
args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; + gko::cuda::device_guard guard{exec->get_device_id()}; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto max_blocks = exec->get_num_warps() * config::warp_size * + oversubscription / default_block_size; + if (cols <= config::warp_size) { + select_generic_col_reduction_small( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, + op, finalize, identity, result, size, map_to_device(args)...); + } else { + const auto col_blocks = ceildiv(cols, config::warp_size); + const auto row_blocks = + ceildiv(std::min( + ceildiv(rows * config::warp_size, default_block_size), + max_blocks), + col_blocks); + if (row_blocks <= 1) { + generic_kernel_col_reduction_2d_blocked<<>>( + rows, cols, fn, op, finalize, as_cuda_type(identity), + as_cuda_type(result), map_to_device(args)...); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + generic_kernel_col_reduction_2d_blocked<<< + dim3(row_blocks, col_blocks), default_block_size>>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), + map_to_device(args)...); + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, row_blocks, op, finalize, as_cuda_type(identity), + as_cuda_type(tmp_storage.get_const_data()), 1, + as_cuda_type(result)); + } + } +} + + +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh index bf2f6e1a995..f4da60ddede 100644 --- a/cuda/base/kernel_launch_solver.cuh +++ b/cuda/base/kernel_launch_solver.cuh @@ -43,10 +43,10 @@ namespace cuda { template __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver( - size_type rows, size_type cols, size_type default_stride, KernelFunction fn, + int64 rows, int64 cols, int64 default_stride, KernelFunction fn, KernelArgs... args) { - auto tidx = thread::get_thread_id_flat(); + auto tidx = thread::get_thread_id_flat(); auto col = tidx % cols; auto row = tidx / cols; if (row >= rows) { @@ -66,7 +66,8 @@ void run_kernel_solver(std::shared_ptr exec, constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size[0] * size[1], block_size); generic_kernel_2d_solver<<>>( - size[0], size[1], default_stride, fn, map_to_device(args)...); + static_cast(size[0]), static_cast(size[1]), + static_cast(default_stride), fn, map_to_device(args)...); } diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh index 95ac3d8a417..8e0a962145e 100644 --- a/cuda/components/reduction.cuh +++ b/cuda/components/reduction.cuh @@ -53,7 +53,7 @@ namespace kernels { namespace cuda { -constexpr int default_block_size = 512; +constexpr int default_reduce_block_size = 512; #include "common/cuda_hip/components/reduction.hpp.inc" @@ -75,13 +75,14 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, auto block_results_val = source; size_type grid_dim = size; auto block_results = Array(exec); - if (size > default_block_size) { - const auto n = ceildiv(size, default_block_size); - grid_dim = (n <= default_block_size) ? 
n : default_block_size; + if (size > default_reduce_block_size) { + const auto n = ceildiv(size, default_reduce_block_size); + grid_dim = + (n <= default_reduce_block_size) ? n : default_reduce_block_size; block_results.resize_and_reset(grid_dim); - reduce_add_array<<>>( + reduce_add_array<<>>( size, as_cuda_type(source), as_cuda_type(block_results.get_data())); block_results_val = block_results.get_const_data(); @@ -89,7 +90,7 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, auto d_result = Array(exec, 1); - reduce_add_array<<<1, default_block_size>>>( + reduce_add_array<<<1, default_reduce_block_size>>>( grid_dim, as_cuda_type(block_results_val), as_cuda_type(d_result.get_data())); auto answer = exec->copy_val_to_host(d_result.get_const_data()); diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 7a18bb06e39..2b8fb8157c6 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -117,133 +117,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (cublas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::dot(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_dot<<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(y->get_const_values() + col), - y->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sum_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (cublas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::conj_dot(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / 
config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_conj_dot - <<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(y->get_const_values() + col), - y->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sum_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if (cublas::is_supported::value) { - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::norm2(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); - } - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_norm2<<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sqrt_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index abd4775290c..e2f6583f930 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -54,6 +55,7 @@ namespace { using gko::dim; +using gko::int64; using gko::size_type; using std::is_same; @@ -104,7 +106,7 @@ void run1d(std::shared_ptr exec, size_type dim, int* data) gko::kernels::cuda::run_kernel( exec, [] GKO_KERNEL(auto i, auto d) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i] = i; }, @@ -124,7 +126,7 @@ void run1d(std::shared_ptr exec, gko::Array& data) gko::kernels::cuda::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -149,7 +151,7 @@ void run1d(std::shared_ptr exec, gko::matrix::Dense<>* m) gko::kernels::cuda::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); @@ -185,8 +187,8 @@ void run2d(std::shared_ptr exec, int* data) gko::kernels::cuda::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i + 4 * j] = 4 * i + j; }, @@ -206,8 +208,8 @@ void run2d(std::shared_ptr exec, gko::Array& data) gko::kernels::cuda::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -234,7 +236,7 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, auto d4, auto d2_ptr, auto d3_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); @@ -275,4 +277,195 @@ TEST_F(KernelLaunch, Runs2DDense) } +void run1d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{100000}, output); + + // 2 * sum i=0...99999 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); + + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100}, output); + + // 2 * sum i=0...99 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); +} + +TEST_F(KernelLaunch, 
Reduction1D) { run1d_reduction(exec); } + + +void run2d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); + + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); + + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{10, 10}, output); + + // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); +} + +TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } + + +void run2d_row_reduction(std::shared_ptr exec) +{ + for (auto num_rows : {0, 100, 1000, 10000}) { + for (auto num_cols : {0, 10, 100, 1000, 10000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, host_ref}; + for (int64 i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j(num_cols) * (num_cols + 1) * (i + 1); + } + + gko::kernels::cuda::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } +} + +TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } + + +void run2d_col_reduction(std::shared_ptr exec) +{ + // empty, most threads idle, most threads busy, multiple blocks + for (auto num_rows : {0, 10, 100, 1000, 10000}) { + // check different edge cases: subwarp sizes, blocked mode + for (auto num_cols : + {0, 1, 2, 3, 4, 5, 7, 8, 9, 16, 31, 32, 63, 127, 128, 129}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int64 i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j(num_rows) * (num_rows + 1) * (i + 1); + } + + gko::kernels::cuda::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + 
static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } +} + +TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } + + } // namespace diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config index 696d886d489..a8b0cb3841a 100644 --- a/dev_tools/scripts/config +++ b/dev_tools/scripts/config @@ -30,6 +30,9 @@ - FixInclude: "common/unified/base/kernel_launch.hpp" - "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\." - FixInclude: "common/unified/base/kernel_launch_solver.hpp" +- "common/unified/.*.cpp" + - PathIgnore: "2" + - PathPrefix: "core" - "core/test/base/(extended_float|iterator_factory)" - RemoveTest: "true" - "core/test/base/allocator" diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 90ec1cc05fe..f215864e01d 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -166,7 +166,7 @@ bool validate(sycl::queue* queue, unsigned workgroup_size, * @return the first valid config */ template -std::uint32_t get_first_cfg(IterArr& arr, Validate verify) +std::uint32_t get_first_cfg(const IterArr& arr, Validate verify) { for (auto& cfg : arr) { if (verify(cfg)) { diff --git a/dpcpp/base/kernel_launch.dp.hpp b/dpcpp/base/kernel_launch.dp.hpp index 4fe161ff320..2b99e98ac36 100644 --- a/dpcpp/base/kernel_launch.dp.hpp +++ b/dpcpp/base/kernel_launch.dp.hpp @@ -45,25 +45,27 @@ namespace dpcpp { template -void generic_kernel_1d(sycl::handler& cgh, size_type size, KernelFunction fn, +void generic_kernel_1d(sycl::handler& cgh, int64 size, KernelFunction fn, KernelArgs... args) { - cgh.parallel_for(sycl::range<1>{size}, [=](sycl::id<1> idx_id) { - auto idx = static_cast(idx_id[0]); - fn(idx, args...); - }); + cgh.parallel_for(sycl::range<1>{static_cast(size)}, + [=](sycl::id<1> idx_id) { + auto idx = static_cast(idx_id[0]); + fn(idx, args...); + }); } template -void generic_kernel_2d(sycl::handler& cgh, size_type rows, size_type cols, +void generic_kernel_2d(sycl::handler& cgh, int64 rows, int64 cols, KernelFunction fn, KernelArgs... args) { - cgh.parallel_for(sycl::range<2>{rows, cols}, [=](sycl::id<2> idx) { - auto row = static_cast(idx[0]); - auto col = static_cast(idx[1]); - fn(row, col, args...); - }); + cgh.parallel_for(sycl::range<1>{static_cast(rows * cols)}, + [=](sycl::id<1> idx) { + auto row = static_cast(idx[0]) / cols; + auto col = static_cast(idx[0]) % cols; + fn(row, col, args...); + }); } @@ -72,7 +74,8 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, size_type size, KernelArgs&&... args) { exec->get_queue()->submit([&](sycl::handler& cgh) { - generic_kernel_1d(cgh, size, fn, map_to_device(args)...); + generic_kernel_1d(cgh, static_cast(size), fn, + map_to_device(args)...); }); } @@ -81,7 +84,9 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, dim<2> size, KernelArgs&&... 
args) { exec->get_queue()->submit([&](sycl::handler& cgh) { - generic_kernel_2d(cgh, size[0], size[1], fn, map_to_device(args)...); + generic_kernel_2d(cgh, static_cast(size[0]), + static_cast(size[1]), fn, + map_to_device(args)...); }); } diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp new file mode 100644 index 00000000000..5ebf06b0f71 --- /dev/null +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -0,0 +1,705 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/base/kernel_launch_reduction.hpp" +#endif + + +#include + + +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/uninitialized_array.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +using KCFG_1D = ConfigSet<11, 7>; +constexpr auto kcfg_1d_list_simple_reduction = + syn::value_list(); + + +template +void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, + int64 num_workgroups, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, + ValueType identity, ValueType* storage, + MappedKernelArgs... 
args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + constexpr auto num_partials = wg_size / sg_size; + sycl::accessor, 0, + sycl::access_mode::read_write, sycl::access::target::local> + subgroup_partial_acc(cgh); + const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); + const auto global_size = num_workgroups * wg_size; + + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] { + auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; + const auto tidx = thread::get_thread_id_flat(idx); + const auto local_tidx = static_cast(tidx % wg_size); + auto subgroup = + group::tiled_partition(group::this_thread_block(idx)); + auto partial = identity; + for (int64 i = tidx; i < size; i += global_size) { + partial = op(partial, fn(i, args...)); + } + partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op); + if (subgroup.thread_rank() == 0) { + subgroup_partial[local_tidx / sg_size] = partial; + } + idx.barrier(sycl::access::fence_space::local_space); + if (local_tidx < sg_size) { + partial = identity; + for (int64 i = local_tidx; i < num_partials; i += sg_size) { + partial = op(partial, subgroup_partial[i]); + } + partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op); + if (subgroup.thread_rank() == 0) { + storage[tidx / wg_size] = finalize(partial); + } + } + }); +} + + +template +void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, + int64 num_workgroups, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, + ValueType identity, ValueType* storage, + MappedKernelArgs... args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + constexpr auto num_partials = wg_size / sg_size; + sycl::accessor, 0, + sycl::access_mode::read_write, sycl::access::target::local> + subgroup_partial_acc(cgh); + const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); + const auto global_size = num_workgroups * wg_size; + + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] { + auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; + const auto tidx = thread::get_thread_id_flat(idx); + const auto local_tidx = static_cast(tidx % wg_size); + auto subgroup = + group::tiled_partition(group::this_thread_block(idx)); + auto partial = identity; + for (int64 i = tidx; i < rows * cols; i += global_size) { + const auto row = i / cols; + const auto col = i % cols; + partial = op(partial, fn(row, col, args...)); + } + partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op); + if (subgroup.thread_rank() == 0) { + subgroup_partial[local_tidx / sg_size] = partial; + } + idx.barrier(sycl::access::fence_space::local_space); + if (local_tidx < sg_size) { + partial = identity; + for (int64 i = local_tidx; i < num_partials; i += sg_size) { + partial = op(partial, subgroup_partial[i]); + } + partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op); + if (subgroup.thread_rank() == 0) { + storage[tidx / wg_size] = finalize(partial); + } + } + }); +} + + +template +void run_kernel_reduction_impl(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + MappedKernelArgs... 
args) +{ + constexpr int oversubscription = 4; + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto num_workgroups = + std::min(ceildiv(size, wg_size), + exec->get_num_computing_units() * oversubscription); + auto queue = exec->get_queue(); + if (num_workgroups > 1) { + Array partial{exec, static_cast(num_workgroups)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(size), num_workgroups, fn, op, + [](auto v) { return v; }, identity, partial.get_data(), + args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(num_workgroups), 1, + [](auto i, auto v) { return v[i]; }, op, finalize, identity, + result, partial.get_const_data()); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d(cgh, static_cast(size), + num_workgroups, fn, op, finalize, + identity, result, args...); + }); + } +} + + +template +void run_kernel_reduction_impl(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + MappedKernelArgs... args) +{ + constexpr int oversubscription = 4; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto flat_size = rows * cols; + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto num_workgroups = + std::min(ceildiv(flat_size, wg_size), + exec->get_num_computing_units() * oversubscription); + auto queue = exec->get_queue(); + if (num_workgroups > 1) { + Array partial{exec, static_cast(num_workgroups)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_2d( + cgh, rows, cols, num_workgroups, fn, op, + [](auto v) { return v; }, identity, partial.get_data(), + args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(num_workgroups), 1, + [](auto i, auto v) { return v[i]; }, op, finalize, identity, + result, partial.get_const_data()); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_2d(cgh, rows, cols, num_workgroups, + fn, op, finalize, identity, result, + args...); + }); + } +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_run_kernel_reduction, + run_kernel_reduction_impl) + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, KernelArgs&&... args) +{ + const auto desired_cfg = get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + select_run_kernel_reduction( + kcfg_1d_list_simple_reduction, + [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + syn::value_list(), syn::value_list(), + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, size, map_to_device(args)...); +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + KernelArgs&&... 
args) +{ + const auto desired_cfg = get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + select_run_kernel_reduction( + kcfg_1d_list_simple_reduction, + [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + syn::value_list(), syn::value_list(), + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, size, map_to_device(args)...); +} + + +namespace { + + +template +void generic_kernel_row_reduction_2d(syn::value_list, + std::shared_ptr exec, + int64 rows, int64 cols, int64 col_blocks, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, + MappedKernelArgs... args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); + const auto num_workgroups = ceildiv(rows * col_blocks * ssg_size, wg_size); + const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] { + const auto idx = + thread::get_subwarp_id_flat(id); + const auto row = idx % rows; + const auto col_block = idx / rows; + auto partial = identity; + auto subgroup = group::tiled_partition( + group::this_thread_block(id)); + auto ssg_rank = + static_cast(subgroup.thread_rank() % ssg_size); + if (col_block < col_blocks) { + const auto cols_per_part = + ceildiv(ceildiv(cols, ssg_size), col_blocks) * ssg_size; + const auto begin = cols_per_part * col_block; + const auto end = min(begin + cols_per_part, cols); + for (auto col = begin + ssg_rank; col < end; + col += ssg_size) { + partial = op(partial, fn(row, col, args...)); + } + } +// since we do a sub-subgroup reduction, we can't use reduce +#pragma unroll + for (int i = 1; i < ssg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + if (col_block < col_blocks && ssg_rank == 0) { + result[(row + col_block * rows) * result_stride] = + finalize(partial); + } + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_kernel_row_reduction_2d, + generic_kernel_row_reduction_2d); + + +template +void generic_kernel_col_reduction_2d_small( + sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, + KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, MappedKernelArgs... 
args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); + constexpr auto subgroups_per_workgroup = wg_size / sg_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = subgroups_per_workgroup * ssg_size; + sycl::accessor, 0, + sycl::access_mode::read_write, sycl::access::target::local> + block_partial_acc(cgh); + const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size)); + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] { + auto block_partial = &(*block_partial_acc.get_pointer())[0]; + const auto ssg_id = + thread::get_subwarp_id_flat(id); + const auto local_sg_id = id.get_local_id(2) / sg_size; + const auto local_ssg_id = id.get_local_id(2) % sg_size / ssg_size; + const auto ssg_num = + thread::get_subwarp_num_flat(id); + const auto workgroup = group::this_thread_block(id); + const auto subgroup = group::tiled_partition(workgroup); + const auto sg_rank = subgroup.thread_rank(); + const auto ssg_rank = sg_rank % ssg_size; + const auto col = static_cast(ssg_rank); + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = ssg_id; row < rows; row += ssg_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subsubgroups in the subgroup +#pragma unroll + for (unsigned i = ssg_size; i < sg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + // store the result to shared memory + if (local_ssg_id == 0) { + block_partial[local_sg_id * ssg_size + ssg_rank] = partial; + } + workgroup.sync(); + // in a single thread: accumulate the results + if (local_sg_id == 0) { + partial = identity; + // accumulate the partial results within a thread + if (shared_storage >= sg_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += sg_size) { + partial = op(partial, block_partial[i + sg_rank]); + } + } else if (sg_rank < shared_storage) { + partial = op(partial, block_partial[sg_rank]); + } + // accumulate between all subsubgroups in the subgroup +#pragma unroll + for (unsigned i = ssg_size; i < sg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + if (sg_rank < cols) { + result[sg_rank + id.get_group(2) * cols] = + finalize(partial); + } + } + }); +} + + +template +void generic_kernel_col_reduction_2d_blocked( + sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, + int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, + ValueType identity, ValueType* result, MappedKernelArgs... 
args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto range = + sycl_nd_range(dim3(row_blocks, col_blocks), dim3(wg_size)); + sycl::accessor, 0, + sycl::access_mode::read_write, sycl::access::target::local> + block_partial_acc(cgh); + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] { + const auto sg_id = thread::get_subwarp_id_flat(id); + const auto sg_num = + thread::get_subwarp_num_flat(id); + const auto workgroup = group::this_thread_block(id); + const auto subgroup = group::tiled_partition(workgroup); + const auto sg_rank = subgroup.thread_rank(); + const auto col = + sg_rank + static_cast(id.get_group(1)) * sg_size; + auto block_partial = &(*block_partial_acc.get_pointer())[0]; + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = sg_id; row < rows; row += sg_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[id.get_local_id(2)] = partial; + workgroup.sync(); + // in a single warp: accumulate the results + if (id.get_local_id(2) < sg_size) { + partial = identity; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < wg_size; i += sg_size) { + partial = op(partial, block_partial[i + sg_rank]); + } + if (col < cols) { + result[col + id.get_group(2) * cols] = finalize(partial); + } + } + }); +} + + +template +void generic_kernel_reduction_finalize_2d( + sycl::handler& cgh, int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType identity, const ValueType* input, + int64 result_stride, ValueType* result) +{ + cgh.parallel_for(sycl::range<1>{static_cast(num_results)}, + [=](sycl::id<1> id) { + auto partial = identity; + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, + input[id[0] + block * num_results]); + } + result[id[0] * result_stride] = finalize(partial); + }); +} + + +template +void run_generic_col_reduction_small(syn::value_list, + std::shared_ptr exec, + int64 max_workgroups, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, + ValueType identity, ValueType* result, + dim<2> size, MappedKernelArgs... 
args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto row_blocks = + std::min(ceildiv(rows * ssg_size, wg_size), max_workgroups); + auto queue = exec->get_queue(); + if (row_blocks <= 1) { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_small( + cgh, rows, cols, 1, fn, op, finalize, identity, result, + args...); + }); + } else { + Array tmp_storage{exec, + static_cast(row_blocks * cols)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_small( + cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; }, + identity, tmp_storage.get_data(), args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, cols, row_blocks, op, finalize, identity, + tmp_storage.get_const_data(), 1, result); + }); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); + + +template +void run_kernel_row_reduction_stage1(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, MappedKernelArgs... args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + using subsubgroup_sizes = + syn::value_list(16, sg_size), + std::min(32, sg_size), sg_size>; + constexpr int oversubscription = 16; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_computing_units() * sg_size * oversubscription; + auto queue = exec->get_queue(); + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); + Array partial{exec, + static_cast(col_blocks * rows)}; + generic_kernel_row_reduction_2d( + syn::value_list{}, exec, rows, cols, col_blocks, fn, + op, [](auto v) { return v; }, identity, partial.get_data(), 1, + args...); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, rows, col_blocks, op, finalize, identity, + partial.get_const_data(), static_cast(result_stride), + result); + }); + } else { + select_generic_kernel_row_reduction_2d( + subsubgroup_sizes(), + [&](int compiled_ssg_size) { + return compiled_ssg_size >= cols || + compiled_ssg_size == sg_size; + }, + syn::value_list(), syn::type_list<>(), exec, rows, cols, + 1, fn, op, finalize, identity, result, + static_cast(result_stride), args...); + } +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_row_reduction_stage1, + run_kernel_row_reduction_stage1); + + +template +void run_kernel_col_reduction_stage1(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + MappedKernelArgs... 
args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + using subsubgroup_sizes = + syn::value_list(16, sg_size), + std::min(32, sg_size), sg_size>; + constexpr int oversubscription = 16; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto max_blocks = + exec->get_num_computing_units() * sg_size * oversubscription / wg_size; + if (cols <= sg_size) { + select_generic_col_reduction_small( + subsubgroup_sizes(), + [&](int compiled_ssg_size) { + return compiled_ssg_size >= cols || + compiled_ssg_size == sg_size; + }, + syn::value_list(), syn::type_list<>(), exec, max_blocks, + fn, op, finalize, identity, result, size, args...); + } else { + const auto col_blocks = ceildiv(cols, sg_size); + const auto row_blocks = ceildiv( + std::min(ceildiv(rows * sg_size, wg_size), max_blocks), + col_blocks); + auto queue = exec->get_queue(); + if (row_blocks <= 1) { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_blocked( + cgh, rows, cols, 1, col_blocks, fn, op, finalize, identity, + result, args...); + }); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_blocked( + cgh, rows, cols, row_blocks, col_blocks, fn, op, + [](auto v) { return v; }, identity, tmp_storage.get_data(), + args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, cols, row_blocks, op, finalize, identity, + tmp_storage.get_const_data(), 1, result); + }); + } + } +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_col_reduction_stage1, + run_kernel_col_reduction_stage1); + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + const auto desired_cfg = get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + select_kernel_row_reduction_stage1( + kcfg_1d_list_simple_reduction, + [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + syn::value_list(), syn::value_list(), + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, result_stride, size, + map_to_device(args)...); +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + KernelArgs&&... 
args) +{ + const auto desired_cfg = get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + select_kernel_col_reduction_stage1( + kcfg_1d_list_simple_reduction, + [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + syn::value_list(), syn::value_list(), + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, size, map_to_device(args)...); +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/base/kernel_launch_solver.dp.hpp b/dpcpp/base/kernel_launch_solver.dp.hpp index 5cec5b55d79..68ef10ac1fe 100644 --- a/dpcpp/base/kernel_launch_solver.dp.hpp +++ b/dpcpp/base/kernel_launch_solver.dp.hpp @@ -42,17 +42,18 @@ namespace dpcpp { template -void generic_kernel_2d_solver(sycl::handler& cgh, size_type rows, - size_type cols, size_type default_stride, - KernelFunction fn, KernelArgs... args) +void generic_kernel_2d_solver(sycl::handler& cgh, int64 rows, int64 cols, + int64 default_stride, KernelFunction fn, + KernelArgs... args) { - cgh.parallel_for(sycl::range<2>{rows, cols}, [=](sycl::id<2> idx) { - auto row = static_cast(idx[0]); - auto col = static_cast(idx[1]); - fn(row, col, - device_unpack_solver_impl::unpack(args, - default_stride)...); - }); + cgh.parallel_for(sycl::range<1>{static_cast(rows * cols)}, + [=](sycl::id<1> idx) { + auto row = static_cast(idx[0] / cols); + auto col = static_cast(idx[0] % cols); + fn(row, col, + device_unpack_solver_impl::unpack( + args, default_stride)...); + }); } @@ -63,7 +64,8 @@ void run_kernel_solver(std::shared_ptr exec, { exec->get_queue()->submit([&](sycl::handler& cgh) { kernels::dpcpp::generic_kernel_2d_solver( - cgh, size[0], size[1], default_stride, fn, + cgh, static_cast(size[0]), static_cast(size[1]), + static_cast(default_stride), fn, kernels::dpcpp::map_to_device(args)...); }); } diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 9a86ab9cd15..7873a687e4b 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -84,284 +84,6 @@ constexpr int default_block_size = 256; namespace kernel { -template -void compute_partial_reduce( - size_type num_rows, OutType* __restrict__ work, CallableGetValue get_value, - CallableReduce reduce_op, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - - constexpr auto warps_per_block = wg_size / sg_size; - - const auto num_blocks = item_ct1.get_group_range(2); - const auto local_id = thread::get_local_thread_id(item_ct1); - const auto global_id = - thread::get_thread_id(item_ct1); - - OutType* tmp_work_array = tmp_work; - auto tmp = zero(); - for (auto i = global_id; i < num_rows; i += wg_size * num_blocks) { - tmp = reduce_op(tmp, get_value(i)); - } - - tmp_work_array[local_id] = tmp; - - ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, reduce_op); - - if (local_id == 0) { - work[thread::get_block_id(item_ct1)] = tmp_work_array[0]; - } -} - - -template -void finalize_reduce_computation( - size_type size, const ValueType* work, ValueType* result, - CallableReduce reduce_op, CallableFinalize finalize_op, - sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto local_id = 
thread::get_local_thread_id(item_ct1); - - ValueType tmp = zero(); - for (auto i = local_id; i < size; i += wg_size) { - tmp = reduce_op(tmp, work[i]); - } - ValueType* tmp_work_array = tmp_work; - tmp_work_array[local_id] = tmp; - - ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, reduce_op); - - if (local_id == 0) { - *result = finalize_op(tmp_work_array[0]); - } -} - - -template -void compute_partial_dot( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - const ValueType* __restrict__ y, size_type stride_y, - ValueType* __restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - compute_partial_reduce( - num_rows, work, - [x, stride_x, y, stride_y](size_type i) { - return x[i * stride_x] * y[i * stride_y]; - }, - [](const ValueType& x, const ValueType& y) { return x + y; }, item_ct1, - tmp_work); -} - -template -void compute_partial_dot(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, - const ValueType* x, size_type stride_x, - const ValueType* y, size_type stride_y, - ValueType* work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_dot(num_rows, x, stride_x, y, stride_y, - work, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_dot, - compute_partial_dot) -GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, - kcfg_1d_list) - - -template -void compute_partial_conj_dot( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - const ValueType* __restrict__ y, size_type stride_y, - ValueType* __restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - compute_partial_reduce( - num_rows, work, - [x, stride_x, y, stride_y](size_type i) { - return conj(x[i * stride_x]) * y[i * stride_y]; - }, - [](const ValueType& x, const ValueType& y) { return x + y; }, item_ct1, - tmp_work); -} - -template -void compute_partial_conj_dot(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, - const ValueType* x, size_type stride_x, - const ValueType* y, size_type stride_y, - ValueType* work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_conj_dot(num_rows, x, stride_x, y, - stride_y, work, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_conj_dot, - compute_partial_conj_dot) -GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_conj_dot_call, - compute_partial_conj_dot, kcfg_1d_list) - - -template -void finalize_sum_reduce_computation( - size_type size, const ValueType* work, ValueType* result, - sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - finalize_reduce_computation( - size, work, result, - [](const ValueType& x, const ValueType& y) { return x + y; }, - [](const ValueType& x) { return x; }, item_ct1, tmp_work); -} - -template -void finalize_sum_reduce_computation(dim3 
grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, size_type size, - const ValueType* work, ValueType* result) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - finalize_sum_reduce_computation( - size, work, result, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sum_reduce_computation, - finalize_sum_reduce_computation) -GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sum_reduce_computation_call, - finalize_sum_reduce_computation, kcfg_1d_list) - - -template -void compute_partial_norm2( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - remove_complex* __restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray, KCFG_1D::decode<0>(cfg)>& - tmp_work) -{ - using norm_type = remove_complex; - compute_partial_reduce( - num_rows, work, - [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); }, - [](const norm_type& x, const norm_type& y) { return x + y; }, item_ct1, - tmp_work); -} - -template -void compute_partial_norm2(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, const ValueType* x, - size_type stride_x, remove_complex* work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, wg_size>, - 0, sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_norm2(num_rows, x, stride_x, work, - item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_norm2, - compute_partial_norm2) -GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, - compute_partial_norm2, kcfg_1d_list) - - -template -void finalize_sqrt_reduce_computation( - size_type size, const ValueType* work, ValueType* result, - sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - finalize_reduce_computation( - size, work, result, - [](const ValueType& x, const ValueType& y) { return x + y; }, - [](const ValueType& x) { return std::sqrt(x); }, item_ct1, tmp_work); -} - -template -void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, size_type size, - const ValueType* work, ValueType* result) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - finalize_sqrt_reduce_computation( - size, work, result, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sqrt_reduce_computation, - finalize_sqrt_reduce_computation) -GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sqrt_reduce_computation_call, - finalize_sqrt_reduce_computation, kcfg_1d_list) - - template void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, const size_type* __restrict__ row_ptrs, @@ -812,144 +534,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); 
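[Editor's note] The dot, conj-dot and norm2 kernels removed below are dropped in favor of the generic reduction launchers introduced above; their replacements are presumably expressed through the run_kernel_reduction interface elsewhere in this change. A minimal host-side sketch of that fn/op/finalize/identity contract and of the two-pass strategy (per-block partials, then a finalize pass), written as plain standard C++ with hypothetical helper names (reduce_1d, reduce_1d_two_stage) purely for illustration, not the actual Ginkgo/SYCL/HIP code:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical sequential stand-in for the generic reduction launchers:
// fn maps an index to a value, op combines partial results, finalize
// post-processes the fully reduced value, identity is the neutral element.
template <typename Fn, typename Op, typename Finalize, typename ValueType>
ValueType reduce_1d(Fn fn, Op op, Finalize finalize, ValueType identity,
                    std::int64_t size)
{
    auto partial = identity;
    for (std::int64_t i = 0; i < size; ++i) {
        partial = op(partial, fn(i));
    }
    return finalize(partial);
}

// Two-stage variant mirroring the num_workgroups > 1 branch above: every
// "block" reduces its chunk to one partial value (finalize deferred), then a
// second pass reduces the per-block partials and applies finalize once.
template <typename Fn, typename Op, typename Finalize, typename ValueType>
ValueType reduce_1d_two_stage(Fn fn, Op op, Finalize finalize,
                              ValueType identity, std::int64_t size,
                              std::int64_t num_blocks)
{
    const auto block_size = (size + num_blocks - 1) / num_blocks;
    std::vector<ValueType> block_partial(num_blocks, identity);
    for (std::int64_t b = 0; b < num_blocks; ++b) {
        const auto begin = b * block_size;
        const auto end = std::min(size, begin + block_size);
        for (std::int64_t i = begin; i < end; ++i) {
            block_partial[b] = op(block_partial[b], fn(i));
        }
    }
    return reduce_1d([&](std::int64_t b) { return block_partial[b]; }, op,
                     finalize, identity, num_blocks);
}

int main()
{
    std::vector<double> x{1.0, 2.0, 3.0};
    std::vector<double> y{4.0, 5.0, 6.0};
    const auto n = static_cast<std::int64_t>(x.size());
    auto add = [](double a, double b) { return a + b; };
    // dot(x, y): multiply entries, sum, no finalize step
    auto dot = reduce_1d([&](std::int64_t i) { return x[i] * y[i]; }, add,
                         [](double a) { return a; }, 0.0, n);
    // norm2(x): square entries, sum, take the square root only at the end
    auto nrm = reduce_1d_two_stage([&](std::int64_t i) { return x[i] * x[i]; },
                                   add, [](double a) { return std::sqrt(a); },
                                   0.0, n, 2);
    std::cout << dot << ' ' << nrm << '\n';  // prints 32 and sqrt(14)
}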
-template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1) { - // TODO: write a custom kernel which does this more efficiently - onemkl::dot(*exec->get_queue(), x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), y->get_stride(), - result->get_values()); - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - const auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_dot_call( - cfg, grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_sum_reduce_computation_call( - cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1) { - // TODO: write a custom kernel which does this more efficiently - onemkl::conj_dot(*exec->get_queue(), x->get_size()[0], - x->get_const_values(), x->get_stride(), - y->get_const_values(), y->get_stride(), - result->get_values()); - - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_conj_dot_call( - cfg, grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_sum_reduce_computation_call( - cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void 
compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if (x->get_size()[1] == 1) { - oneapi::mkl::blas::row_major::nrm2( - *exec->get_queue(), x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_norm2_call( - cfg, grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() + col, x->get_stride(), - work.get_data()); - kernel::finalize_sqrt_reduce_computation_call( - cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, diff --git a/dpcpp/test/base/kernel_launch.dp.cpp b/dpcpp/test/base/kernel_launch.dp.cpp index 27d3f1abd12..2c7a08cdb36 100644 --- a/dpcpp/test/base/kernel_launch.dp.cpp +++ b/dpcpp/test/base/kernel_launch.dp.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
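[Editor's note] The test changes below pin the device index type to gko::int64 instead of size_type, matching the signed 64-bit indices the kernels now receive. One plausible motivation for the signed type (not stated in the patch itself) is that out-of-range index arithmetic stays representable and testable instead of silently wrapping; a tiny standalone illustration:

#include <cstdint>
#include <iostream>

int main()
{
    // With an unsigned index, stepping one position "before" the start wraps
    // around to a huge value, so a guard such as `idx >= 0` can never fire.
    std::size_t unsigned_idx = 0;
    std::int64_t signed_idx = 0;
    unsigned_idx -= 1;  // wraps to 2^64 - 1
    signed_idx -= 1;    // stays -1 and can still be range-checked
    std::cout << unsigned_idx << ' ' << signed_idx << '\n';
}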
#include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -54,6 +55,7 @@ namespace { using gko::dim; +using gko::int64; using gko::size_type; using std::is_same; @@ -110,7 +112,7 @@ TEST_F(KernelLaunch, Runs1D) gko::kernels::dpcpp::run_kernel( exec, [] GKO_KERNEL(auto i, auto d) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i] = i; }, @@ -125,7 +127,7 @@ TEST_F(KernelLaunch, Runs1DArray) gko::kernels::dpcpp::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -145,7 +147,7 @@ TEST_F(KernelLaunch, Runs1DDense) gko::kernels::dpcpp::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, @@ -177,8 +179,8 @@ TEST_F(KernelLaunch, Runs2D) gko::kernels::dpcpp::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i + 4 * j] = 4 * i + j; }, @@ -193,8 +195,8 @@ TEST_F(KernelLaunch, Runs2DArray) gko::kernels::dpcpp::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -215,11 +217,9 @@ TEST_F(KernelLaunch, Runs2DDense) exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, auto d4, auto d2_ptr, auto d3_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); - static_assert(is_same::value, - "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, @@ -257,4 +257,174 @@ TEST_F(KernelLaunch, Runs2DDense) } +TEST_F(KernelLaunch, Reduction1D) +{ + gko::Array output{exec, 1}; + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100000}, output); + + // 2 * sum i=0...99999 (i+1) + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100}, output); + + // 2 * sum i=0...99 (i+1) + 
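+    // closed form: 2 * (100 * 101 / 2) = 10100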
EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); +} + + +TEST_F(KernelLaunch, Reduction2D) +{ + gko::Array output{exec, 1}; + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); + + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{10, 10}, output); + + // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); +} + + +TEST_F(KernelLaunch, ReductionRow2D) +{ + for (auto num_rows : {0, 1, 10, 100, 1000, 10000}) { + for (auto num_cols : {0, 1, 10, 100, 1000, 10000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, host_ref}; + for (int i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j(num_cols) * (num_cols + 1) * (i + 1); + } + + gko::kernels::dpcpp::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return 2 * j; }, int64{}, + output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } +} + + +TEST_F(KernelLaunch, ReductionCol2D) +{ + for (int num_rows : {0, 1, 10, 100, 1000, 10000}) { + for (int num_cols : + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 100, 1000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j(num_rows) * (num_rows + 1) * (i + 1); + } + + gko::kernels::dpcpp::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, + output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } +} + + } // namespace diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp index 8967ee5597d..6c627838fea 100644 --- a/hip/base/kernel_launch.hip.hpp +++ 
b/hip/base/kernel_launch.hip.hpp @@ -54,9 +54,9 @@ constexpr int default_block_size = 512; template __global__ __launch_bounds__(default_block_size) void generic_kernel_1d( - size_type size, KernelFunction fn, KernelArgs... args) + int64 size, KernelFunction fn, KernelArgs... args) { - auto tidx = thread::get_thread_id_flat(); + auto tidx = thread::get_thread_id_flat(); if (tidx >= size) { return; } @@ -66,9 +66,9 @@ __global__ __launch_bounds__(default_block_size) void generic_kernel_1d( template __global__ __launch_bounds__(default_block_size) void generic_kernel_2d( - size_type rows, size_type cols, KernelFunction fn, KernelArgs... args) + int64 rows, int64 cols, KernelFunction fn, KernelArgs... args) { - auto tidx = thread::get_thread_id_flat(); + auto tidx = thread::get_thread_id_flat(); auto col = tidx % cols; auto row = tidx / cols; if (row >= rows) { @@ -85,8 +85,8 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, gko::hip::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size, block_size); - hipLaunchKernelGGL(generic_kernel_1d, num_blocks, block_size, 0, 0, size, - fn, map_to_device(args)...); + hipLaunchKernelGGL(generic_kernel_1d, num_blocks, block_size, 0, 0, + static_cast(size), fn, map_to_device(args)...); } template @@ -96,8 +96,9 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, gko::hip::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size[0] * size[1], block_size); - hipLaunchKernelGGL(generic_kernel_2d, num_blocks, block_size, 0, 0, size[0], - size[1], fn, map_to_device(args)...); + hipLaunchKernelGGL(generic_kernel_2d, num_blocks, block_size, 0, 0, + static_cast(size[0]), static_cast(size[1]), + fn, map_to_device(args)...); } diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp new file mode 100644 index 00000000000..40e4268dccb --- /dev/null +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -0,0 +1,540 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" +#endif + + +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/device_guard.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +template +__global__ __launch_bounds__( + default_block_size) void generic_kernel_reduction_1d(int64 size, + KernelFunction fn, + ReductionOp op, + FinalizeOp finalize, + ValueType identity, + ValueType* storage, + KernelArgs... args) +{ + __shared__ + UninitializedArray + warp_partial; + static_assert(default_block_size / config::warp_size <= config::warp_size, + "needs third reduction level"); + auto tidx = thread::get_thread_id_flat(); + auto grid_size = thread::get_thread_num_flat(); + auto warp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (int64 i = tidx; i < size; i += grid_size) { + partial = op(partial, fn(i, args...)); + } + partial = reduce(warp, partial, op); + if (warp.thread_rank() == 0) { + warp_partial[threadIdx.x / config::warp_size] = partial; + } + __syncthreads(); + if (threadIdx.x < config::warp_size) { + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? warp_partial[threadIdx.x] + : identity, + op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } + } +} + + +template +__global__ __launch_bounds__( + default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols, + KernelFunction fn, + ReductionOp op, + FinalizeOp finalize, + ValueType identity, + ValueType* storage, + KernelArgs... args) +{ + __shared__ + UninitializedArray + warp_partial; + static_assert(default_block_size / config::warp_size <= config::warp_size, + "needs third reduction level"); + auto tidx = thread::get_thread_id_flat(); + auto grid_size = thread::get_thread_num_flat(); + auto warp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (int64 i = tidx; i < rows * cols; i += grid_size) { + const auto row = i / cols; + const auto col = i % cols; + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(warp, partial, op); + if (warp.thread_rank() == 0) { + warp_partial[threadIdx.x / config::warp_size] = partial; + } + __syncthreads(); + if (threadIdx.x < config::warp_size) { + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? 
warp_partial[threadIdx.x] + : identity, + op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } + } +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + KernelArgs&&... args) +{ + constexpr int oversubscription = 16; + gko::hip::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto num_blocks = std::min( + ceildiv(size, block_size), exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + hipLaunchKernelGGL( + generic_kernel_reduction_1d, num_blocks, block_size, 0, 0, + static_cast(size), fn, op, + [] __device__(auto v) { return v; }, as_hip_type(identity), + as_hip_type(partial.get_data()), map_to_device(args)...); + hipLaunchKernelGGL( + generic_kernel_reduction_1d, 1, block_size, 0, 0, + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_hip_type(identity), as_hip_type(result), + as_hip_type(partial.get_const_data())); + } else { + hipLaunchKernelGGL(generic_kernel_reduction_1d, 1, block_size, 0, 0, + static_cast(size), fn, op, finalize, + as_hip_type(identity), as_hip_type(result), + map_to_device(args)...); + } +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, KernelArgs&&... args) +{ + constexpr int oversubscription = 16; + gko::hip::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = + std::min(ceildiv(rows * cols, block_size), + exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + hipLaunchKernelGGL( + generic_kernel_reduction_2d, num_blocks, block_size, 0, 0, rows, + cols, fn, op, [] __device__(auto v) { return v; }, + as_hip_type(identity), as_hip_type(partial.get_data()), + map_to_device(args)...); + hipLaunchKernelGGL( + generic_kernel_reduction_1d, 1, block_size, 0, 0, + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_hip_type(identity), as_hip_type(result), + as_hip_type(partial.get_const_data())); + } else { + hipLaunchKernelGGL(generic_kernel_reduction_2d, 1, block_size, 0, 0, + rows, cols, fn, op, finalize, as_hip_type(identity), + as_hip_type(result), map_to_device(args)...); + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, KernelArgs... 
args) +{ + const auto idx = thread::get_subwarp_id_flat(); + const auto row = idx % rows; + const auto col_block = idx / rows; + if (col_block >= col_blocks) { + return; + } + const auto cols_per_part = + ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; + const auto begin = cols_per_part * col_block; + const auto end = min(begin + cols_per_part, cols); + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (auto col = begin + subwarp.thread_rank(); col < end; + col += subwarp_size) { + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(subwarp, partial, op); + if (subwarp.thread_rank() == 0) { + result[(row + col_block * rows) * result_stride] = finalize(partial); + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... args) +{ + constexpr auto warp_size = config::warp_size; + constexpr auto warps_per_block = default_block_size / warp_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = warps_per_block * subwarp_size; + __shared__ UninitializedArray block_partial; + const auto subwarp_id = thread::get_subwarp_id_flat(); + const auto local_warp_id = threadIdx.x / warp_size; + const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; + const auto subwarp_num = + thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto subwarp_rank = warp_rank % subwarp_size; + const auto col = static_cast(subwarp_rank); + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = subwarp_id; row < rows; row += subwarp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } // store the result to shared memory + if (local_subwarp_id == 0) { + block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; + } + block.sync(); + // in a single thread: accumulate the results + if (local_warp_id == 0) { + partial = identity; + // accumulate the partial results within a thread + if (shared_storage >= warp_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + } else if (warp_rank < shared_storage) { + partial = op(partial, block_partial[warp_rank]); + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } + if (warp_rank < cols) { + result[warp_rank + blockIdx.x * cols] = finalize(partial); + } + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... 
args) +{ + constexpr auto warp_size = config::warp_size; + __shared__ UninitializedArray block_partial; + const auto warp_id = thread::get_subwarp_id_flat(); + const auto warp_num = thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = warp_id; row < rows; row += warp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[threadIdx.x] = partial; + block.sync(); + // in a single warp: accumulate the results + if (threadIdx.x < warp_size) { + partial = identity; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < default_block_size; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + if (col < cols) { + result[col + blockIdx.x * cols] = finalize(partial); + } + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType identity, const ValueType* input, + int64 result_stride, ValueType* result) +{ + const auto idx = thread::get_thread_id_flat(); + if (idx >= num_results) { + return; + } + auto partial = identity; + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, input[idx + block * num_results]); + } + result[idx * result_stride] = finalize(partial); +} + + +namespace { + + +template +void run_generic_kernel_row_reduction(syn::value_list, + int64 rows, int64 cols, int64 col_blocks, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, + KernelArgs... args) +{ + const auto num_blocks = + ceildiv(rows * col_blocks * subwarp_size, default_block_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), + num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, op, + finalize, as_hip_type(identity), as_hip_type(result), result_stride, + args...); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, + run_generic_kernel_row_reduction) + + +template +void run_generic_col_reduction_small(syn::value_list, + int64 max_blocks, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + MappedKernelArgs... 
args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = std::min( + ceildiv(rows * subwarp_size, default_block_size), max_blocks); + if (num_blocks <= 1) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + generic_kernel_col_reduction_2d_small), + 1, default_block_size, 0, 0, rows, cols, fn, op, finalize, + as_hip_type(identity), as_hip_type(result), args...); + } else { + Array tmp_storage{exec, + static_cast(num_blocks * cols)}; + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + generic_kernel_col_reduction_2d_small), + num_blocks, default_block_size, 0, 0, rows, cols, fn, op, + [] __device__(auto v) { return v; }, as_hip_type(identity), + as_hip_type(tmp_storage.get_data()), args...); + hipLaunchKernelGGL( + generic_kernel_reduction_finalize_2d, + ceildiv(cols, default_block_size), default_block_size, 0, 0, cols, + num_blocks, op, finalize, as_hip_type(identity), + as_hip_type(tmp_storage.get_const_data()), 1, as_hip_type(result)); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; + gko::hip::device_guard guard{exec->get_device_id()}; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); + Array partial{exec, + static_cast(col_blocks * rows)}; + const auto num_blocks = + ceildiv(rows * col_blocks * config::warp_size, default_block_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), + num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, + op, [] __device__(auto v) { return v; }, as_hip_type(identity), + as_hip_type(partial.get_data()), 1, map_to_device(args)...); + const auto num_finalize_blocks = ceildiv(rows, default_block_size); + hipLaunchKernelGGL( + generic_kernel_reduction_finalize_2d, num_finalize_blocks, + default_block_size, 0, 0, rows, col_blocks, op, finalize, + as_hip_type(identity), as_hip_type(partial.get_const_data()), + static_cast(result_stride), as_hip_type(result)); + } else { + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, + finalize, identity, result, static_cast(result_stride), + map_to_device(args)...); + } +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + KernelArgs&&... 
args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; + gko::hip::device_guard guard{exec->get_device_id()}; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto max_blocks = exec->get_num_warps() * config::warp_size * + oversubscription / default_block_size; + if (cols <= config::warp_size) { + select_generic_col_reduction_small( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, + op, finalize, identity, result, size, map_to_device(args)...); + } else { + const auto col_blocks = ceildiv(cols, config::warp_size); + const auto row_blocks = + ceildiv(std::min( + ceildiv(rows * config::warp_size, default_block_size), + max_blocks), + col_blocks); + if (row_blocks <= 1) { + hipLaunchKernelGGL(generic_kernel_col_reduction_2d_blocked, + dim3(1, col_blocks), default_block_size, 0, 0, + rows, cols, fn, op, finalize, + as_hip_type(identity), as_hip_type(result), + map_to_device(args)...); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + hipLaunchKernelGGL( + generic_kernel_col_reduction_2d_blocked, + dim3(row_blocks, col_blocks), default_block_size, 0, 0, rows, + cols, fn, op, [] __device__(auto v) { return v; }, + as_hip_type(identity), as_hip_type(tmp_storage.get_data()), + map_to_device(args)...); + hipLaunchKernelGGL(generic_kernel_reduction_finalize_2d, + ceildiv(cols, default_block_size), + default_block_size, 0, 0, cols, row_blocks, op, + finalize, as_hip_type(identity), + as_hip_type(tmp_storage.get_const_data()), 1, + as_hip_type(result)); + } + } +} + + +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp index a8335851a0e..9798f6c4fbc 100644 --- a/hip/base/kernel_launch_solver.hip.hpp +++ b/hip/base/kernel_launch_solver.hip.hpp @@ -46,10 +46,10 @@ namespace hip { template __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver( - size_type rows, size_type cols, size_type default_stride, KernelFunction fn, + int64 rows, int64 cols, int64 default_stride, KernelFunction fn, KernelArgs... 
args) { - auto tidx = thread::get_thread_id_flat(); + auto tidx = thread::get_thread_id_flat(); auto col = tidx % cols; auto row = tidx / cols; if (row >= rows) { @@ -69,7 +69,9 @@ void run_kernel_solver(std::shared_ptr exec, constexpr auto block_size = kernels::hip::default_block_size; auto num_blocks = ceildiv(size[0] * size[1], block_size); hipLaunchKernelGGL(kernels::hip::generic_kernel_2d_solver, num_blocks, - block_size, 0, 0, size[0], size[1], default_stride, fn, + block_size, 0, 0, static_cast(size[0]), + static_cast(size[1]), + static_cast(default_stride), fn, kernels::hip::map_to_device(args)...); } diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp index 87d6b518123..7850b9f65a1 100644 --- a/hip/components/reduction.hip.hpp +++ b/hip/components/reduction.hip.hpp @@ -55,7 +55,7 @@ namespace kernels { namespace hip { -constexpr int default_block_size = 512; +constexpr int default_reduce_block_size = 512; #include "common/cuda_hip/components/reduction.hpp.inc" @@ -77,23 +77,26 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, auto block_results_val = source; size_type grid_dim = size; auto block_results = Array(exec); - if (size > default_block_size) { - const auto n = ceildiv(size, default_block_size); - grid_dim = (n <= default_block_size) ? n : default_block_size; + if (size > default_reduce_block_size) { + const auto n = ceildiv(size, default_reduce_block_size); + grid_dim = + (n <= default_reduce_block_size) ? n : default_reduce_block_size; block_results.resize_and_reset(grid_dim); - hipLaunchKernelGGL( - reduce_add_array, dim3(grid_dim), dim3(default_block_size), 0, 0, - size, as_hip_type(source), as_hip_type(block_results.get_data())); + hipLaunchKernelGGL(reduce_add_array, dim3(grid_dim), + dim3(default_reduce_block_size), 0, 0, size, + as_hip_type(source), + as_hip_type(block_results.get_data())); block_results_val = block_results.get_const_data(); } auto d_result = Array(exec, 1); - hipLaunchKernelGGL(reduce_add_array, dim3(1), dim3(default_block_size), 0, - 0, grid_dim, as_hip_type(block_results_val), + hipLaunchKernelGGL(reduce_add_array, dim3(1), + dim3(default_reduce_block_size), 0, 0, grid_dim, + as_hip_type(block_results_val), as_hip_type(d_result.get_data())); auto answer = exec->copy_val_to_host(d_result.get_const_data()); return answer; diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 56ed5c327b9..d4c815c9539 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -120,144 +120,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (hipblas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipblas::dot(exec->get_hipblas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 
block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::compute_partial_dot), - dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], - as_hip_type(x->get_const_values() + col), x->get_stride(), - as_hip_type(y->get_const_values() + col), y->get_stride(), - as_hip_type(work.get_data())); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::finalize_sum_reduce_computation), - dim3(1), dim3(block_dim), 0, 0, grid_dim.x, - as_hip_type(work.get_const_data()), - as_hip_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (hipblas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipblas::conj_dot(exec->get_hipblas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::compute_partial_conj_dot), - dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], - as_hip_type(x->get_const_values() + col), x->get_stride(), - as_hip_type(y->get_const_values() + col), y->get_stride(), - as_hip_type(work.get_data())); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::finalize_sum_reduce_computation), - dim3(1), dim3(block_dim), 0, 0, grid_dim.x, - as_hip_type(work.get_const_data()), - as_hip_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if (hipblas::is_supported::value) { - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipblas::norm2(exec->get_hipblas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); - } - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipLaunchKernelGGL( - 
HIP_KERNEL_NAME(kernel::compute_partial_norm2), - dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], - as_hip_type(x->get_const_values() + col), x->get_stride(), - as_hip_type(work.get_data())); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::finalize_sqrt_reduce_computation), - dim3(1), dim3(block_dim), 0, 0, grid_dim.x, - as_hip_type(work.get_const_data()), - as_hip_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp index 55ddb3fd01e..c7add9ddca8 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -54,6 +55,7 @@ namespace { using gko::dim; +using gko::int64; using gko::size_type; using std::is_same; @@ -103,7 +105,7 @@ void run1d(std::shared_ptr exec, size_type dim, int* data) gko::kernels::hip::run_kernel( exec, [] GKO_KERNEL(auto i, auto d) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i] = i; }, @@ -123,7 +125,7 @@ void run1d(std::shared_ptr exec, gko::Array& data) gko::kernels::hip::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -148,7 +150,7 @@ void run1d(std::shared_ptr exec, gko::matrix::Dense<>* m) gko::kernels::hip::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); @@ -184,8 +186,8 @@ void run2d(std::shared_ptr exec, int* data) gko::kernels::hip::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i + 4 * j] = 4 * i + j; }, @@ -205,8 +207,8 @@ void run2d(std::shared_ptr exec, gko::Array& data) gko::kernels::hip::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -233,7 +235,7 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, auto d4, auto d2_ptr, auto d3_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); @@ -274,4 +276,141 @@ TEST_F(KernelLaunch, Runs2DDense) } +void run1d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + + gko::kernels::hip::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); 
+            return i + 1;
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
+        size_type{100000}, output);
+
+    // 2 * sum i=0...99999 (i+1)
+    ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL);
+
+    gko::kernels::hip::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto a) {
+            static_assert(is_same::value, "index");
+            static_assert(is_same::value, "value");
+            return i + 1;
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
+        size_type{100}, output);
+
+    // 2 * sum i=0...99 (i+1)
+    ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL);
+}
+
+TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); }
+
+
+void run2d_reduction(std::shared_ptr exec)
+{
+    gko::Array output{exec, 1};
+
+    gko::kernels::hip::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+        gko::dim<2>{1000, 100}, output);
+
+    // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1)
+    ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL);
+
+    gko::kernels::hip::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+        gko::dim<2>{10, 10}, output);
+
+    // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1)
+    ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL);
+}
+
+TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); }
+
+
+void run2d_row_reduction(std::shared_ptr exec)
+{
+    int num_rows = 1000;
+    int num_cols = 100;
+    gko::Array host_ref{exec->get_master(),
+                               static_cast(2 * num_rows)};
+    std::fill_n(host_ref.get_data(), 2 * num_rows, 1234);
+    gko::Array output{exec, host_ref};
+    for (int i = 0; i < num_rows; i++) {
+        // we are computing 2 * sum {j=0, j<num_cols} of (i+1)*(j+1)
+        host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1);
+    }
+
+    gko::kernels::hip::run_kernel_row_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2,
+        gko::dim<2>{static_cast(num_rows),
+                    static_cast(num_cols)},
+        output);
+
+    GKO_ASSERT_ARRAY_EQ(host_ref, output);
+}
+
+TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); }
+
+
+void run2d_col_reduction(std::shared_ptr exec)
+{
+    int num_rows = 1000;
+    int num_cols = 100;
+    gko::Array host_ref{exec->get_master(),
+                               static_cast(num_cols)};
+    gko::Array output{exec, static_cast(num_cols)};
+    for (int i = 0; i < num_cols; i++) {
+        // we are computing 2 * sum {j=0, j<num_rows} of (i+1)*(j+1)
+        host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1);
+    }
+
+    gko::kernels::hip::run_kernel_col_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
+        gko::dim<2>{static_cast(num_rows),
+                    static_cast(num_cols)},
+        output);
+
+    GKO_ASSERT_ARRAY_EQ(host_ref, output);
+}
+
+TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); }
+
+
 } // namespace
diff --git a/hip/test/base/kernel_launch.hip.cpp b/omp/base/kernel_launch.hpp
index 7df6ff4c313..b6ef373fdee 100644
--- a/omp/base/kernel_launch.hpp
+++ 
b/omp/base/kernel_launch.hpp @@ -36,108 +36,101 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#include "core/synthesizer/implementation_selection.hpp" + + namespace gko { namespace kernels { namespace omp { -template -void run_kernel(std::shared_ptr exec, KernelFunction fn, - size_type size, KernelArgs&&... args) -{ -#pragma omp parallel for - for (size_type i = 0; i < size; i++) { - [&]() { fn(i, map_to_device(args)...); }(); - } -} +namespace { -template -void run_kernel_fixed_cols_impl(std::shared_ptr exec, - KernelFunction fn, dim<2> size, - MappedKernelArgs... args) + +template +void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, + size_type size, MappedKernelArgs... args) { - const auto rows = size[0]; #pragma omp parallel for - for (size_type row = 0; row < rows; row++) { -#pragma unroll - for (size_type col = 0; col < cols; col++) { - [&]() { fn(row, col, args...); }(); - } + for (int64 i = 0; i < static_cast(size); i++) { + [&]() { fn(i, args...); }(); } } -template -void run_kernel_blocked_cols_impl(std::shared_ptr exec, - KernelFunction fn, dim<2> size, - MappedKernelArgs... args) + +template +void run_kernel_sized_impl(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, dim<2> size, + MappedKernelArgs... args) { + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); static_assert(remainder_cols < block_size, "remainder too large"); - const auto rows = size[0]; - const auto cols = size[1]; const auto rounded_cols = cols / block_size * block_size; GKO_ASSERT(rounded_cols + remainder_cols == cols); + if (rounded_cols == 0 || cols == block_size) { + // we group all sizes <= block_size here and unroll explicitly + constexpr auto local_cols = + remainder_cols == 0 ? block_size : remainder_cols; #pragma omp parallel for - for (size_type row = 0; row < rows; row++) { - for (size_type base_col = 0; base_col < rounded_cols; - base_col += block_size) { + for (int64 row = 0; row < rows; row++) { #pragma unroll - for (size_type i = 0; i < block_size; i++) { - [&]() { fn(row, base_col + i, args...); }(); + for (int64 col = 0; col < local_cols; col++) { + [&]() { fn(row, col, args...); }(); } } + } else { + // we operate in block_size blocks plus an explicitly unrolled remainder +#pragma omp parallel for + for (int64 row = 0; row < rows; row++) { + for (int64 base_col = 0; base_col < rounded_cols; + base_col += block_size) { +#pragma unroll + for (int64 i = 0; i < block_size; i++) { + [&]() { fn(row, base_col + i, args...); }(); + } + } #pragma unroll - for (size_type i = 0; i < remainder_cols; i++) { - [&]() { fn(row, rounded_cols + i, args...); }(); + for (int64 i = 0; i < remainder_cols; i++) { + [&]() { fn(row, rounded_cols + i, args...); }(); + } } } } +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_sized, + run_kernel_sized_impl) + + template void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, dim<2> size, MappedKernelArgs... 
args) { - const auto rows = size[0]; - const auto cols = size[1]; - constexpr size_type block_size = 4; + const auto cols = static_cast(size[1]); + constexpr int block_size = 8; + using remainders = syn::as_list>; + if (cols <= 0) { return; } - if (cols == 1) { - run_kernel_fixed_cols_impl<1>(exec, fn, size, args...); - return; - } - if (cols == 2) { - run_kernel_fixed_cols_impl<2>(exec, fn, size, args...); - return; - } - if (cols == 3) { - run_kernel_fixed_cols_impl<3>(exec, fn, size, args...); - return; - } - if (cols == 4) { - run_kernel_fixed_cols_impl<4>(exec, fn, size, args...); - return; - } - const auto rem_cols = cols % block_size; - if (rem_cols == 0) { - run_kernel_blocked_cols_impl<0, block_size>(exec, fn, size, args...); - return; - } - if (rem_cols == 1) { - run_kernel_blocked_cols_impl<1, block_size>(exec, fn, size, args...); - return; - } - if (rem_cols == 2) { - run_kernel_blocked_cols_impl<2, block_size>(exec, fn, size, args...); - return; - } - if (rem_cols == 3) { - run_kernel_blocked_cols_impl<3, block_size>(exec, fn, size, args...); - return; - } - // should be unreachable - GKO_ASSERT(false); + select_run_kernel_sized( + remainders(), + [&](int remainder) { return remainder == cols % block_size; }, + syn::value_list(), syn::type_list<>(), exec, fn, size, + args...); +} + + +} // namespace + + +template +void run_kernel(std::shared_ptr exec, KernelFunction fn, + size_type size, KernelArgs&&... args) +{ + run_kernel_impl(exec, fn, size, map_to_device(args)...); } diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp new file mode 100644 index 00000000000..030da93c245 --- /dev/null +++ b/omp/base/kernel_launch_reduction.hpp @@ -0,0 +1,405 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" +#endif + + +#include + + +#include + + +namespace gko { +namespace kernels { +namespace omp { + + +// how many more reduction tasks we launch relative to the number of threads +constexpr int reduction_kernel_oversubscription = 4; + + +namespace { + + +template +void run_kernel_reduction_impl(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + MappedKernelArgs... args) +{ + const auto num_threads = static_cast(omp_get_max_threads()); + const auto ssize = static_cast(size); + const auto work_per_thread = ceildiv(ssize, num_threads); + Array partial{exec, static_cast(num_threads)}; +#pragma omp parallel num_threads(num_threads) + { + const auto thread_id = omp_get_thread_num(); + const auto begin = thread_id * work_per_thread; + const auto end = std::min(ssize, begin + work_per_thread); + + auto local_partial = identity; + for (auto i = begin; i < end; i++) { + local_partial = op(local_partial, fn(i, map_to_device(args)...)); + } + partial.get_data()[thread_id] = local_partial; + } + *result = finalize(std::accumulate(partial.get_const_data(), + partial.get_const_data() + num_threads, + identity, op)); +} + + +template +void run_kernel_reduction_sized_impl(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + MappedKernelArgs... args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_threads = static_cast(omp_get_max_threads()); + const auto work_per_thread = ceildiv(rows, num_threads); + Array partial{exec, static_cast(num_threads)}; + static_assert(remainder_cols < block_size, "remainder too large"); + const auto rounded_cols = cols / block_size * block_size; + GKO_ASSERT(rounded_cols + remainder_cols == cols); +#pragma omp parallel + { + const auto thread_id = omp_get_thread_num(); + const auto begin = thread_id * work_per_thread; + const auto end = std::min(rows, begin + work_per_thread); + + auto local_partial = identity; + if (rounded_cols == 0 || cols == block_size) { + // we group all sizes <= block_size here and unroll explicitly + constexpr auto local_cols = + remainder_cols == 0 ? 
block_size : remainder_cols; + for (auto row = begin; row < end; row++) { +#pragma unroll + for (int64 col = 0; col < local_cols; col++) { + local_partial = op(local_partial, fn(row, col, args...)); + } + } + } else { + // we operate in block_size blocks plus an explicitly unrolled + // remainder + for (auto row = begin; row < end; row++) { + for (int64 base_col = 0; base_col < rounded_cols; + base_col += block_size) { +#pragma unroll + for (int64 i = 0; i < block_size; i++) { + local_partial = + op(local_partial, fn(row, base_col + i, args...)); + } + } +#pragma unroll + for (int64 i = 0; i < remainder_cols; i++) { + local_partial = + op(local_partial, fn(row, rounded_cols + i, args...)); + } + } + } + partial.get_data()[thread_id] = local_partial; + } + *result = finalize(std::accumulate(partial.get_const_data(), + partial.get_const_data() + num_threads, + identity, op)); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, + run_kernel_reduction_sized_impl) + + +} // namespace + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + KernelArgs&&... args) +{ + run_kernel_reduction_impl(exec, fn, op, finalize, identity, result, size, + map_to_device(args)...); +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, KernelArgs&&... args) +{ + const auto cols = static_cast(size[1]); + constexpr int block_size = 8; + using remainders = syn::as_list>; + + if (cols <= 0) { + *result = identity; + return; + } + select_run_kernel_reduction_sized( + remainders(), + [&](int remainder) { return remainder == cols % block_size; }, + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, size, map_to_device(args)...); +} + + +namespace { + + +template +void run_kernel_row_reduction_impl(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, MappedKernelArgs... 
args) +{ + constexpr int block_size = 8; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_threads = static_cast(omp_get_max_threads()); + if (rows <= 0) { + return; + } + // enough work to keep all threads busy or only very small reduction sizes + if (rows >= reduction_kernel_oversubscription * num_threads || + cols < rows) { +#pragma omp parallel for + for (int64 row = 0; row < rows; row++) { + [&]() { + auto partial = identity; + for (int64 col = 0; col < cols; col++) { + partial = op(partial, fn(row, col, args...)); + } + result[result_stride * row] = finalize(partial); + }(); + } + } else { + // small number of rows and large reduction sizes: do partial sum first + const auto work_per_thread = ceildiv(cols, num_threads); + Array partial{exec, + static_cast(rows * num_threads)}; +#pragma omp parallel num_threads(num_threads) + { + const auto thread_id = static_cast(omp_get_thread_num()); + const auto begin = thread_id * work_per_thread; + const auto end = std::min(begin + work_per_thread, cols); + for (int64 row = 0; row < rows; row++) { + auto local_partial = identity; + for (int64 col = begin; col < end; col++) { + local_partial = op(local_partial, [&]() { + return fn(row, col, args...); + }()); + } + partial.get_data()[row * num_threads + thread_id] = + local_partial; + } + } + // then accumulate the partial sums and write to result +#pragma omp parallel for + for (int64 row = 0; row < rows; row++) { + [&] { + auto local_partial = identity; + for (int64 thread_id = 0; thread_id < num_threads; + thread_id++) { + local_partial = op( + local_partial, + partial + .get_const_data()[row * num_threads + thread_id]); + } + result[row * result_stride] = finalize(local_partial); + }(); + } + } +} + + +template +void run_kernel_col_reduction_sized_block_impl( + KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, int64 row_begin, int64 row_end, int64 base_col, + MappedKernelArgs... args) +{ + std::array partial; + partial.fill(identity); + for (auto row = row_begin; row < row_end; row++) { +#pragma unroll + for (int64 rel_col = 0; rel_col < local_cols; rel_col++) { + partial[rel_col] = + op(partial[rel_col], fn(row, base_col + rel_col, args...)); + } + } +#pragma unroll + for (int64 rel_col = 0; rel_col < local_cols; rel_col++) { + result[base_col + rel_col] = finalize(partial[rel_col]); + } +} + + +template +void run_kernel_col_reduction_sized_impl( + syn::value_list, + std::shared_ptr exec, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, + MappedKernelArgs... 
args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_threads = static_cast(omp_get_max_threads()); + static_assert(remainder_cols < block_size, "remainder too large"); + GKO_ASSERT(cols % block_size == remainder_cols); + const auto num_col_blocks = ceildiv(cols, block_size); + // enough work to keep all threads busy or only very small reduction sizes + if (cols >= reduction_kernel_oversubscription * num_threads || + rows < cols) { +#pragma omp parallel for + for (int64 col_block = 0; col_block < num_col_blocks; col_block++) { + const auto base_col = col_block * block_size; + if (base_col + block_size <= cols) { + run_kernel_col_reduction_sized_block_impl( + fn, op, finalize, identity, result, 0, rows, base_col, + args...); + } else { + run_kernel_col_reduction_sized_block_impl( + fn, op, finalize, identity, result, 0, rows, base_col, + args...); + } + } + } else { + // number of blocks that need to be reduced afterwards + const auto reduction_size = + ceildiv(reduction_kernel_oversubscription * num_threads, cols); + const auto rows_per_thread = ceildiv(rows, reduction_size); + Array partial{exec, + static_cast(reduction_size * cols)}; +#pragma omp parallel for + for (int64 i = 0; i < reduction_size * num_col_blocks; i++) { + const auto col_block = i % num_col_blocks; + const auto row_block = i / num_col_blocks; + const auto begin = row_block * rows_per_thread; + const auto end = std::min(begin + rows_per_thread, rows); + const auto base_col = col_block * block_size; + const auto identity_fn = [](auto i) { return i; }; + if (base_col + block_size <= cols) { + run_kernel_col_reduction_sized_block_impl( + fn, op, identity_fn, identity, + partial.get_data() + cols * row_block, begin, end, base_col, + args...); + } else { + run_kernel_col_reduction_sized_block_impl( + fn, op, identity_fn, identity, + partial.get_data() + cols * row_block, begin, end, base_col, + args...); + } + } +#pragma omp parallel for + for (int64 col = 0; col < cols; col++) { + [&] { + auto total = identity; + for (int64 row_block = 0; row_block < reduction_size; + row_block++) { + total = + op(total, + partial.get_const_data()[col + cols * row_block]); + } + result[col] = finalize(total); + }(); + } + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_col_reduction_sized, + run_kernel_col_reduction_sized_impl) + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, MappedKernelArgs... args) +{ + run_kernel_row_reduction_impl(exec, fn, op, finalize, identity, result, + result_stride, size, map_to_device(args)...); +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + KernelArgs&&... 
args) +{ + constexpr auto block_size = 8; + using remainders = syn::as_list>; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + if (cols <= 0) { + return; + } + select_run_kernel_col_reduction_sized( + remainders(), + [&](int remainder) { return remainder == cols % block_size; }, + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, size, map_to_device(args)...); +} + + +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/base/kernel_launch_solver.hpp b/omp/base/kernel_launch_solver.hpp index dd85ba21915..b5c936c847b 100644 --- a/omp/base/kernel_launch_solver.hpp +++ b/omp/base/kernel_launch_solver.hpp @@ -43,7 +43,7 @@ namespace omp { template typename device_unpack_solver_impl::type>::type -map_to_device_solver(T&& param, size_type default_stride) +map_to_device_solver(T&& param, int64 default_stride) { return device_unpack_solver_impl::type>:: unpack(to_device_type_impl::map_to_device(param), default_stride); @@ -55,8 +55,9 @@ void run_kernel_solver(std::shared_ptr exec, KernelFunction fn, dim<2> size, size_type default_stride, KernelArgs&&... args) { - run_kernel_impl(exec, fn, size, - map_to_device_solver(args, default_stride)...); + run_kernel_impl( + exec, fn, size, + map_to_device_solver(args, static_cast(default_stride))...); } diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index c0e4ca75ae3..b9df5fddf24 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -127,73 +127,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = zero(); - } -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - for (size_type i = 0; i < x->get_size()[0]; ++i) { - result->at(0, j) += x->at(i, j) * y->at(i, j); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = zero(); - } -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - for (size_type i = 0; i < x->get_size()[0]; ++i) { - result->at(0, j) += conj(x->at(i, j)) * y->at(i, j); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - using norm_type = remove_complex; -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = zero(); - } -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - for (size_type i = 0; i < x->get_size()[0]; ++i) { - result->at(0, j) += squared_norm(x->at(i, j)); - } - } -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = sqrt(result->at(0, j)); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp index 2c4712cfa52..f2c6a7e1465 
100644 --- a/omp/test/base/kernel_launch.cpp +++ b/omp/test/base/kernel_launch.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -54,6 +55,7 @@ namespace { using gko::dim; +using gko::int64; using gko::size_type; using std::is_same; @@ -96,7 +98,7 @@ TEST_F(KernelLaunch, Runs1D) gko::kernels::omp::run_kernel( exec, [] GKO_KERNEL(auto i, auto d) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i] = i; }, @@ -111,7 +113,7 @@ TEST_F(KernelLaunch, Runs1DArray) gko::kernels::omp::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -131,7 +133,7 @@ TEST_F(KernelLaunch, Runs1DDense) gko::kernels::omp::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); @@ -163,8 +165,8 @@ TEST_F(KernelLaunch, Runs2D) gko::kernels::omp::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i + 4 * j] = 4 * i + j; }, @@ -179,8 +181,8 @@ TEST_F(KernelLaunch, Runs2DArray) gko::kernels::omp::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -201,7 +203,7 @@ TEST_F(KernelLaunch, Runs2DDense) exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, auto d4, auto d2_ptr, auto d3_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); @@ -238,5 +240,204 @@ TEST_F(KernelLaunch, Runs2DDense) GKO_ASSERT_MTX_NEAR(zero_dense2, iota_dense, 0.0); } +TEST_F(KernelLaunch, Reduction1D) +{ + gko::Array output{exec, 1}; + + gko::kernels::omp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{100000}, output); + + // 2 * sum i=0...99999 (i+1) + ASSERT_EQ(*output.get_const_data(), 10000100000LL); + + gko::kernels::omp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{10}, output); + + // 2 * sum i=0...9 (i+1) + ASSERT_EQ(*output.get_const_data(), 110LL); +} + + +TEST_F(KernelLaunch, Reduction2DSmallRows) +{ + gko::Array output{exec, 1}; + + for (int cols = 0; cols < 17; cols++) { + 
gko::kernels::omp::run_kernel_reduction(
+            exec,
+            [] GKO_KERNEL(auto i, auto j, auto a) {
+                static_assert(is_same::value, "index");
+                static_assert(is_same::value, "value");
+                return (i + 1) * (j + 1);
+            },
+            [] GKO_KERNEL(auto i, auto j) { return i + j; },
+            [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+            gko::dim<2>{10, cols}, output);
+
+        // 4 * sum i=0...9 sum j=0...cols-1 of (i+1)*(j+1)
+        ASSERT_EQ(*output.get_const_data(), 110LL * cols * (cols + 1));
+    }
+}
+
+
+TEST_F(KernelLaunch, Reduction2DLargeRows)
+{
+    gko::Array output{exec, 1};
+
+    for (int cols = 0; cols < 17; cols++) {
+        gko::kernels::omp::run_kernel_reduction(
+            exec,
+            [] GKO_KERNEL(auto i, auto j, auto a) {
+                static_assert(is_same::value, "index");
+                static_assert(is_same::value, "value");
+                return (i + 1) * (j + 1);
+            },
+            [] GKO_KERNEL(auto i, auto j) { return i + j; },
+            [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+            gko::dim<2>{1000, cols}, output);
+
+        // 4 * sum i=0...999 sum j=0...cols-1 of (i+1)*(j+1)
+        ASSERT_EQ(*output.get_const_data(), 1001000LL * cols * (cols + 1));
+    }
+}
+
+
+TEST_F(KernelLaunch, Reduction2D)
+{
+    gko::Array output{exec, 1};
+
+    gko::kernels::omp::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+        gko::dim<2>{1000, 100}, output);
+
+
+    // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1)
+    ASSERT_EQ(*output.get_const_data(), 10110100000LL);
+}
+
+
+TEST_F(KernelLaunch, ReductionRow2DSmall)
+{
+    // 4 rows, with oversubscription this means we use multiple threads per row
+    // if OMP_NUM_THREADS >= 2
+    int num_rows = 4;
+    int num_cols = 100;
+    gko::Array host_ref{exec->get_master(),
+                               static_cast(2 * num_rows)};
+    std::fill_n(host_ref.get_data(), 2 * num_rows, 1234);
+    gko::Array output{exec, host_ref};
+    for (int i = 0; i < num_rows; i++) {
+        // we are computing 2 * sum {j=0, j<num_cols} of (i+1)*(j+1)
+        host_ref.get_data()[2 * i] =
+            static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1);
+    }
+
+    gko::kernels::omp::run_kernel_row_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2,
+        gko::dim<2>{static_cast(num_rows),
+                    static_cast(num_cols)},
+        output);
+
+    GKO_ASSERT_ARRAY_EQ(host_ref, output);
+}
+
+
+TEST_F(KernelLaunch, ReductionRow2D)
+{
+    int num_rows = 1000;
+    int num_cols = 100;
+    gko::Array host_ref{exec->get_master(),
+                               static_cast(2 * num_rows)};
+    std::fill_n(host_ref.get_data(), 2 * num_rows, 1234);
+    gko::Array output{exec, host_ref};
+    for (int i = 0; i < num_rows; i++) {
+        // we are computing 2 * sum {j=0, j<num_cols} of (i+1)*(j+1)
+        host_ref.get_data()[2 * i] =
+            static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1);
+    }
+
+    gko::kernels::omp::run_kernel_row_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2,
+        gko::dim<2>{static_cast(num_rows),
+                    static_cast(num_cols)},
+        output);
+
+    GKO_ASSERT_ARRAY_EQ(host_ref, output);
+}
+
+
+TEST_F(KernelLaunch, ReductionCol2D)
+{
+    for (int num_rows : {0, 1, 10, 1000, 1000}) {
+        for (int num_cols :
+             {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 100, 1000}) {
+            gko::Array host_ref{exec->get_master(),
+                                       static_cast(num_cols)};
+            gko::Array output{exec, static_cast(num_cols)};
+            for (int i = 0; i < num_cols; i++) {
+                // we are computing 2 * sum {j=0, j<num_rows} of (i+1)*(j+1)
+                host_ref.get_data()[i] =
+                    static_cast<int64>(num_rows) * (num_rows + 1) * (i + 1);
+            }
+
+            gko::kernels::omp::run_kernel_col_reduction(
+                exec,
+                [] GKO_KERNEL(auto i, auto j, auto a) {
+                    static_assert(is_same::value, "index");
+                    static_assert(is_same::value, "value");
+                    return (i + 1) * (j + 1);
+                },
+                [] GKO_KERNEL(auto i, auto j) { return i + j; },
+                [] GKO_KERNEL(auto j) { return j * 2; }, int64{},
+                output.get_data(),
+                gko::dim<2>{static_cast(num_rows),
+                            static_cast(num_cols)},
+                output);
+
+            GKO_ASSERT_ARRAY_EQ(host_ref, output);
+        }
+    }
+}
+
+
 } // namespace
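
Note: the hand-written compute_dot, compute_conj_dot, and compute_norm2 kernels deleted from hip/matrix/dense_kernels.hip.cpp and omp/matrix/dense_kernels.cpp are not re-added in any hunk shown here; presumably they are re-expressed on top of the new run_kernel_*_reduction interface in a part of the change that is not included above. As a rough, hypothetical illustration of how such a kernel maps onto the interface exercised by the tests (the accessor-style x(i, j) indexing for Dense arguments and the DefaultExecutor alias are assumptions, not something this diff shows), a column-wise dot product could be sketched as:

    // Hypothetical sketch only, not the PR's actual unified kernel.
    template <typename ValueType>
    void compute_dot_sketch(std::shared_ptr<const DefaultExecutor> exec,
                            const matrix::Dense<ValueType>* x,
                            const matrix::Dense<ValueType>* y,
                            matrix::Dense<ValueType>* result)
    {
        run_kernel_col_reduction(
            exec,
            // fn: the value contributed by entry (i, j)
            [] GKO_KERNEL(auto i, auto j, auto x, auto y) {
                return x(i, j) * y(i, j);
            },
            // op: combines two partial results
            [] GKO_KERNEL(auto a, auto b) { return a + b; },
            // finalize: applied once to each column's result
            [] GKO_KERNEL(auto a) { return a; },
            // identity, output (one value per column), domain, kernel args
            ValueType{}, result->get_values(), x->get_size(), x, y);
    }

The argument order (fn, op, finalize, identity, result, size, args...) follows the run_kernel_col_reduction signature introduced for OpenMP above and the calls in the HIP and OpenMP tests.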
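
The OpenMP row reduction added in omp/base/kernel_launch_reduction.hpp picks between two strategies: when there are at least reduction_kernel_oversubscription * num_threads rows, or each row is shorter than the number of rows, every thread reduces whole rows on its own; otherwise each row is split across all threads and the per-thread partial results are combined in a second pass. A minimal standalone sketch of that second strategy, using plain OpenMP, std::int64_t instead of gko::int64, and addition as the reduction operator, might look like this:

    #include <omp.h>

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Each thread reduces a contiguous slice of every row; a second pass
    // combines the per-(row, thread) partials, mirroring the "few rows,
    // long rows" branch of run_kernel_row_reduction_impl.
    int main()
    {
        const std::int64_t rows = 4;
        const std::int64_t cols = 1 << 20;
        const auto num_threads =
            static_cast<std::int64_t>(omp_get_max_threads());
        const auto work_per_thread = (cols + num_threads - 1) / num_threads;
        std::vector<std::int64_t> partial(rows * num_threads, 0);
        std::vector<std::int64_t> result(rows, 0);

    #pragma omp parallel
        {
            const std::int64_t tid = omp_get_thread_num();
            const auto begin = tid * work_per_thread;
            const auto end = std::min(cols, begin + work_per_thread);
            for (std::int64_t row = 0; row < rows; row++) {
                std::int64_t local = 0;  // identity of the reduction
                for (auto col = begin; col < end; col++) {
                    local += row + col;  // stand-in for fn(row, col, args...)
                }
                partial[row * num_threads + tid] = local;
            }
        }
        // second pass: accumulate the per-thread partials for each row
        for (std::int64_t row = 0; row < rows; row++) {
            for (std::int64_t tid = 0; tid < num_threads; tid++) {
                result[row] += partial[row * num_threads + tid];
            }
        }
        std::printf("row 0 sum: %lld\n", static_cast<long long>(result[0]));
        return 0;
    }

Splitting the columns first and reducing the per-thread partials afterwards keeps all threads busy even when the number of rows is far below the core count, at the cost of a rows * num_threads temporary, which is the same trade-off the Ginkgo implementation makes with its partial array.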