From 631b4f6950963d7511f69d78703784b9118e5c51 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Mon, 12 Jul 2021 15:44:09 +0200
Subject: [PATCH 01/25] use signed integers in simple kernels

---
 common/unified/base/kernel_launch.hpp        | 13 +++++----
 common/unified/base/kernel_launch_solver.hpp |  6 ++--
 cuda/base/kernel_launch.cuh                  | 15 +++++-----
 cuda/base/kernel_launch_solver.cuh           |  7 +++--
 cuda/test/base/kernel_launch.cu              | 17 ++++++------
 dpcpp/base/kernel_launch.dp.hpp              | 17 +++++++-----
 dpcpp/base/kernel_launch_solver.dp.hpp       | 13 +++++----
 dpcpp/test/base/kernel_launch.dp.cpp         | 19 ++++++-------
 hip/base/kernel_launch.hip.hpp               | 17 ++++++------
 hip/base/kernel_launch_solver.hip.hpp        |  8 ++++--
 hip/test/base/kernel_launch.hip.cpp          | 17 ++++++------
 omp/base/kernel_launch.hpp                   | 29 ++++++++++----------
 omp/base/kernel_launch_solver.hpp            |  7 +++--
 omp/test/base/kernel_launch.cpp              | 17 ++++++------
 14 files changed, 108 insertions(+), 94 deletions(-)

diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index 6b3a698768c..bf403d3a656 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -170,14 +170,13 @@ namespace GKO_DEVICE_NAMESPACE {

 template <typename ValueType>
 struct matrix_accessor {
     ValueType* data;
-    size_type stride;
+    int64 stride;

     /**
      * @internal
      * Returns a reference to the element at position (row, col).
      */
-    GKO_INLINE GKO_ATTRIBUTES ValueType& operator()(size_type row,
-                                                    size_type col)
+    GKO_INLINE GKO_ATTRIBUTES ValueType& operator()(int64 row, int64 col)
     {
         return data[row * stride + col];
     }
@@ -187,7 +186,7 @@ struct matrix_accessor {
      * Returns a reference to the element at position idx in the underlying
      * storage.
      */
-    GKO_INLINE GKO_ATTRIBUTES ValueType& operator[](size_type idx)
+    GKO_INLINE GKO_ATTRIBUTES ValueType& operator[](int64 idx)
     {
         return data[idx];
     }
@@ -223,7 +222,8 @@ struct to_device_type_impl<matrix::Dense<ValueType>*&> {
     using type = matrix_accessor<device_type<ValueType>>;
     static type map_to_device(matrix::Dense<ValueType>* mtx)
     {
-        return {as_device_type(mtx->get_values()), mtx->get_stride()};
+        return {as_device_type(mtx->get_values()),
+                static_cast<int64>(mtx->get_stride())};
     }
 };

@@ -232,7 +232,8 @@ struct to_device_type_impl<const matrix::Dense<ValueType>*&> {
     using type = matrix_accessor<const device_type<ValueType>>;
     static type map_to_device(const matrix::Dense<ValueType>* mtx)
     {
-        return {as_device_type(mtx->get_const_values()), mtx->get_stride()};
+        return {as_device_type(mtx->get_const_values()),
+                static_cast<int64>(mtx->get_stride())};
     }
 };

diff --git a/common/unified/base/kernel_launch_solver.hpp b/common/unified/base/kernel_launch_solver.hpp
index 6c8a1296b83..716bd94a093 100644
--- a/common/unified/base/kernel_launch_solver.hpp
+++ b/common/unified/base/kernel_launch_solver.hpp
@@ -63,7 +63,7 @@ struct default_stride_dense_wrapper {
 template <typename T>
 struct device_unpack_solver_impl {
     using type = T;
-    static GKO_INLINE GKO_ATTRIBUTES type unpack(T param, size_type)
+    static GKO_INLINE GKO_ATTRIBUTES type unpack(T param, int64)
     {
         return param;
     }
@@ -72,8 +72,8 @@ struct device_unpack_solver_impl {
 template <typename ValueType>
 struct device_unpack_solver_impl<default_stride_dense_wrapper<ValueType>> {
     using type = matrix_accessor<ValueType>;
-    static GKO_INLINE GKO_ATTRIBUTES type unpack(
-        default_stride_dense_wrapper<ValueType> param, size_type default_stride)
+    static GKO_INLINE GKO_ATTRIBUTES type
+    unpack(default_stride_dense_wrapper<ValueType> param, int64 default_stride)
     {
         return {param.data, default_stride};
     }

diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh
index d55faed5053..5179a5cc27d 100644
--- a/cuda/base/kernel_launch.cuh
+++ b/cuda/base/kernel_launch.cuh
@@ -51,9 +51,9 @@ constexpr int default_block_size = 512;

 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
-    size_type size, KernelFunction fn, KernelArgs... args)
+    int64 size, KernelFunction fn, KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     if (tidx >= size) {
         return;
     }
@@ -63,9 +63,9 @@ __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(

 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d(
-    size_type rows, size_type cols, KernelFunction fn, KernelArgs... args)
+    int64 rows, int64 cols, KernelFunction fn, KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     auto col = tidx % cols;
     auto row = tidx / cols;
     if (row >= rows) {
@@ -82,7 +82,7 @@ void run_kernel(std::shared_ptr<const CudaExecutor> exec, KernelFunction fn,
     gko::cuda::device_guard guard{exec->get_device_id()};
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size, block_size);
-    generic_kernel_1d<<<num_blocks, block_size>>>(size, fn,
+    generic_kernel_1d<<<num_blocks, block_size>>>(static_cast<int64>(size), fn,
                                                   map_to_device(args)...);
 }

@@ -93,8 +93,9 @@ void run_kernel(std::shared_ptr<const CudaExecutor> exec, KernelFunction fn,
     gko::cuda::device_guard guard{exec->get_device_id()};
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size[0] * size[1], block_size);
-    generic_kernel_2d<<<num_blocks, block_size>>>(size[0], size[1], fn,
-                                                  map_to_device(args)...);
+    generic_kernel_2d<<<num_blocks, block_size>>>(static_cast<int64>(size[0]),
+                                                  static_cast<int64>(size[1]),
+                                                  fn, map_to_device(args)...);
 }

diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh
index bf2f6e1a995..f4da60ddede 100644
--- a/cuda/base/kernel_launch_solver.cuh
+++ b/cuda/base/kernel_launch_solver.cuh
@@ -43,10 +43,10 @@ namespace cuda {

 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver(
-    size_type rows, size_type cols, size_type default_stride, KernelFunction fn,
+    int64 rows, int64 cols, int64 default_stride, KernelFunction fn,
     KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     auto col = tidx % cols;
     auto row = tidx / cols;
     if (row >= rows) {
@@ -66,7 +66,8 @@ void run_kernel_solver(std::shared_ptr<const CudaExecutor> exec,
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size[0] * size[1], block_size);
     generic_kernel_2d_solver<<<num_blocks, block_size>>>(
-        size[0], size[1], default_stride, fn, map_to_device(args)...);
+        static_cast<int64>(size[0]), static_cast<int64>(size[1]),
+        static_cast<int64>(default_stride), fn, map_to_device(args)...);
 }

diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu
index abd4775290c..adf443445a5 100644
--- a/cuda/test/base/kernel_launch.cu
+++ b/cuda/test/base/kernel_launch.cu
@@ -54,6 +54,7 @@ namespace {

 using gko::dim;
+using gko::int64;
 using gko::size_type;
 using std::is_same;

@@ -104,7 +105,7 @@ void run1d(std::shared_ptr<gko::CudaExecutor> exec, size_type dim, int* data)
     gko::kernels::cuda::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i] = i;
         },
@@ -124,7 +125,7 @@ void run1d(std::shared_ptr<gko::CudaExecutor> exec, gko::Array<int>& data)
     gko::kernels::cuda::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -149,7 +150,7 @@ void run1d(std::shared_ptr<gko::CudaExecutor> exec, gko::matrix::Dense<>* m)
     gko::kernels::cuda::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double*, decltype(d)>::value, "type");
             static_assert(is_same<const double*, decltype(d2)>::value, "type");
@@ -185,8 +186,8 @@ void run2d(std::shared_ptr<gko::CudaExecutor> exec, int* data)
     gko::kernels::cuda::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i + 4 * j] = 4 * i + j;
         },
@@ -206,8 +207,8 @@ void run2d(std::shared_ptr<gko::CudaExecutor> exec, gko::Array<int>& data)
     gko::kernels::cuda::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -234,7 +235,7 @@ void run2d(std::shared_ptr<gko::CudaExecutor> exec, gko::matrix::Dense<>* m1,
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3,
                       auto d4, auto d2_ptr, auto d3_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double&, decltype(d(0, 0))>::value, "type");
             static_assert(is_same<double&, decltype(d2(0, 0))>::value, "type");

diff --git a/dpcpp/base/kernel_launch.dp.hpp b/dpcpp/base/kernel_launch.dp.hpp
index 4fe161ff320..0941fc7d524 100644
--- a/dpcpp/base/kernel_launch.dp.hpp
+++ b/dpcpp/base/kernel_launch.dp.hpp
@@ -45,23 +45,23 @@ namespace dpcpp {

 template <typename KernelFunction, typename... KernelArgs>
-void generic_kernel_1d(sycl::handler& cgh, size_type size, KernelFunction fn,
+void generic_kernel_1d(sycl::handler& cgh, int64 size, KernelFunction fn,
                        KernelArgs... args)
 {
     cgh.parallel_for(sycl::range<1>{size}, [=](sycl::id<1> idx_id) {
-        auto idx = static_cast<size_type>(idx_id[0]);
+        auto idx = static_cast<int64>(idx_id[0]);
         fn(idx, args...);
     });
 }

 template <typename KernelFunction, typename... KernelArgs>
-void generic_kernel_2d(sycl::handler& cgh, size_type rows, size_type cols,
+void generic_kernel_2d(sycl::handler& cgh, int64 rows, int64 cols,
                        KernelFunction fn, KernelArgs... args)
 {
     cgh.parallel_for(sycl::range<2>{rows, cols}, [=](sycl::id<2> idx) {
-        auto row = static_cast<size_type>(idx[0]);
-        auto col = static_cast<size_type>(idx[1]);
+        auto row = static_cast<int64>(idx[0]);
+        auto col = static_cast<int64>(idx[1]);
         fn(row, col, args...);
     });
 }
@@ -72,7 +72,8 @@ void run_kernel(std::shared_ptr<const DpcppExecutor> exec, KernelFunction fn,
                 size_type size, KernelArgs&&... args)
 {
     exec->get_queue()->submit([&](sycl::handler& cgh) {
-        generic_kernel_1d(cgh, size, fn, map_to_device(args)...);
+        generic_kernel_1d(cgh, static_cast<int64>(size), fn,
+                          map_to_device(args)...);
     });
 }

@@ -81,7 +82,9 @@ void run_kernel(std::shared_ptr<const DpcppExecutor> exec, KernelFunction fn,
                 dim<2> size, KernelArgs&&... args)
 {
     exec->get_queue()->submit([&](sycl::handler& cgh) {
-        generic_kernel_2d(cgh, size[0], size[1], fn, map_to_device(args)...);
+        generic_kernel_2d(cgh, static_cast<int64>(size[0]),
+                          static_cast<int64>(size[1]), fn,
+                          map_to_device(args)...);
     });
 }

diff --git a/dpcpp/base/kernel_launch_solver.dp.hpp b/dpcpp/base/kernel_launch_solver.dp.hpp
index 5cec5b55d79..aa25d167bf3 100644
--- a/dpcpp/base/kernel_launch_solver.dp.hpp
+++ b/dpcpp/base/kernel_launch_solver.dp.hpp
@@ -42,13 +42,13 @@ namespace dpcpp {

 template <typename KernelFunction, typename... KernelArgs>
-void generic_kernel_2d_solver(sycl::handler& cgh, size_type rows,
-                              size_type cols, size_type default_stride,
-                              KernelFunction fn, KernelArgs... args)
+void generic_kernel_2d_solver(sycl::handler& cgh, int64 rows, int64 cols,
+                              int64 default_stride, KernelFunction fn,
+                              KernelArgs... args)
 {
     cgh.parallel_for(sycl::range<2>{rows, cols}, [=](sycl::id<2> idx) {
-        auto row = static_cast<size_type>(idx[0]);
-        auto col = static_cast<size_type>(idx[1]);
+        auto row = static_cast<int64>(idx[0]);
+        auto col = static_cast<int64>(idx[1]);
         fn(row, col,
            device_unpack_solver_impl<KernelArgs>::unpack(args,
                                                          default_stride)...);
@@ -63,7 +63,8 @@ void run_kernel_solver(std::shared_ptr<const DpcppExecutor> exec,
 {
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         kernels::dpcpp::generic_kernel_2d_solver(
-            cgh, size[0], size[1], default_stride, fn,
+            cgh, static_cast<int64>(size[0]), static_cast<int64>(size[1]),
+            static_cast<int64>(default_stride), fn,
             kernels::dpcpp::map_to_device(args)...);
     });
 }

diff --git a/dpcpp/test/base/kernel_launch.dp.cpp b/dpcpp/test/base/kernel_launch.dp.cpp
index 27d3f1abd12..decd2e8c64a 100644
--- a/dpcpp/test/base/kernel_launch.dp.cpp
+++ b/dpcpp/test/base/kernel_launch.dp.cpp
@@ -54,6 +54,7 @@ namespace {

 using gko::dim;
+using gko::int64;
 using gko::size_type;
 using std::is_same;

@@ -110,7 +111,7 @@ TEST_F(KernelLaunch, Runs1D)
     gko::kernels::dpcpp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i] = i;
         },
@@ -125,7 +126,7 @@ TEST_F(KernelLaunch, Runs1DArray)
     gko::kernels::dpcpp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -145,7 +146,7 @@ TEST_F(KernelLaunch, Runs1DDense)
     gko::kernels::dpcpp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double*, decltype(d)>::value, "type");
             static_assert(is_same<const double*, decltype(d2)>::value,
                           "type");
@@ -177,8 +178,8 @@ TEST_F(KernelLaunch, Runs2D)
     gko::kernels::dpcpp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i + 4 * j] = 4 * i + j;
         },
@@ -193,8 +194,8 @@ TEST_F(KernelLaunch, Runs2DArray)
     gko::kernels::dpcpp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -215,11 +216,9 @@ TEST_F(KernelLaunch, Runs2DDense)
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3,
                       auto d4, auto d2_ptr, auto d3_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double&, decltype(d(0, 0))>::value, "type");
-            static_assert(is_same<size_type, decltype(d.stride)>::value,
-                          "type");
             static_assert(is_same<double&, decltype(d2(0, 0))>::value, "type");
             static_assert(is_same<const double*, decltype(d_ptr)>::value,

diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp
index 8967ee5597d..6c627838fea 100644
--- a/hip/base/kernel_launch.hip.hpp
+++ b/hip/base/kernel_launch.hip.hpp
@@ -54,9 +54,9 @@ constexpr int default_block_size = 512;

 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
-    size_type size, KernelFunction fn, KernelArgs... args)
+    int64 size, KernelFunction fn, KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     if (tidx >= size) {
         return;
     }
@@ -66,9 +66,9 @@ __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(

 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d(
-    size_type rows, size_type cols, KernelFunction fn, KernelArgs... args)
+    int64 rows, int64 cols, KernelFunction fn, KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     auto col = tidx % cols;
     auto row = tidx / cols;
     if (row >= rows) {
@@ -85,8 +85,8 @@ void run_kernel(std::shared_ptr<const HipExecutor> exec, KernelFunction fn,
     gko::hip::device_guard guard{exec->get_device_id()};
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size, block_size);
-    hipLaunchKernelGGL(generic_kernel_1d, num_blocks, block_size, 0, 0, size,
-                       fn, map_to_device(args)...);
+    hipLaunchKernelGGL(generic_kernel_1d, num_blocks, block_size, 0, 0,
+                       static_cast<int64>(size), fn, map_to_device(args)...);
 }

@@ -96,8 +96,9 @@ void run_kernel(std::shared_ptr<const HipExecutor> exec, KernelFunction fn,
     gko::hip::device_guard guard{exec->get_device_id()};
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size[0] * size[1], block_size);
-    hipLaunchKernelGGL(generic_kernel_2d, num_blocks, block_size, 0, 0, size[0],
-                       size[1], fn, map_to_device(args)...);
+    hipLaunchKernelGGL(generic_kernel_2d, num_blocks, block_size, 0, 0,
+                       static_cast<int64>(size[0]), static_cast<int64>(size[1]),
+                       fn, map_to_device(args)...);
 }

diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp
index a8335851a0e..9798f6c4fbc 100644
--- a/hip/base/kernel_launch_solver.hip.hpp
+++ b/hip/base/kernel_launch_solver.hip.hpp
@@ -46,10 +46,10 @@ namespace hip {

 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver(
-    size_type rows, size_type cols, size_type default_stride, KernelFunction fn,
+    int64 rows, int64 cols, int64 default_stride, KernelFunction fn,
     KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     auto col = tidx % cols;
     auto row = tidx / cols;
     if (row >= rows) {
@@ -69,7 +69,9 @@ void run_kernel_solver(std::shared_ptr<const HipExecutor> exec,
     constexpr auto block_size = kernels::hip::default_block_size;
     auto num_blocks = ceildiv(size[0] * size[1], block_size);
     hipLaunchKernelGGL(kernels::hip::generic_kernel_2d_solver, num_blocks,
-                       block_size, 0, 0, size[0], size[1], default_stride, fn,
+                       block_size, 0, 0, static_cast<int64>(size[0]),
+                       static_cast<int64>(size[1]),
+                       static_cast<int64>(default_stride), fn,
                        kernels::hip::map_to_device(args)...);
 }

diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp
index 55ddb3fd01e..ad3ba3cc643 100644
--- a/hip/test/base/kernel_launch.hip.cpp
+++ b/hip/test/base/kernel_launch.hip.cpp
@@ -54,6 +54,7 @@ namespace {

 using gko::dim;
+using gko::int64;
 using gko::size_type;
 using std::is_same;

@@ -103,7 +104,7 @@ void run1d(std::shared_ptr<gko::HipExecutor> exec, size_type dim, int* data)
     gko::kernels::hip::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i] = i;
         },
@@ -123,7 +124,7 @@ void run1d(std::shared_ptr<gko::HipExecutor> exec, gko::Array<int>& data)
     gko::kernels::hip::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -148,7 +149,7 @@ void run1d(std::shared_ptr<gko::HipExecutor> exec, gko::matrix::Dense<>* m)
     gko::kernels::hip::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double*, decltype(d)>::value, "type");
             static_assert(is_same<const double*, decltype(d2)>::value, "type");
@@ -184,8 +185,8 @@ void run2d(std::shared_ptr<gko::HipExecutor> exec, int* data)
     gko::kernels::hip::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i + 4 * j] = 4 * i + j;
         },
@@ -205,8 +206,8 @@ void run2d(std::shared_ptr<gko::HipExecutor> exec, gko::Array<int>& data)
     gko::kernels::hip::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -233,7 +234,7 @@ void run2d(std::shared_ptr<gko::HipExecutor> exec, gko::matrix::Dense<>* m1,
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3,
                       auto d4, auto d2_ptr, auto d3_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double&, decltype(d(0, 0))>::value, "type");
             static_assert(is_same<double&, decltype(d2(0, 0))>::value, "type");

diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp
index 7df6ff4c313..155ba8fd88b 100644
--- a/omp/base/kernel_launch.hpp
+++ b/omp/base/kernel_launch.hpp
@@ -46,48 +46,49 @@ void run_kernel(std::shared_ptr<const OmpExecutor> exec, KernelFunction fn,
                 size_type size, KernelArgs&&... args)
 {
 #pragma omp parallel for
-    for (size_type i = 0; i < size; i++) {
+    for (int64 i = 0; i < static_cast<int64>(size); i++) {
         [&]() { fn(i, map_to_device(args)...); }();
     }
 }

-template <size_type cols, typename KernelFunction,
-          typename... MappedKernelArgs>
+
+template <int64 cols, typename KernelFunction, typename... MappedKernelArgs>
 void run_kernel_fixed_cols_impl(std::shared_ptr<const OmpExecutor> exec,
                                 KernelFunction fn, dim<2> size,
                                 MappedKernelArgs... args)
 {
-    const auto rows = size[0];
+    const auto rows = static_cast<int64>(size[0]);
 #pragma omp parallel for
-    for (size_type row = 0; row < rows; row++) {
+    for (int64 row = 0; row < rows; row++) {
 #pragma unroll
-        for (size_type col = 0; col < cols; col++) {
+        for (int64 col = 0; col < cols; col++) {
             [&]() { fn(row, col, args...); }();
         }
     }
 }

-template <size_type remainder_cols, size_type block_size,
-          typename KernelFunction, typename... MappedKernelArgs>
+template <int64 remainder_cols, int64 block_size, typename KernelFunction,
+          typename... MappedKernelArgs>
 void run_kernel_blocked_cols_impl(std::shared_ptr<const OmpExecutor> exec,
                                   KernelFunction fn, dim<2> size,
                                   MappedKernelArgs... args)
 {
     static_assert(remainder_cols < block_size, "remainder too large");
-    const auto rows = size[0];
-    const auto cols = size[1];
+    const auto rows = static_cast<int64>(size[0]);
+    const auto cols = static_cast<int64>(size[1]);
     const auto rounded_cols = cols / block_size * block_size;
     GKO_ASSERT(rounded_cols + remainder_cols == cols);
 #pragma omp parallel for
-    for (size_type row = 0; row < rows; row++) {
-        for (size_type base_col = 0; base_col < rounded_cols;
+    for (int64 row = 0; row < rows; row++) {
+        for (int64 base_col = 0; base_col < rounded_cols;
              base_col += block_size) {
 #pragma unroll
-            for (size_type i = 0; i < block_size; i++) {
+            for (int64 i = 0; i < block_size; i++) {
                 [&]() { fn(row, base_col + i, args...); }();
             }
         }
 #pragma unroll
-        for (size_type i = 0; i < remainder_cols; i++) {
+        for (int64 i = 0; i < remainder_cols; i++) {
             [&]() { fn(row, rounded_cols + i, args...); }();
         }
     }
@@ -99,7 +100,7 @@ void run_kernel_impl(std::shared_ptr<const OmpExecutor> exec, KernelFunction fn,
 {
     const auto rows = size[0];
     const auto cols = size[1];
-    constexpr size_type block_size = 4;
+    constexpr int64 block_size = 4;
     if (cols <= 0) {
         return;
     }

diff --git a/omp/base/kernel_launch_solver.hpp b/omp/base/kernel_launch_solver.hpp
index dd85ba21915..b5c936c847b 100644
--- a/omp/base/kernel_launch_solver.hpp
+++ b/omp/base/kernel_launch_solver.hpp
@@ -43,7 +43,7 @@ namespace omp {
 template <typename T>
 typename device_unpack_solver_impl<typename to_device_type_impl<T>::type>::type
-map_to_device_solver(T&& param, size_type default_stride)
+map_to_device_solver(T&& param, int64 default_stride)
 {
     return device_unpack_solver_impl<typename to_device_type_impl<T>::type>::
         unpack(to_device_type_impl<T>::map_to_device(param), default_stride);
@@ -55,8 +55,9 @@ void run_kernel_solver(std::shared_ptr<const OmpExecutor> exec,
                        KernelFunction fn, dim<2> size, size_type default_stride,
                        KernelArgs&&... args)
 {
-    run_kernel_impl(exec, fn, size,
-                    map_to_device_solver(args, default_stride)...);
+    run_kernel_impl(
+        exec, fn, size,
+        map_to_device_solver(args, static_cast<int64>(default_stride))...);
 }

diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp
index 2c4712cfa52..dfdf85c3e0e 100644
--- a/omp/test/base/kernel_launch.cpp
+++ b/omp/test/base/kernel_launch.cpp
@@ -54,6 +54,7 @@ namespace {

 using gko::dim;
+using gko::int64;
 using gko::size_type;
 using std::is_same;

@@ -96,7 +97,7 @@ TEST_F(KernelLaunch, Runs1D)
     gko::kernels::omp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i] = i;
         },
@@ -111,7 +112,7 @@ TEST_F(KernelLaunch, Runs1DArray)
     gko::kernels::omp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -131,7 +132,7 @@ TEST_F(KernelLaunch, Runs1DDense)
     gko::kernels::omp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double*, decltype(d)>::value, "type");
             static_assert(is_same<const double*, decltype(d2)>::value, "type");
@@ -163,8 +164,8 @@ TEST_F(KernelLaunch, Runs2D)
     gko::kernels::omp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i + 4 * j] = 4 * i + j;
         },
@@ -179,8 +180,8 @@ TEST_F(KernelLaunch, Runs2DArray)
     gko::kernels::omp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -201,7 +202,7 @@ TEST_F(KernelLaunch, Runs2DDense)
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3,
                       auto d4, auto d2_ptr, auto d3_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double&, decltype(d(0, 0))>::value, "type");
             static_assert(is_same<double&, decltype(d2(0, 0))>::value, "type");

From 7124077c66ba0146930c2bd2389517abcc48876e Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Mon, 12 Jul 2021 16:28:25 +0200
Subject: [PATCH 02/25] use synthesizer for omp simple kernels

---
 omp/base/kernel_launch.hpp | 117 +++++++++++++++++--------------------
 1 file changed, 53 insertions(+), 64 deletions(-)

diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp
index 155ba8fd88b..79c65ef868e 100644
--- a/omp/base/kernel_launch.hpp
+++ b/omp/base/kernel_launch.hpp
@@ -36,6 +36,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif

+#include "core/synthesizer/implementation_selection.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace omp {
@@ -52,96 +55,82 @@ void run_kernel(std::shared_ptr<const OmpExecutor> exec, KernelFunction fn,
 }

-template <int64 cols, typename KernelFunction, typename... MappedKernelArgs>
-void run_kernel_fixed_cols_impl(std::shared_ptr<const OmpExecutor> exec,
-                                KernelFunction fn, dim<2> size,
-                                MappedKernelArgs... args)
-{
-    const auto rows = static_cast<int64>(size[0]);
-#pragma omp parallel for
-    for (int64 row = 0; row < rows; row++) {
-#pragma unroll
-        for (int64 col = 0; col < cols; col++) {
-            [&]() { fn(row, col, args...); }();
-        }
-    }
-}
+namespace {

-template <int64 remainder_cols, int64 block_size, typename KernelFunction,
-          typename... MappedKernelArgs>
-void run_kernel_blocked_cols_impl(std::shared_ptr<const OmpExecutor> exec,
-                                  KernelFunction fn, dim<2> size,
-                                  MappedKernelArgs... args)
+
+template <int block_size, int remainder_cols, typename KernelFunction,
+          typename... MappedKernelArgs>
+void run_kernel_sized_impl(syn::value_list<int, remainder_cols>,
+                           std::shared_ptr<const OmpExecutor> exec,
+                           KernelFunction fn, dim<2> size,
+                           MappedKernelArgs... args)
 {
-    static_assert(remainder_cols < block_size, "remainder too large");
     const auto rows = static_cast<int64>(size[0]);
     const auto cols = static_cast<int64>(size[1]);
+    static_assert(remainder_cols < block_size, "remainder too large");
     const auto rounded_cols = cols / block_size * block_size;
     GKO_ASSERT(rounded_cols + remainder_cols == cols);
+    if (rounded_cols == 0) {
 #pragma omp parallel for
-    for (int64 row = 0; row < rows; row++) {
+        for (int64 row = 0; row < rows; row++) {
 #pragma unroll
-        for (int64 base_col = 0; base_col < rounded_cols;
-             base_col += block_size) {
+            for (int64 col = 0; col < remainder_cols; col++) {
+                [&]() { fn(row, col, args...); }();
+            }
+        }
+    } else if (cols == block_size) {
+#pragma omp parallel for
+        for (int64 row = 0; row < rows; row++) {
 #pragma unroll
-            for (int64 i = 0; i < block_size; i++) {
-                [&]() { fn(row, base_col + i, args...); }();
+            for (int64 col = 0; col < block_size; col++) {
+                [&]() { fn(row, col, args...); }();
+            }
+        }
+    } else {
+#pragma omp parallel for
+        for (int64 row = 0; row < rows; row++) {
+            for (int64 base_col = 0; base_col < rounded_cols;
+                 base_col += block_size) {
+#pragma unroll
+                for (int64 i = 0; i < block_size; i++) {
+                    [&]() { fn(row, base_col + i, args...); }();
+                }
             }
-        }
 #pragma unroll
-        for (int64 i = 0; i < remainder_cols; i++) {
-            [&]() { fn(row, rounded_cols + i, args...); }();
+            for (int64 i = 0; i < remainder_cols; i++) {
+                [&]() { fn(row, rounded_cols + i, args...); }();
+            }
         }
     }
 }

+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_sized,
+                                    run_kernel_sized_impl)
+
+
 template <typename KernelFunction, typename... MappedKernelArgs>
 void run_kernel_impl(std::shared_ptr<const OmpExecutor> exec, KernelFunction fn,
                      dim<2> size, MappedKernelArgs... args)
 {
-    const auto rows = size[0];
-    const auto cols = size[1];
-    constexpr int64 block_size = 4;
+    const auto cols = static_cast<int64>(size[1]);
+    constexpr int block_size = 8;
+    using remainders = syn::as_list<syn::range<0, block_size>>;
+
     if (cols <= 0) {
         return;
     }
-    if (cols == 1) {
-        run_kernel_fixed_cols_impl<1>(exec, fn, size, args...);
-        return;
-    }
-    if (cols == 2) {
-        run_kernel_fixed_cols_impl<2>(exec, fn, size, args...);
-        return;
-    }
-    if (cols == 3) {
-        run_kernel_fixed_cols_impl<3>(exec, fn, size, args...);
-        return;
-    }
-    if (cols == 4) {
-        run_kernel_fixed_cols_impl<4>(exec, fn, size, args...);
-        return;
-    }
-    const auto rem_cols = cols % block_size;
-    if (rem_cols == 0) {
-        run_kernel_blocked_cols_impl<0, block_size>(exec, fn, size, args...);
-        return;
-    }
-    if (rem_cols == 1) {
-        run_kernel_blocked_cols_impl<1, block_size>(exec, fn, size, args...);
-        return;
-    }
-    if (rem_cols == 2) {
-        run_kernel_blocked_cols_impl<2, block_size>(exec, fn, size, args...);
-        return;
-    }
-    if (rem_cols == 3) {
-        run_kernel_blocked_cols_impl<3, block_size>(exec, fn, size, args...);
-        return;
-    }
-    // should be unreachable
-    GKO_ASSERT(false);
+    select_run_kernel_sized(
+        remainders(),
+        [&](int remainder) { return remainder == cols % block_size; },
+        syn::value_list<int, block_size>(), syn::type_list<>(), exec, fn, size,
+        args...);
 }

+
+}  // namespace
+
+
 template <typename KernelFunction, typename... KernelArgs>
 void run_kernel(std::shared_ptr<const OmpExecutor> exec, KernelFunction fn,
                 dim<2> size, KernelArgs&&... args)

From 789a46c6f33adf70065e046e356e5d6875128a91 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Tue, 13 Jul 2021 18:47:01 +0200
Subject: [PATCH 03/25] add simple reduction kernels

---
 common/cuda_hip/components/reduction.hpp.inc |   7 +-
 common/unified/base/kernel_launch.hpp        |   2 -
 .../unified/base/kernel_launch_reduction.hpp |  51 +++++
 cuda/base/kernel_launch_reduction.cuh        | 193 +++++++++++++++++
 cuda/components/reduction.cuh                |  13 +-
 cuda/test/base/kernel_launch.cu              |  59 ++++++
 hip/base/kernel_launch_reduction.hip.hpp     | 197 ++++++++++++++++++
 hip/components/reduction.hip.hpp             |  21 +-
 hip/test/base/kernel_launch.hip.cpp          |  59 ++++++
 omp/base/kernel_launch.hpp                   |  17 +-
 omp/base/kernel_launch_reduction.hpp         | 182 ++++++++++++++++
 omp/test/base/kernel_launch.cpp              |  79 +++++++
 12 files changed, 849 insertions(+), 31 deletions(-)
 create mode 100644 common/unified/base/kernel_launch_reduction.hpp
 create mode 100644 cuda/base/kernel_launch_reduction.cuh
 create mode 100644 hip/base/kernel_launch_reduction.hip.hpp
 create mode 100644 omp/base/kernel_launch_reduction.hpp

diff --git a/common/cuda_hip/components/reduction.hpp.inc b/common/cuda_hip/components/reduction.hpp.inc
index 9b4ed4cc8c7..3853fca6d44 100644
--- a/common/cuda_hip/components/reduction.hpp.inc
+++ b/common/cuda_hip/components/reduction.hpp.inc
@@ -208,14 +208,15 @@ __device__ void reduce_array(size_type size,
  *
  * Computes a reduction using the add operation (+) on an array
  * `source` of any size. Has to be called a second time on `result` to reduce
- * an array larger than `default_block_size`.
+ * an array larger than `default_reduce_block_size`.
  */
 template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void reduce_add_array(
+__global__ __launch_bounds__(default_reduce_block_size) void reduce_add_array(
     size_type size, const ValueType* __restrict__ source,
     ValueType* __restrict__ result)
 {
-    __shared__ UninitializedArray<ValueType, default_block_size> block_sum;
+    __shared__ UninitializedArray<ValueType, default_reduce_block_size>
+        block_sum;
     reduce_array(size, source, static_cast<ValueType*>(block_sum),
                  [](const ValueType& x, const ValueType& y) { return x + y; });

diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index bf403d3a656..0e25671c58a 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -268,8 +268,6 @@ typename to_device_type_impl<T>::type map_to_device(T&& param)
 }  // namespace gko

-// these files include this file again to make inclusion work from both sides,
-// this does not lead to issues due to the header guards.
 #if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/kernel_launch.cuh"
 #elif defined(GKO_COMPILING_HIP)

diff --git a/common/unified/base/kernel_launch_reduction.hpp b/common/unified/base/kernel_launch_reduction.hpp
new file mode 100644
index 00000000000..78de06466aa
--- /dev/null
+++ b/common/unified/base/kernel_launch_reduction.hpp
@@ -0,0 +1,51 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
+#define GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
+
+
+#include "common/unified/base/kernel_launch.hpp"
+
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/kernel_launch_reduction.cuh"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/kernel_launch_reduction.hip.hpp"
+#elif defined(GKO_COMPILING_DPCPP)
+#include "dpcpp/base/kernel_launch_reduction.dp.hpp"
+#elif defined(GKO_COMPILING_OMP)
+#include "omp/base/kernel_launch_reduction.hpp"
+#endif
+
+
+#endif  // GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_

diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh
new file mode 100644
index 00000000000..3a661366b53
--- /dev/null
+++ b/cuda/base/kernel_launch_reduction.cuh
@@ -0,0 +1,193 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
+#error \
+    "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp"
+#endif
+
+
+#include "cuda/base/device_guard.hpp"
+#include "cuda/base/types.hpp"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+template <typename ValueType, typename KernelFunction, typename ReductionOp,
+          typename FinalizeOp, typename... KernelArgs>
+__global__ __launch_bounds__(
+    default_block_size) void generic_kernel_reduction_1d(int64 size,
+                                                         KernelFunction fn,
+                                                         ReductionOp op,
+                                                         FinalizeOp finalize,
+                                                         ValueType init,
+                                                         ValueType* storage,
+                                                         KernelArgs... args)
+{
+    __shared__
+        UninitializedArray<ValueType, default_block_size / config::warp_size>
+            warp_partial;
+    static_assert(default_block_size / config::warp_size <= config::warp_size,
+                  "needs third reduction level");
+    auto tidx = thread::get_thread_id_flat<int64>();
+    auto grid_size = thread::get_thread_num_flat<int64>();
+    auto warp =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    auto partial = init;
+    for (int64 i = tidx; i < size; i += grid_size) {
+        partial = op(partial, fn(i, args...));
+    }
+    partial = reduce(warp, partial, op);
+    if (warp.thread_rank() == 0) {
+        warp_partial[threadIdx.x / config::warp_size] = partial;
+    }
+    __syncthreads();
+    if (threadIdx.x < config::warp_size) {
+        storage[blockIdx.x] =
+            finalize(reduce(warp, warp_partial[threadIdx.x], op));
+    }
+}
+
+
+template <typename ValueType, typename KernelFunction, typename ReductionOp,
+          typename FinalizeOp, typename... KernelArgs>
+__global__ __launch_bounds__(
+    default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols,
+                                                         KernelFunction fn,
+                                                         ReductionOp op,
+                                                         FinalizeOp finalize,
+                                                         ValueType init,
+                                                         ValueType* storage,
+                                                         KernelArgs... args)
+{
+    __shared__
+        UninitializedArray<ValueType, default_block_size / config::warp_size>
+            warp_partial;
+    static_assert(default_block_size / config::warp_size <= config::warp_size,
+                  "needs third reduction level");
+    auto tidx = thread::get_thread_id_flat<int64>();
+    auto grid_size = thread::get_thread_num_flat<int64>();
+    auto warp =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    auto partial = init;
+    for (int64 i = tidx; i < rows * cols; i += grid_size) {
+        const auto row = i / cols;
+        const auto col = i % cols;
+        partial = op(partial, fn(row, col, args...));
+    }
+    partial = reduce(warp, partial, op);
+    if (warp.thread_rank() == 0) {
+        warp_partial[threadIdx.x / config::warp_size] = partial;
+    }
+    __syncthreads();
+    if (threadIdx.x < config::warp_size) {
+        storage[blockIdx.x] =
+            finalize(reduce(warp, warp_partial[threadIdx.x], op));
+    }
+}
+
+
+template <typename ValueType, typename KernelFunction, typename ReductionOp,
+          typename FinalizeOp, typename... KernelArgs>
+void run_kernel_reduction(std::shared_ptr<const CudaExecutor> exec,
+                          KernelFunction fn, ReductionOp op,
+                          FinalizeOp finalize, ValueType init,
+                          ValueType* result, size_type size,
+                          KernelArgs&&... args)
+{
+    constexpr int oversubscription = 4;
+    gko::cuda::device_guard guard{exec->get_device_id()};
+    constexpr auto block_size = default_block_size;
+    const auto num_blocks = std::min<int64>(
+        ceildiv(size, block_size), exec->get_num_warps() * oversubscription);
+    if (num_blocks > 1) {
+        Array<ValueType> partial{exec, static_cast<size_type>(num_blocks)};
+        generic_kernel_reduction_1d<<<num_blocks, block_size>>>(
+            static_cast<int64>(size), fn, op,
+            [] __device__(auto v) { return v; }, as_cuda_type(init),
+            as_cuda_type(partial.get_data()), map_to_device(args)...);
+        generic_kernel_reduction_1d<<<1, block_size>>>(
+            static_cast<int64>(num_blocks),
+            [] __device__(auto i, auto v) { return v[i]; }, op, finalize,
+            as_cuda_type(init), as_cuda_type(result),
+            as_cuda_type(partial.get_const_data()));
+    } else {
+        generic_kernel_reduction_1d<<<1, block_size>>>(
+            static_cast<int64>(size), fn, op, finalize, as_cuda_type(init),
+            as_cuda_type(result), map_to_device(args)...);
+    }
+}
+
+
+template <typename ValueType, typename KernelFunction, typename ReductionOp,
+          typename FinalizeOp, typename... KernelArgs>
+void run_kernel_reduction(std::shared_ptr<const CudaExecutor> exec,
+                          KernelFunction fn, ReductionOp op,
+                          FinalizeOp finalize, ValueType init,
+                          ValueType* result, dim<2> size, KernelArgs&&... args)
+{
+    constexpr int oversubscription = 4;
+    gko::cuda::device_guard guard{exec->get_device_id()};
+    constexpr auto block_size = default_block_size;
+    const auto rows = static_cast<int64>(size[0]);
+    const auto cols = static_cast<int64>(size[1]);
+    const auto num_blocks =
+        std::min<int64>(ceildiv(rows * cols, block_size),
+                        exec->get_num_warps() * oversubscription);
+    if (num_blocks > 1) {
+        Array<ValueType> partial{exec, static_cast<size_type>(num_blocks)};
+        generic_kernel_reduction_2d<<<num_blocks, block_size>>>(
+            rows, cols, fn, op, [] __device__(auto v) { return v; },
+            as_cuda_type(init), as_cuda_type(partial.get_data()),
+            map_to_device(args)...);
+        generic_kernel_reduction_1d<<<1, block_size>>>(
+            static_cast<int64>(num_blocks),
+            [] __device__(auto i, auto v) { return v[i]; }, op, finalize,
+            as_cuda_type(init), as_cuda_type(result),
+            as_cuda_type(partial.get_const_data()));
+    } else {
+        generic_kernel_reduction_2d<<<1, block_size>>>(
+            rows, cols, fn, op, finalize, as_cuda_type(init),
+            as_cuda_type(result), map_to_device(args)...);
+    }
+}
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko

diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh
index 95ac3d8a417..8e0a962145e 100644
--- a/cuda/components/reduction.cuh
+++ b/cuda/components/reduction.cuh
@@ -53,7 +53,7 @@ namespace kernels {
 namespace cuda {

-constexpr int default_block_size = 512;
+constexpr int default_reduce_block_size = 512;

 #include "common/cuda_hip/components/reduction.hpp.inc"
@@ -75,13 +75,14 @@ __host__ ValueType reduce_add_array(std::shared_ptr<const CudaExecutor> exec,
     auto block_results_val = source;
     size_type grid_dim = size;
     auto block_results = Array<ValueType>(exec);
-    if (size > default_block_size) {
-        const auto n = ceildiv(size, default_block_size);
-        grid_dim = (n <= default_block_size) ? n : default_block_size;
+    if (size > default_reduce_block_size) {
+        const auto n = ceildiv(size, default_reduce_block_size);
+        grid_dim =
+            (n <= default_reduce_block_size) ? n : default_reduce_block_size;

         block_results.resize_and_reset(grid_dim);

-        reduce_add_array<<<grid_dim, default_block_size>>>(
+        reduce_add_array<<<grid_dim, default_reduce_block_size>>>(
             size, as_cuda_type(source),
             as_cuda_type(block_results.get_data()));

         block_results_val = block_results.get_const_data();
@@ -89,7 +90,7 @@ __host__ ValueType reduce_add_array(std::shared_ptr<const CudaExecutor> exec,

     auto d_result = Array<ValueType>(exec, 1);

-    reduce_add_array<<<1, default_block_size>>>(
+    reduce_add_array<<<1, default_reduce_block_size>>>(
         grid_dim, as_cuda_type(block_results_val),
         as_cuda_type(d_result.get_data()));
     auto answer = exec->copy_val_to_host(d_result.get_const_data());

diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu
index adf443445a5..8e78e3ee830 100644
--- a/cuda/test/base/kernel_launch.cu
+++ b/cuda/test/base/kernel_launch.cu
@@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -276,4 +277,62 @@ TEST_F(KernelLaunch, Runs2DDense) } +void run1d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i) { + static_assert(is_same::value, "index"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{100000}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); + + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i) { + static_assert(is_same::value, "index"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{100}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); +} + +TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } + + +void run2d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), + gko::dim<2>{1000, 100}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); + + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), + gko::dim<2>{10, 10}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); +} + +TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } + + } // namespace diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp new file mode 100644 index 00000000000..1075acc8198 --- /dev/null +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -0,0 +1,197 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" +#endif + + +#include "hip/base/device_guard.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +template +__global__ __launch_bounds__( + default_block_size) void generic_kernel_reduction_1d(int64 size, + KernelFunction fn, + ReductionOp op, + FinalizeOp finalize, + ValueType init, + ValueType* storage, + KernelArgs... args) +{ + __shared__ + UninitializedArray + warp_partial; + static_assert(default_block_size / config::warp_size <= config::warp_size, + "needs third reduction level"); + auto tidx = thread::get_thread_id_flat(); + auto grid_size = thread::get_thread_num_flat(); + auto warp = + group::tiled_partition(group::this_thread_block()); + auto partial = init; + for (int64 i = tidx; i < size; i += grid_size) { + partial = op(partial, fn(i, args...)); + } + partial = reduce(warp, partial, op); + if (warp.thread_rank() == 0) { + warp_partial[threadIdx.x / config::warp_size] = partial; + } + __syncthreads(); + if (threadIdx.x < config::warp_size) { + storage[blockIdx.x] = + finalize(reduce(warp, warp_partial[threadIdx.x], op)); + } +} + + +template +__global__ __launch_bounds__( + default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols, + KernelFunction fn, + ReductionOp op, + FinalizeOp finalize, + ValueType init, + ValueType* storage, + KernelArgs... args) +{ + __shared__ + UninitializedArray + warp_partial; + static_assert(default_block_size / config::warp_size <= config::warp_size, + "needs third reduction level"); + auto tidx = thread::get_thread_id_flat(); + auto grid_size = thread::get_thread_num_flat(); + auto warp = + group::tiled_partition(group::this_thread_block()); + auto partial = init; + for (int64 i = tidx; i < rows * cols; i += grid_size) { + const auto row = i / cols; + const auto col = i % cols; + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(warp, partial, op); + if (warp.thread_rank() == 0) { + warp_partial[threadIdx.x / config::warp_size] = partial; + } + __syncthreads(); + if (threadIdx.x < config::warp_size) { + storage[blockIdx.x] = + finalize(reduce(warp, warp_partial[threadIdx.x], op)); + } +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type size, + KernelArgs&&... 
args) +{ + constexpr int oversubscription = 4; + gko::hip::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto num_blocks = std::min( + ceildiv(size, block_size), exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + hipLaunchKernelGGL( + generic_kernel_reduction_1d, num_blocks, block_size, 0, 0, + static_cast(size), fn, op, + [] __device__(auto v) { return v; }, as_hip_type(init), + as_hip_type(partial.get_data()), map_to_device(args)...); + hipLaunchKernelGGL( + generic_kernel_reduction_1d, 1, block_size, 0, 0, + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_hip_type(init), as_hip_type(result), + as_hip_type(partial.get_const_data())); + } else { + hipLaunchKernelGGL(generic_kernel_reduction_1d, 1, block_size, 0, 0, + static_cast(size), fn, op, finalize, + as_hip_type(init), as_hip_type(result), + map_to_device(args)...); + } +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, KernelArgs&&... args) +{ + constexpr int oversubscription = 4; + gko::hip::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = + std::min(ceildiv(rows * cols, block_size), + exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + generic_kernel_reduction_2d<<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_hip_type(init), as_hip_type(partial.get_data()), + map_to_device(args)...); + hipLaunchKernelGGL( + generic_kernel_reduction_1d, 1, block_size, 0, 0, + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_hip_type(init), as_hip_type(result), + as_hip_type(partial.get_const_data())); + } else { + hipLaunchKernelGGL(generic_kernel_reduction_2d, 1, block_size, 0, 0, + rows, cols, fn, op, finalize, as_hip_type(init), + as_hip_type(result), map_to_device(args)...); + } +} + + +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp index 87d6b518123..7850b9f65a1 100644 --- a/hip/components/reduction.hip.hpp +++ b/hip/components/reduction.hip.hpp @@ -55,7 +55,7 @@ namespace kernels { namespace hip { -constexpr int default_block_size = 512; +constexpr int default_reduce_block_size = 512; #include "common/cuda_hip/components/reduction.hpp.inc" @@ -77,23 +77,26 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, auto block_results_val = source; size_type grid_dim = size; auto block_results = Array(exec); - if (size > default_block_size) { - const auto n = ceildiv(size, default_block_size); - grid_dim = (n <= default_block_size) ? n : default_block_size; + if (size > default_reduce_block_size) { + const auto n = ceildiv(size, default_reduce_block_size); + grid_dim = + (n <= default_reduce_block_size) ? 
n : default_reduce_block_size; block_results.resize_and_reset(grid_dim); - hipLaunchKernelGGL( - reduce_add_array, dim3(grid_dim), dim3(default_block_size), 0, 0, - size, as_hip_type(source), as_hip_type(block_results.get_data())); + hipLaunchKernelGGL(reduce_add_array, dim3(grid_dim), + dim3(default_reduce_block_size), 0, 0, size, + as_hip_type(source), + as_hip_type(block_results.get_data())); block_results_val = block_results.get_const_data(); } auto d_result = Array(exec, 1); - hipLaunchKernelGGL(reduce_add_array, dim3(1), dim3(default_block_size), 0, - 0, grid_dim, as_hip_type(block_results_val), + hipLaunchKernelGGL(reduce_add_array, dim3(1), + dim3(default_reduce_block_size), 0, 0, grid_dim, + as_hip_type(block_results_val), as_hip_type(d_result.get_data())); auto answer = exec->copy_val_to_host(d_result.get_const_data()); return answer; diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp index ad3ba3cc643..4fb5ef0a4dc 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -275,4 +276,62 @@ TEST_F(KernelLaunch, Runs2DDense) } +void run1d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + gko::kernels::hip::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i) { + static_assert(is_same::value, "index"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{100000}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); + + gko::kernels::hip::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i) { + static_assert(is_same::value, "index"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{100}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); +} + +TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } + + +void run2d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + gko::kernels::hip::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), + gko::dim<2>{1000, 100}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); + + gko::kernels::hip::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), + gko::dim<2>{10, 10}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); +} + +TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } + + } // namespace diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp index 79c65ef868e..98432c2d155 100644 --- a/omp/base/kernel_launch.hpp +++ b/omp/base/kernel_launch.hpp @@ -70,23 +70,19 @@ void run_kernel_sized_impl(syn::value_list, static_assert(remainder_cols < block_size, "remainder too large"); const auto rounded_cols = cols / block_size * block_size; GKO_ASSERT(rounded_cols + 
remainder_cols == cols); - if (rounded_cols == 0) { + if (rounded_cols == 0 || cols == block_size) { + // we group all sizes <= block_size here and unroll explicitly + constexpr auto local_cols = + remainder_cols == 0 ? block_size : remainder_cols; #pragma omp parallel for for (int64 row = 0; row < rows; row++) { #pragma unroll - for (int64 col = 0; col < remainder_cols; col++) { - [&]() { fn(row, col, args...); }(); - } - } - } else if (cols == block_size) { -#pragma omp parallel for - for (int64 row = 0; row < rows; row++) { -#pragma unroll - for (int64 col = 0; col < block_size; col++) { + for (int64 col = 0; col < local_cols; col++) { [&]() { fn(row, col, args...); }(); } } } else { + // we operate in block_size blocks plus an explicitly unrolled remainder #pragma omp parallel for for (int64 row = 0; row < rows; row++) { for (int64 base_col = 0; base_col < rounded_cols; @@ -104,7 +100,6 @@ void run_kernel_sized_impl(syn::value_list, } } - GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_sized, run_kernel_sized_impl) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp new file mode 100644 index 00000000000..4981d4ed902 --- /dev/null +++ b/omp/base/kernel_launch_reduction.hpp @@ -0,0 +1,182 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" +#endif + + +#include + + +#include + + +namespace gko { +namespace kernels { +namespace omp { + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type size, + KernelArgs&&... 
args) +{ + const auto num_threads = static_cast(omp_get_max_threads()); + const auto ssize = static_cast(size); + const auto work_per_thread = ceildiv(ssize, num_threads); + Array partial{exec, static_cast(num_threads)}; +#pragma omp parallel num_threads(num_threads) + { + const auto thread_id = omp_get_thread_num(); + const auto begin = thread_id * work_per_thread; + const auto end = std::min(ssize, begin + work_per_thread); + + auto local_partial = init; + for (auto i = begin; i < end; i++) { + local_partial = op(local_partial, [&]() { + return fn(i, map_to_device(args)...); + }()); + } + partial.get_data()[thread_id] = local_partial; + } + *result = finalize(std::accumulate(partial.get_const_data(), + partial.get_const_data() + num_threads, + init, op)); +} + + +namespace { + + +template +void run_kernel_reduction_sized_impl(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + MappedKernelArgs... args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_threads = static_cast(omp_get_max_threads()); + const auto work_per_thread = ceildiv(rows, num_threads); + Array partial{exec, static_cast(num_threads)}; + static_assert(remainder_cols < block_size, "remainder too large"); + const auto rounded_cols = cols / block_size * block_size; + GKO_ASSERT(rounded_cols + remainder_cols == cols); +#pragma omp parallel + { + const auto thread_id = omp_get_thread_num(); + const auto begin = thread_id * work_per_thread; + const auto end = std::min(rows, begin + work_per_thread); + + auto local_partial = init; + if (rounded_cols == 0 || cols == block_size) { + // we group all sizes <= block_size here and unroll explicitly + constexpr auto local_cols = + remainder_cols == 0 ? block_size : remainder_cols; + for (auto row = begin; row < end; row++) { +#pragma unroll + for (int64 col = 0; col < local_cols; col++) { + local_partial = op(local_partial, [&]() { + return fn(row, col, args...); + }()); + } + } + } else { + // we operate in block_size blocks plus an explicitly unrolled + // remainder + for (auto row = begin; row < end; row++) { + for (int64 base_col = 0; base_col < rounded_cols; + base_col += block_size) { +#pragma unroll + for (int64 i = 0; i < block_size; i++) { + local_partial = op(local_partial, [&]() { + return fn(row, base_col + i, args...); + }()); + } + } +#pragma unroll + for (int64 i = 0; i < remainder_cols; i++) { + local_partial = op(local_partial, [&]() { + return fn(row, rounded_cols + i, args...); + }()); + } + } + } + partial.get_data()[thread_id] = local_partial; + } + *result = finalize(std::accumulate(partial.get_const_data(), + partial.get_const_data() + num_threads, + init, op)); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, + run_kernel_reduction_sized_impl) + + +} // namespace + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, KernelArgs&&... 
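+    // editorial note (not in the original patch): the 2D variant dispatches
+    // on cols % block_size so the innermost column loop of the sized kernel
+    // has a compile-time trip count and can be fully unrolled. Conceptually
+    // the selection below behaves like a switch over precompiled
+    // instantiations, roughly:
+    //   switch (cols % block_size) {
+    //   case 0: run_kernel_reduction_sized_impl<8, 0>(...); break;
+    //   case 1: run_kernel_reduction_sized_impl<8, 1>(...); break;
+    //   /* ... through case 7 */
+    //   }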
args)
+{
+    const auto cols = static_cast<int64>(size[1]);
+    constexpr int block_size = 8;
+    using remainders = syn::as_list<syn::range<0, block_size>>;
+
+    if (cols <= 0) {
+        *result = finalize(init);
+        return;
+    }
+    select_run_kernel_reduction_sized(
+        remainders(),
+        [&](int remainder) { return remainder == cols % block_size; },
+        syn::value_list<int, block_size>(), syn::type_list<>(), exec, fn, op,
+        finalize, init, result, size, args...);
+}
+
+
+} // namespace omp
+} // namespace kernels
+} // namespace gko
diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp
index dfdf85c3e0e..6f13797b85f 100644
--- a/omp/test/base/kernel_launch.cpp
+++ b/omp/test/base/kernel_launch.cpp
@@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include
 
 
+#include "common/unified/base/kernel_launch_reduction.hpp"
 #include "common/unified/base/kernel_launch_solver.hpp"
 #include "core/test/utils.hpp"
 
@@ -239,5 +240,83 @@ TEST_F(KernelLaunch, Runs2DDense)
     GKO_ASSERT_MTX_NEAR(zero_dense2, iota_dense, 0.0);
 }
 
+TEST_F(KernelLaunch, Reduction1D)
+{
+    gko::Array<gko::int64> output{exec, 1};
+    gko::kernels::omp::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            return i + 1;
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
+        size_type{100000});
+    ASSERT_EQ(*output.get_const_data(), 10000100000ll);
+
+    gko::kernels::omp::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            return i + 1;
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
+        size_type{10});
+    ASSERT_EQ(*output.get_const_data(), 110ll);
+}
+
+
+TEST_F(KernelLaunch, Reduction2DSmallRows)
+{
+    gko::Array<gko::int64> output{exec, 1};
+    for (int cols = 0; cols < 17; cols++) {
+        gko::kernels::omp::run_kernel_reduction(
+            exec,
+            [] GKO_KERNEL(auto i, auto j) {
+                static_assert(is_same<int64, decltype(i)>::value, "index");
+                return (i + 1) * (j + 1);
+            },
+            [] GKO_KERNEL(auto i, auto j) { return i + j; },
+            [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+            gko::dim<2>{10, cols});
+        ASSERT_EQ(*output.get_const_data(), 110ll * cols * (cols + 1));
+    }
+}
+
+
+TEST_F(KernelLaunch, Reduction2DLargeRows)
+{
+    gko::Array<gko::int64> output{exec, 1};
+    for (int cols = 0; cols < 17; cols++) {
+        gko::kernels::omp::run_kernel_reduction(
+            exec,
+            [] GKO_KERNEL(auto i, auto j) {
+                static_assert(is_same<int64, decltype(i)>::value, "index");
+                return (i + 1) * (j + 1);
+            },
+            [] GKO_KERNEL(auto i, auto j) { return i + j; },
+            [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+            gko::dim<2>{1000, cols});
+        ASSERT_EQ(*output.get_const_data(), 1001000ll * cols * (cols + 1));
+    }
+}
+
+
+TEST_F(KernelLaunch, Reduction2D)
+{
+    gko::Array<gko::int64> output{exec, 1};
+    gko::kernels::omp::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+        gko::dim<2>{1000, 100});
+    ASSERT_EQ(*output.get_const_data(), 10110100000ll);
+}
+
 
 } // namespace
From 12cb03c3f4e06410d37f867f09984f3e7db88e56 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Wed, 14 Jul 2021 02:30:43 +0200
Subject: [PATCH 04/25] add row and column reduction kernels

---
 cuda/base/kernel_launch_reduction.cuh | 235 +++++++++++++++++++++-
 cuda/test/base/kernel_launch.cu       |  57 +++++++
 2 files changed, 288 insertions(+), 
4 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index 3a661366b53..08849a90a4a 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/device_guard.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" @@ -78,8 +79,10 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - storage[blockIdx.x] = - finalize(reduce(warp, warp_partial[threadIdx.x], op)); + partial = reduce(warp, warp_partial[threadIdx.x], op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } } } @@ -116,8 +119,10 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - storage[blockIdx.x] = - finalize(reduce(warp, warp_partial[threadIdx.x], op)); + partial = reduce(warp, warp_partial[threadIdx.x], op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } } } @@ -188,6 +193,228 @@ void run_kernel_reduction(std::shared_ptr exec, } +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_parts, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, + int64 result_stride, KernelArgs... args) +{ + const auto idx = thread::get_subwarp_id_flat(); + const auto row = idx % rows; + const auto col_part = idx / rows; + if (col_part >= col_parts) { + return; + } + const auto cols_per_part = ceildiv(cols, col_parts); + // TODO use boundaries divisible by subwarp_size + const auto begin = cols_per_part * col_part; + const auto end = min(begin + cols_per_part, cols); + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = init; + for (auto col = begin + subwarp.thread_rank(); col < end; + col += subwarp_size) { + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(subwarp, partial, op); + result[(row * col_parts + col_part) * result_stride] = finalize(partial); +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d( + int64 rows, int64 cols, int64 row_parts, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, + KernelArgs... 
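+// editorial note (not in the original patch): each thread of this kernel
+// owns one (column, row block) pair: the flat thread id decomposes into
+// col = idx % cols and row_part = idx / cols, and the thread then walks
+// rows [begin, end) of its column serially. With row_parts == 1 the value
+// written to result[col] is final; otherwise the partials stored as
+// result[col * row_parts + row_part] are combined by the finalize kernel
+// below. For example, rows = 1000 with row_parts = 4 yields four partials
+// per column covering 250 rows each.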
args) +{ + const auto idx = thread::get_thread_id_flat(); + const auto col = idx % cols; + const auto row_part = idx / cols; + if (row_part >= row_parts) { + return; + } + const auto rows_per_part = ceildiv(rows, row_parts); + const auto begin = rows_per_part * row_part; + const auto end = min(begin + rows_per_part, rows); + auto partial = init; + for (auto row = begin; row < end; row++) { + partial = op(partial, fn(row, col, args...)); + } + result[col * row_parts + row_part] = finalize(partial); +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_parts, ReductionOp op, FinalizeOp finalize, + ValueType init, const ValueType* input, int64 result_stride, + ValueType* result) +{ + const auto idx = thread::get_subwarp_id_flat(); + if (idx >= num_results) { + return; + } + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = init; + for (int64 part = subwarp.thread_rank(); part < num_parts; + part += subwarp_size) { + partial = op(partial, input[idx * num_parts + part]); + } + partial = reduce(subwarp, partial, op); + if (subwarp.thread_rank() == 0) { + result[idx * result_stride] = finalize(partial); + } +} + + +namespace { + + +template +void run_generic_kernel_row_reduction(syn::value_list, + int64 rows, int64 cols, int64 col_parts, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, int64 result_stride, + KernelArgs... args) +{ + constexpr auto block_size = default_block_size; + const auto num_blocks = ceildiv(rows * cols * subwarp_size, block_size); + generic_kernel_row_reduction_2d<<>>( + rows, cols, col_parts, fn, op, finalize, as_cuda_type(init), + as_cuda_type(result), result_stride, args...); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, + run_generic_kernel_row_reduction) + + +template +void run_kernel_reduction_finalize(syn::value_list, + int64 num_results, int64 num_parts, + ReductionOp op, FinalizeOp finalize, + ValueType init, const ValueType* input, + int64 result_stride, ValueType* result) +{ + constexpr auto block_size = default_block_size; + const auto num_blocks = ceildiv(num_results * subwarp_size, block_size); + generic_kernel_reduction_finalize_2d + <<>>(num_results, num_parts, op, finalize, + as_cuda_type(init), as_cuda_type(input), + static_cast(result_stride), + as_cuda_type(result)); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_finalize, + run_kernel_reduction_finalize) + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... 
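+// editorial note (not in the original patch): row reductions use one subwarp
+// per row; the predicate passed to the selection below picks the smallest
+// precompiled subwarp size that covers `cols`, falling back to a full warp
+// for wide rows. `col_parts` is fixed to 1 until tuned (see the TODO), so
+// the multi-part branch with the extra finalize pass is currently unused.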
args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 4; + gko::cuda::device_guard guard{exec->get_device_id()}; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = exec->get_num_warps() * oversubscription; + const auto col_parts = 1; // TODO tune + if (col_parts > 1) { + Array partial{exec, + static_cast(col_parts * rows)}; + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, col_parts, + fn, op, [] __device__(auto i) { return i; }, init, + partial.get_data(), 1, map_to_device(args)...); + select_run_kernel_reduction_finalize( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= col_parts || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, col_parts, op, + finalize, init, partial.get_const_data(), + static_cast(result_stride), result); + } else { + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, + finalize, init, result, static_cast(result_stride), + map_to_device(args)...); + } +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + KernelArgs&&... args) +{ + constexpr int oversubscription = 4; + gko::cuda::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + const auto num_blocks = ceildiv(rows * cols, block_size); + const auto row_parts = 1; // TODO tune + if (row_parts > 1) { + Array partial{exec, + static_cast(row_parts * cols)}; + generic_kernel_col_reduction_2d<<>>( + rows, cols, row_parts, fn, op, [] __device__(auto i) { return i; }, + as_cuda_type(init), as_cuda_type(partial.get_data()), + map_to_device(args)...); + using subwarp_sizes = + syn::value_list; + select_run_kernel_reduction_finalize( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= row_parts || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), cols, row_parts, op, + finalize, as_cuda_type(init), + as_cuda_type(partial.get_const_data()), 1, as_cuda_type(result)); + } else { + generic_kernel_col_reduction_2d<<>>( + rows, cols, 1, fn, op, finalize, as_cuda_type(init), + as_cuda_type(result), map_to_device(args)...); + } +} + + } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index 8e78e3ee830..c6a4f1c679f 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -335,4 +335,61 @@ void run2d_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } +void run2d_row_reduction(std::shared_ptr exec) +{ + int num_rows = 1000; + int num_cols = 100; + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, 
host_ref}; + for (int i = 0; i < num_rows; i++) { + host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1); + } + + gko::kernels::cuda::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); +} + +TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } + + +void run2d_col_reduction(std::shared_ptr exec) +{ + int num_rows = 1000; + int num_cols = 100; + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int i = 0; i < num_cols; i++) { + host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1); + } + + gko::kernels::cuda::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); +} + +TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } + + } // namespace From 2232a44ec6a0b770625b984e335fc261b3ec7e70 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 14 Jul 2021 13:43:46 +0200 Subject: [PATCH 05/25] add HIP reduction support --- hip/base/kernel_launch_reduction.hip.hpp | 237 ++++++++++++++++++++++- hip/test/base/kernel_launch.hip.cpp | 57 ++++++ 2 files changed, 290 insertions(+), 4 deletions(-) diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 1075acc8198..fe4b697bc30 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#include "core/synthesizer/implementation_selection.hpp" #include "hip/base/device_guard.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" @@ -78,8 +79,10 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - storage[blockIdx.x] = - finalize(reduce(warp, warp_partial[threadIdx.x], op)); + partial = reduce(warp, warp_partial[threadIdx.x], op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } } } @@ -174,8 +177,9 @@ void run_kernel_reduction(std::shared_ptr exec, exec->get_num_warps() * oversubscription); if (num_blocks > 1) { Array partial{exec, static_cast(num_blocks)}; - generic_kernel_reduction_2d<<>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, + hipLaunchKernelGGL( + generic_kernel_reduction_2d, num_blocks, block_size, 0, 0, rows, + cols, fn, op, [] __device__(auto v) { return v; }, as_hip_type(init), as_hip_type(partial.get_data()), map_to_device(args)...); hipLaunchKernelGGL( @@ -192,6 +196,231 @@ void run_kernel_reduction(std::shared_ptr exec, } +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_parts, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, + int64 result_stride, KernelArgs... 
args) +{ + const auto idx = thread::get_subwarp_id_flat(); + const auto row = idx % rows; + const auto col_part = idx / rows; + if (col_part >= col_parts) { + return; + } + const auto cols_per_part = ceildiv(cols, col_parts); + // TODO use boundaries divisible by subwarp_size + const auto begin = cols_per_part * col_part; + const auto end = min(begin + cols_per_part, cols); + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = init; + for (auto col = begin + subwarp.thread_rank(); col < end; + col += subwarp_size) { + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(subwarp, partial, op); + result[(row * col_parts + col_part) * result_stride] = finalize(partial); +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d( + int64 rows, int64 cols, int64 row_parts, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, + KernelArgs... args) +{ + const auto idx = thread::get_thread_id_flat(); + const auto col = idx % cols; + const auto row_part = idx / cols; + if (row_part >= row_parts) { + return; + } + const auto rows_per_part = ceildiv(rows, row_parts); + const auto begin = rows_per_part * row_part; + const auto end = min(begin + rows_per_part, rows); + auto partial = init; + for (auto row = begin; row < end; row++) { + partial = op(partial, fn(row, col, args...)); + } + result[col * row_parts + row_part] = finalize(partial); +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_parts, ReductionOp op, FinalizeOp finalize, + ValueType init, const ValueType* input, int64 result_stride, + ValueType* result) +{ + const auto idx = thread::get_subwarp_id_flat(); + if (idx >= num_results) { + return; + } + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = init; + for (int64 part = subwarp.thread_rank(); part < num_parts; + part += subwarp_size) { + partial = op(partial, input[idx * num_parts + part]); + } + partial = reduce(subwarp, partial, op); + if (subwarp.thread_rank() == 0) { + result[idx * result_stride] = finalize(partial); + } +} + + +namespace { + + +template +void run_generic_kernel_row_reduction(syn::value_list, + int64 rows, int64 cols, int64 col_parts, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, int64 result_stride, + KernelArgs... 
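+// editorial note (not in the original patch): in contrast to the CUDA
+// backend's <<<grid, block>>> launches, HIP goes through hipLaunchKernelGGL,
+// and HIP_KERNEL_NAME wraps the kernel so that the commas in its template
+// argument list are not misread as macro argument separators.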
args) +{ + constexpr auto block_size = default_block_size; + const auto num_blocks = ceildiv(rows * cols * subwarp_size, block_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), + num_blocks, block_size, 0, 0, rows, cols, col_parts, fn, op, finalize, + as_hip_type(init), as_hip_type(result), result_stride, args...); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, + run_generic_kernel_row_reduction) + + +template +void run_kernel_reduction_finalize(syn::value_list, + int64 num_results, int64 num_parts, + ReductionOp op, FinalizeOp finalize, + ValueType init, const ValueType* input, + int64 result_stride, ValueType* result) +{ + constexpr auto block_size = default_block_size; + const auto num_blocks = ceildiv(num_results * subwarp_size, block_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generic_kernel_reduction_finalize_2d), + num_blocks, block_size, 0, 0, num_results, num_parts, op, finalize, + as_hip_type(init), as_hip_type(input), + static_cast(result_stride), as_hip_type(result)); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_finalize, + run_kernel_reduction_finalize) + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 4; + gko::hip::device_guard guard{exec->get_device_id()}; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = exec->get_num_warps() * oversubscription; + const auto col_parts = 1; // TODO tune + if (col_parts > 1) { + Array partial{exec, + static_cast(col_parts * rows)}; + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, col_parts, + fn, op, [] __device__(auto i) { return i; }, init, + partial.get_data(), 1, map_to_device(args)...); + select_run_kernel_reduction_finalize( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= col_parts || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, col_parts, op, + finalize, init, partial.get_const_data(), + static_cast(result_stride), result); + } else { + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, + finalize, init, result, static_cast(result_stride), + map_to_device(args)...); + } +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + KernelArgs&&... 
args) +{ + constexpr int oversubscription = 4; + gko::hip::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + const auto num_blocks = ceildiv(rows * cols, block_size); + const auto row_parts = 1; // TODO tune + if (row_parts > 1) { + Array partial{exec, + static_cast(row_parts * cols)}; + hipLaunchKernelGGL( + generic_kernel_col_reduction_2d, num_blocks, block_size, 0, 0, rows, + cols, row_parts, fn, op, [] __device__(auto i) { return i; }, + as_hip_type(init), as_hip_type(partial.get_data()), + map_to_device(args)...); + using subwarp_sizes = + syn::value_list; + select_run_kernel_reduction_finalize( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= row_parts || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), cols, row_parts, op, + finalize, as_hip_type(init), as_hip_type(partial.get_const_data()), + 1, as_hip_type(result)); + } else { + hipLaunchKernelGGL(generic_kernel_col_reduction_2d, num_blocks, + block_size, 0, 0, rows, cols, 1, fn, op, finalize, + as_hip_type(init), as_hip_type(result), + map_to_device(args)...); + } +} + + } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp index 4fb5ef0a4dc..849d8b45161 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -334,4 +334,61 @@ void run2d_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } +void run2d_row_reduction(std::shared_ptr exec) +{ + int num_rows = 1000; + int num_cols = 100; + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, host_ref}; + for (int i = 0; i < num_rows; i++) { + host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1); + } + + gko::kernels::hip::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); +} + +TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } + + +void run2d_col_reduction(std::shared_ptr exec) +{ + int num_rows = 1000; + int num_cols = 100; + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int i = 0; i < num_cols; i++) { + host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1); + } + + gko::kernels::hip::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); +} + +TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } + + } // namespace From 98bc7df16266e906a058e04d495a0ecd5673a682 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 15 Jul 2021 21:38:48 +0200 Subject: [PATCH 06/25] 
add row and column OpenMP reduction kernels --- omp/base/kernel_launch_reduction.hpp | 193 +++++++++++++++++++++++++++ omp/test/base/kernel_launch.cpp | 86 ++++++++++++ 2 files changed, 279 insertions(+) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index 4981d4ed902..dbc055fffd6 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -47,6 +47,10 @@ namespace kernels { namespace omp { +// how many more reduction tasks we launch relative to the number of threads +constexpr int reduction_kernel_oversubscription = 4; + + template void run_kernel_reduction(std::shared_ptr exec, @@ -177,6 +181,195 @@ void run_kernel_reduction(std::shared_ptr exec, } +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + constexpr int block_size = 8; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_threads = static_cast(omp_get_max_threads()); + if (rows <= 0) { + return; + } + // enough work to keep all threads busy or only very small reduction sizes + if (rows >= reduction_kernel_oversubscription * num_threads || + cols < rows) { +#pragma omp parallel for + for (int64 row = 0; row < rows; row++) { + auto partial = init; + for (int64 col = 0; col < cols; col++) { + partial = + op(partial, [&]() { return fn(row, col, args...); }()); + } + result[result_stride * row] = finalize(partial); + } + } else { + // small number of rows and large reduction sizes: do partial sum first + const auto work_per_thread = ceildiv(cols, num_threads); + Array partial{exec, + static_cast(rows * num_threads)}; +#pragma omp parallel num_threads(num_threads) + { + const auto thread_id = static_cast(omp_get_thread_num()); + const auto begin = thread_id * work_per_thread; + const auto end = std::min(begin + work_per_thread, cols); + for (int64 row = 0; row < rows; row++) { + auto local_partial = init; + for (int64 col = begin; col < end; col++) { + local_partial = op(local_partial, [&]() { + return fn(row, col, args...); + }()); + } + partial.get_data()[row * num_threads + thread_id] = + local_partial; + } + } + // then accumulate the partial sums and write to result +#pragma omp parallel for + for (int64 row = 0; row < rows; row++) { + auto local_partial = init; + for (int64 thread_id = 0; thread_id < num_threads; thread_id++) { + local_partial = + op(local_partial, + partial.get_const_data()[row * num_threads + thread_id]); + } + result[row * result_stride] = finalize(local_partial); + } + } +} + + +namespace { + + +template +void run_kernel_col_reduction_sized_block_impl( + KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, + ValueType* result, int64 row_begin, int64 row_end, int64 base_col, + MappedKernelArgs... 
args) +{ + std::array partial; + partial.fill(init); + for (auto row = row_begin; row < row_end; row++) { +#pragma unroll + for (int64 rel_col = 0; rel_col < local_cols; rel_col++) { + partial[rel_col] = op(partial[rel_col], [&]() { + return fn(row, base_col + rel_col, args...); + }()); + } + } +#pragma unroll + for (int64 rel_col = 0; rel_col < local_cols; rel_col++) { + result[base_col + rel_col] = finalize(partial[rel_col]); + } +} + + +template +void run_kernel_col_reduction_sized_impl( + syn::value_list, + std::shared_ptr exec, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, ValueType* result, dim<2> size, + MappedKernelArgs... args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_threads = static_cast(omp_get_max_threads()); + static_assert(remainder_cols < block_size, "remainder too large"); + GKO_ASSERT(cols % block_size == remainder_cols); + const auto num_col_blocks = ceildiv(cols, block_size); + // enough work to keep all threads busy or only very small reduction sizes + if (cols >= reduction_kernel_oversubscription * num_threads || + rows < cols) { +#pragma omp parallel for + for (int64 col_block = 0; col_block < num_col_blocks; col_block++) { + const auto base_col = col_block * block_size; + if (base_col + block_size <= cols) { + run_kernel_col_reduction_sized_block_impl( + fn, op, finalize, init, result, 0, rows, base_col); + } else { + run_kernel_col_reduction_sized_block_impl( + fn, op, finalize, init, result, 0, rows, base_col); + } + } + } else { + // number of blocks that need to be reduced afterwards + const auto reduction_size = + ceildiv(reduction_kernel_oversubscription * num_threads, cols); + const auto rows_per_thread = ceildiv(rows, reduction_size); + Array partial{exec, + static_cast(reduction_size * cols)}; +#pragma omp parallel for + for (int64 i = 0; i < reduction_size * num_col_blocks; i++) { + const auto col_block = i % num_col_blocks; + const auto row_block = i / num_col_blocks; + const auto begin = row_block * rows_per_thread; + const auto end = std::min(begin + rows_per_thread, rows); + const auto base_col = col_block * block_size; + const auto identity = [](auto i) { return i; }; + if (base_col + block_size <= cols) { + run_kernel_col_reduction_sized_block_impl( + fn, op, identity, init, + partial.get_data() + cols * row_block, begin, end, + base_col); + } else { + run_kernel_col_reduction_sized_block_impl( + fn, op, identity, init, + partial.get_data() + cols * row_block, begin, end, + base_col); + } + } +#pragma omp parallel for + for (int64 col = 0; col < cols; col++) { + auto total = init; + for (int64 row_block = 0; row_block < reduction_size; row_block++) { + total = + op(total, partial.get_const_data()[col + cols * row_block]); + } + result[col] = finalize(total); + } + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_col_reduction_sized, + run_kernel_col_reduction_sized_impl) + + +} // namespace + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + KernelArgs&&... 
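+    // editorial note (not in the original patch): the same cols % block_size
+    // trick as in run_kernel_reduction above selects a precompiled remainder.
+    // Inside, work is parallelized over column blocks when
+    // cols >= reduction_kernel_oversubscription * num_threads (or when
+    // rows < cols); otherwise the rows are additionally split and a second
+    // combining pass accumulates the partial results per column.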
args)
+{
+    constexpr auto block_size = 8;
+    using remainders = syn::as_list<syn::range<0, block_size>>;
+    const auto rows = static_cast<int64>(size[0]);
+    const auto cols = static_cast<int64>(size[1]);
+    if (cols <= 0) {
+        return;
+    }
+    select_run_kernel_col_reduction_sized(
+        remainders(),
+        [&](int remainder) { return remainder == cols % block_size; },
+        syn::value_list<int, block_size>(), syn::type_list<>(), exec, fn, op,
+        finalize, init, result, size, args...);
+}
+
+
+} // namespace omp
+} // namespace kernels
+} // namespace gko
diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp
index 6f13797b85f..7184649c3ae 100644
--- a/omp/test/base/kernel_launch.cpp
+++ b/omp/test/base/kernel_launch.cpp
@@ -319,4 +319,90 @@ TEST_F(KernelLaunch, Reduction2D)
 }
 
 
+TEST_F(KernelLaunch, ReductionRow2DSmall)
+{
+    // 4 rows, with oversubscription this means we use multiple threads per row
+    // if OMP_NUM_THREADS >= 2
+    int num_rows = 4;
+    int num_cols = 100;
+    gko::Array<gko::int64> host_ref{exec->get_master(),
+                                    static_cast<gko::size_type>(2 * num_rows)};
+    std::fill_n(host_ref.get_data(), 2 * num_rows, 1234);
+    gko::Array<gko::int64> output{exec, host_ref};
+    for (int i = 0; i < num_rows; i++) {
+        host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1);
+    }
+
+    gko::kernels::omp::run_kernel_row_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2,
+        gko::dim<2>{static_cast<gko::size_type>(num_rows),
+                    static_cast<gko::size_type>(num_cols)});
+
+    GKO_ASSERT_ARRAY_EQ(host_ref, output);
+}
+
+
+TEST_F(KernelLaunch, ReductionRow2D)
+{
+    int num_rows = 1000;
+    int num_cols = 100;
+    gko::Array<gko::int64> host_ref{exec->get_master(),
+                                    static_cast<gko::size_type>(2 * num_rows)};
+    std::fill_n(host_ref.get_data(), 2 * num_rows, 1234);
+    gko::Array<gko::int64> output{exec, host_ref};
+    for (int i = 0; i < num_rows; i++) {
+        host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1);
+    }
+
+    gko::kernels::omp::run_kernel_row_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2,
+        gko::dim<2>{static_cast<gko::size_type>(num_rows),
+                    static_cast<gko::size_type>(num_cols)});
+
+    GKO_ASSERT_ARRAY_EQ(host_ref, output);
+}
+
+
+TEST_F(KernelLaunch, ReductionCol2D)
+{
+    for (int num_rows : {0, 1, 10, 100, 1000}) {
+        for (int num_cols :
+             {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 100, 1000}) {
+            gko::Array<gko::int64> host_ref{
+                exec->get_master(), static_cast<gko::size_type>(num_cols)};
+            gko::Array<gko::int64> output{
+                exec, static_cast<gko::size_type>(num_cols)};
+            for (int i = 0; i < num_cols; i++) {
+                host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1);
+            }
+
+            gko::kernels::omp::run_kernel_col_reduction(
+                exec,
+                [] GKO_KERNEL(auto i, auto j) {
+                    static_assert(is_same<int64, decltype(i)>::value, "index");
+                    return (i + 1) * (j + 1);
+                },
+                [] GKO_KERNEL(auto i, auto j) { return i + j; },
+                [] GKO_KERNEL(auto j) { return j * 2; }, int64{},
+                output.get_data(),
+                gko::dim<2>{static_cast<gko::size_type>(num_rows),
+                            static_cast<gko::size_type>(num_cols)});
+
+            GKO_ASSERT_ARRAY_EQ(host_ref, output);
+        }
+    }
+}
+
 
 } // namespace
From 41d92b4bf56896734dc79ee69f464727c5218d5b Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Thu, 15 Jul 2021 23:56:42 +0200
Subject: [PATCH 07/25] fix types and tests

---
 cuda/test/base/kernel_launch.cu      | 38 +++++++++-----
 hip/test/base/kernel_launch.hip.cpp  | 38 +++++++++-----
 omp/base/kernel_launch.hpp           | 22 +++++---
omp/base/kernel_launch_reduction.hpp | 78 ++++++++++++++++++---------- omp/test/base/kernel_launch.cpp | 52 +++++++++++++------ 5 files changed, 155 insertions(+), 73 deletions(-) diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index c6a4f1c679f..6a5494e03fa 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -280,26 +280,31 @@ TEST_F(KernelLaunch, Runs2DDense) void run1d_reduction(std::shared_ptr exec) { gko::Array output{exec, 1}; + gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i) { + [] GKO_KERNEL(auto i, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100000}); + size_type{100000}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i) { + [] GKO_KERNEL(auto i, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100}); + size_type{100}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); } @@ -309,26 +314,31 @@ TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } void run2d_reduction(std::shared_ptr exec) { gko::Array output{exec, 1}; + gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, 100}); + gko::dim<2>{1000, 100}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{10, 10}); + gko::dim<2>{10, 10}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); } @@ -349,14 +359,16 @@ void run2d_row_reduction(std::shared_ptr exec) gko::kernels::cuda::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -377,14 +389,16 @@ void run2d_col_reduction(std::shared_ptr exec) gko::kernels::cuda::run_kernel_col_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 
gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp index 849d8b45161..755f8b3834d 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -279,26 +279,31 @@ TEST_F(KernelLaunch, Runs2DDense) void run1d_reduction(std::shared_ptr exec) { gko::Array output{exec, 1}; + gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i) { + [] GKO_KERNEL(auto i, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100000}); + size_type{100000}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i) { + [] GKO_KERNEL(auto i, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100}); + size_type{100}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); } @@ -308,26 +313,31 @@ TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } void run2d_reduction(std::shared_ptr exec) { gko::Array output{exec, 1}; + gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, 100}); + gko::dim<2>{1000, 100}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{10, 10}); + gko::dim<2>{10, 10}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); } @@ -348,14 +358,16 @@ void run2d_row_reduction(std::shared_ptr exec) gko::kernels::hip::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -376,14 +388,16 @@ void run2d_col_reduction(std::shared_ptr exec) gko::kernels::hip::run_kernel_col_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), gko::dim<2>{static_cast(num_rows), - 
static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp index 98432c2d155..b6ef373fdee 100644 --- a/omp/base/kernel_launch.hpp +++ b/omp/base/kernel_launch.hpp @@ -44,20 +44,20 @@ namespace kernels { namespace omp { -template -void run_kernel(std::shared_ptr exec, KernelFunction fn, - size_type size, KernelArgs&&... args) +namespace { + + +template +void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, + size_type size, MappedKernelArgs... args) { #pragma omp parallel for for (int64 i = 0; i < static_cast(size); i++) { - [&]() { fn(i, map_to_device(args)...); }(); + [&]() { fn(i, args...); }(); } } -namespace { - - template void run_kernel_sized_impl(syn::value_list, @@ -126,6 +126,14 @@ void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, } // namespace +template +void run_kernel(std::shared_ptr exec, KernelFunction fn, + size_type size, KernelArgs&&... args) +{ + run_kernel_impl(exec, fn, size, map_to_device(args)...); +} + + template void run_kernel(std::shared_ptr exec, KernelFunction fn, dim<2> size, KernelArgs&&... args) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index dbc055fffd6..84758549918 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -51,13 +51,16 @@ namespace omp { constexpr int reduction_kernel_oversubscription = 4; +namespace { + + template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, - ValueType* result, size_type size, - KernelArgs&&... args) + typename FinalizeOp, typename... MappedKernelArgs> +void run_kernel_reduction_impl(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type size, + MappedKernelArgs... args) { const auto num_threads = static_cast(omp_get_max_threads()); const auto ssize = static_cast(size); @@ -83,9 +86,6 @@ void run_kernel_reduction(std::shared_ptr exec, } -namespace { - - template @@ -158,6 +158,19 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, } // namespace +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type size, + KernelArgs&&... args) +{ + run_kernel_reduction_impl(exec, fn, op, finalize, init, result, size, + map_to_device(args)...); +} + + template void run_kernel_reduction(std::shared_ptr exec, @@ -177,17 +190,20 @@ void run_kernel_reduction(std::shared_ptr exec, remainders(), [&](int remainder) { return remainder == cols % block_size; }, syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, init, result, size, args...); + finalize, init, result, size, map_to_device(args)...); } +namespace { + + template -void run_kernel_row_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, - ValueType* result, size_type result_stride, - dim<2> size, KernelArgs&&... args) + typename FinalizeOp, typename... MappedKernelArgs> +void run_kernel_row_reduction_impl(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type result_stride, + dim<2> size, MappedKernelArgs... 
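+    // editorial note (not in the original patch): two regimes are handled
+    // here: with enough rows to keep all threads busy (or when rows
+    // outnumber columns), each row is reduced by a single thread in one
+    // parallel loop; with few long rows, the threads instead split the
+    // columns, write rows * num_threads partial results, and a second
+    // parallel pass combines them per row. `result_stride` is measured in
+    // elements of `result`.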
args)
 {
     constexpr int block_size = 8;
     const auto rows = static_cast<int64>(size[0]);
@@ -244,9 +260,6 @@ void run_kernel_row_reduction(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-namespace {
-
 
 template <typename ValueType, typename KernelFunction, typename ReductionOp,
@@ -295,10 +308,10 @@ void run_kernel_col_reduction_sized_impl(
             const auto base_col = col_block * block_size;
             if (base_col + block_size <= cols) {
                 run_kernel_col_reduction_sized_block_impl<block_size>(
-                    fn, op, finalize, init, result, 0, rows, base_col);
+                    fn, op, finalize, init, result, 0, rows, base_col, args...);
             } else {
                 run_kernel_col_reduction_sized_block_impl<remainder_cols>(
-                    fn, op, finalize, init, result, 0, rows, base_col);
+                    fn, op, finalize, init, result, 0, rows, base_col, args...);
             }
         }
     } else {
@@ -319,13 +332,13 @@ void run_kernel_col_reduction_sized_impl(
             if (base_col + block_size <= cols) {
                 run_kernel_col_reduction_sized_block_impl<block_size>(
                     fn, op, identity, init,
-                    partial.get_data() + cols * row_block, begin, end,
-                    base_col);
+                    partial.get_data() + cols * row_block, begin, end, base_col,
+                    args...);
             } else {
                 run_kernel_col_reduction_sized_block_impl<remainder_cols>(
                     fn, op, identity, init,
-                    partial.get_data() + cols * row_block, begin, end,
-                    base_col);
+                    partial.get_data() + cols * row_block, begin, end, base_col,
+                    args...);
             }
         }
 #pragma omp parallel for
@@ -347,6 +360,19 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_col_reduction_sized,
 } // namespace
 
 
+template <typename ValueType, typename KernelFunction, typename ReductionOp,
+          typename FinalizeOp, typename... KernelArgs>
+void run_kernel_row_reduction(std::shared_ptr<const DefaultExecutor> exec,
+                              KernelFunction fn, ReductionOp op,
+                              FinalizeOp finalize, ValueType init,
+                              ValueType* result, size_type result_stride,
+                              dim<2> size, KernelArgs&&... args)
+{
+    run_kernel_row_reduction_impl(exec, fn, op, finalize, init, result,
+                                  result_stride, size, map_to_device(args)...);
+}
+
+
 template <typename ValueType, typename KernelFunction, typename ReductionOp,
@@ -366,7 +392,7 @@ void run_kernel_col_reduction(std::shared_ptr<const DefaultExecutor> exec,
         remainders(),
         [&](int remainder) { return remainder == cols % block_size; },
        syn::value_list<int, block_size>(), syn::type_list<>(), exec, fn, op,
-        finalize, init, result, size, args...);
+        finalize, init, result, size, map_to_device(args)...);
 }
 
 
diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp
index 7184649c3ae..01c39514cdb 100644
--- a/omp/test/base/kernel_launch.cpp
+++ b/omp/test/base/kernel_launch.cpp
@@ -243,26 +243,31 @@
 TEST_F(KernelLaunch, Reduction1D)
 {
     gko::Array<gko::int64> output{exec, 1};
+
     gko::kernels::omp::run_kernel_reduction(
         exec,
-        [] GKO_KERNEL(auto i) {
+        [] GKO_KERNEL(auto i, auto a) {
             static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64*, decltype(a)>::value, "value");
             return i + 1;
         },
         [] GKO_KERNEL(auto i, auto j) { return i + j; },
         [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
-        size_type{100000});
+        size_type{100000}, output);
+
     ASSERT_EQ(*output.get_const_data(), 10000100000ll);
 
     gko::kernels::omp::run_kernel_reduction(
         exec,
-        [] GKO_KERNEL(auto i) {
+        [] GKO_KERNEL(auto i, auto a) {
             static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64*, decltype(a)>::value, "value");
             return i + 1;
         },
         [] GKO_KERNEL(auto i, auto j) { return i + j; },
         [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
-        size_type{10});
+        size_type{10}, output);
+
     ASSERT_EQ(*output.get_const_data(), 110ll);
 }
 
@@ -270,16 +275,19 @@
 TEST_F(KernelLaunch, Reduction2DSmallRows)
 {
     gko::Array<gko::int64> output{exec, 1};
+
     for (int cols = 0; cols < 17; cols++) {
         gko::kernels::omp::run_kernel_reduction(
             exec,
-            [] GKO_KERNEL(auto i, auto j) {
+            [] GKO_KERNEL(auto i, auto j, auto a) {
                 static_assert(is_same<int64, decltype(i)>::value, "index");
+                
static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{10, cols}); + gko::dim<2>{10, cols}, output); + ASSERT_EQ(*output.get_const_data(), 110ll * cols * (cols + 1)); } } @@ -288,16 +296,19 @@ TEST_F(KernelLaunch, Reduction2DSmallRows) TEST_F(KernelLaunch, Reduction2DLargeRows) { gko::Array output{exec, 1}; + for (int cols = 0; cols < 17; cols++) { gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, cols}); + gko::dim<2>{1000, cols}, output); + ASSERT_EQ(*output.get_const_data(), 1001000ll * cols * (cols + 1)); } } @@ -306,15 +317,18 @@ TEST_F(KernelLaunch, Reduction2DLargeRows) TEST_F(KernelLaunch, Reduction2D) { gko::Array output{exec, 1}; + gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, 100}); + gko::dim<2>{1000, 100}, output); + ASSERT_EQ(*output.get_const_data(), 10110100000ll); } @@ -335,14 +349,16 @@ TEST_F(KernelLaunch, ReductionRow2DSmall) gko::kernels::omp::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -362,14 +378,16 @@ TEST_F(KernelLaunch, ReductionRow2D) gko::kernels::omp::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -389,15 +407,17 @@ TEST_F(KernelLaunch, ReductionCol2D) gko::kernels::omp::run_kernel_col_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } From 513972683e44f0ee9e1d4ab3a0b3f52099a0ff23 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 15 Jul 2021 23:57:11 +0200 Subject: [PATCH 08/25] add dense reduction kernels --- common/unified/matrix/dense_kernels.cpp | 55 ++++++++++++++++++++ omp/matrix/dense_kernels.cpp | 67 
------------------------- 2 files changed, 55 insertions(+), 67 deletions(-) diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp index a06d8e1eef2..a3e90576ced 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.cpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" namespace gko { @@ -220,6 +221,60 @@ void sub_scaled_diag(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); +template +void compute_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return x(i, j) * y(i, j); + }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(), + x->get_size(), x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); + + +template +void compute_conj_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return conj(x(i, j)) * y(i, j); + }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(), + x->get_size(), x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); + + +template +void compute_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex{}, + result->get_values(), x->get_size(), x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); + + template void symm_permute(std::shared_ptr exec, const Array* permutation_indices, diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index c0e4ca75ae3..b9df5fddf24 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -127,73 +127,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = zero(); - } -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - for (size_type i = 0; i < x->get_size()[0]; ++i) { - result->at(0, j) += x->at(i, j) * y->at(i, j); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = zero(); - } -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - for (size_type i = 0; i < x->get_size()[0]; ++i) { - result->at(0, j) += conj(x->at(i, j)) * y->at(i, j); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void 
compute_norm2(std::shared_ptr<const OmpExecutor> exec,
-                   const matrix::Dense<ValueType>* x,
-                   matrix::Dense<remove_complex<ValueType>>* result)
-{
-    using norm_type = remove_complex<ValueType>;
-#pragma omp parallel for
-    for (size_type j = 0; j < x->get_size()[1]; ++j) {
-        result->at(0, j) = zero<norm_type>();
-    }
-#pragma omp parallel for
-    for (size_type j = 0; j < x->get_size()[1]; ++j) {
-        for (size_type i = 0; i < x->get_size()[0]; ++i) {
-            result->at(0, j) += squared_norm(x->at(i, j));
-        }
-    }
-#pragma omp parallel for
-    for (size_type j = 0; j < x->get_size()[1]; ++j) {
-        result->at(0, j) = sqrt(result->at(0, j));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const OmpExecutor> exec,
                     const matrix::Dense<ValueType>* source,
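
The three lambdas passed to run_kernel_col_reduction in PATCH 08/25 play
fixed roles: the first maps an entry (i, j) to a value, the second combines
two partial results, and the third is applied once to each finished column
result. A minimal host-side sketch of these semantics (illustrative only;
host_col_reduction and its parameters are not part of the patch, and the
real kernels parallelize and tile this loop nest):

    #include <cstdint>

    // Reference semantics of a column reduction: one independent
    // reduction per column of a rows x cols index space.
    template <typename T, typename Fn, typename Op, typename Fin>
    void host_col_reduction(std::int64_t rows, std::int64_t cols, Fn fn,
                            Op op, Fin fin, T init, T* result)
    {
        for (std::int64_t col = 0; col < cols; col++) {
            auto partial = init;
            for (std::int64_t row = 0; row < rows; row++) {
                partial = op(partial, fn(row, col));
            }
            result[col] = fin(partial);
        }
    }

With fn = conj(x(i, j)) * y(i, j), op = plus and fin = identity this
reproduces compute_conj_dot; with fn = squared_norm(x(i, j)) and fin = sqrt
it reproduces compute_norm2, which is why the hand-rolled OpenMP loop nests
above can be deleted.
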
From 5b07de5697ae547617fb8bd1d33b0519e77da87f Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Thu, 22 Jul 2021 22:58:56 +0200
Subject: [PATCH 09/25] fix dpcpp simple kernel indexing

---
 dpcpp/base/kernel_launch.dp.hpp        | 20 +++++++++++---------
 dpcpp/base/kernel_launch_solver.dp.hpp | 15 ++++++++-------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/dpcpp/base/kernel_launch.dp.hpp b/dpcpp/base/kernel_launch.dp.hpp
index 0941fc7d524..2b99e98ac36 100644
--- a/dpcpp/base/kernel_launch.dp.hpp
+++ b/dpcpp/base/kernel_launch.dp.hpp
@@ -48,10 +48,11 @@ template <typename KernelFunction, typename... KernelArgs>
 void generic_kernel_1d(sycl::handler& cgh, int64 size, KernelFunction fn,
                        KernelArgs... args)
 {
-    cgh.parallel_for(sycl::range<1>{size}, [=](sycl::id<1> idx_id) {
-        auto idx = static_cast<int64>(idx_id[0]);
-        fn(idx, args...);
-    });
+    cgh.parallel_for(sycl::range<1>{static_cast<std::size_t>(size)},
+                     [=](sycl::id<1> idx_id) {
+                         auto idx = static_cast<int64>(idx_id[0]);
+                         fn(idx, args...);
+                     });
 }
@@ -59,11 +60,12 @@ template <typename KernelFunction, typename... KernelArgs>
 void generic_kernel_2d(sycl::handler& cgh, int64 rows, int64 cols,
                        KernelFunction fn, KernelArgs... args)
 {
-    cgh.parallel_for(sycl::range<2>{rows, cols}, [=](sycl::id<2> idx) {
-        auto row = static_cast<int64>(idx[0]);
-        auto col = static_cast<int64>(idx[1]);
-        fn(row, col, args...);
-    });
+    cgh.parallel_for(sycl::range<1>{static_cast<std::size_t>(rows * cols)},
+                     [=](sycl::id<1> idx) {
+                         auto row = static_cast<int64>(idx[0]) / cols;
+                         auto col = static_cast<int64>(idx[0]) % cols;
+                         fn(row, col, args...);
+                     });
 }
diff --git a/dpcpp/base/kernel_launch_solver.dp.hpp b/dpcpp/base/kernel_launch_solver.dp.hpp
index aa25d167bf3..68ef10ac1fe 100644
--- a/dpcpp/base/kernel_launch_solver.dp.hpp
+++ b/dpcpp/base/kernel_launch_solver.dp.hpp
@@ -46,13 +46,14 @@ void generic_kernel_2d_solver(sycl::handler& cgh, int64 rows, int64 cols,
                               int64 default_stride, KernelFunction fn,
                               KernelArgs... args)
 {
-    cgh.parallel_for(sycl::range<2>{rows, cols}, [=](sycl::id<2> idx) {
-        auto row = static_cast<int64>(idx[0]);
-        auto col = static_cast<int64>(idx[1]);
-        fn(row, col,
-           device_unpack_solver_impl<KernelArgs>::unpack(args,
-                                                         default_stride)...);
-    });
+    cgh.parallel_for(sycl::range<1>{static_cast<std::size_t>(rows * cols)},
+                     [=](sycl::id<1> idx) {
+                         auto row = static_cast<int64>(idx[0] / cols);
+                         auto col = static_cast<int64>(idx[0] % cols);
+                         fn(row, col,
+                            device_unpack_solver_impl<KernelArgs>::unpack(
+                                args, default_stride)...);
+                     });
 }
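
The indexing fix above replaces the two-dimensional sycl::range<2> launch
with a flat one-dimensional range of rows * cols work items, so that work
item i always maps to (row, col) = (i / cols, i % cols), the same row-major
mapping the CUDA and HIP kernels use. A small self-contained check of that
round trip (an editorial sketch, not code from the patch):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const std::int64_t rows = 3, cols = 5;
        for (std::int64_t row = 0; row < rows; row++) {
            for (std::int64_t col = 0; col < cols; col++) {
                // Row-major flattening and its inverse.
                const std::int64_t flat = row * cols + col;
                assert(flat / cols == row);
                assert(flat % cols == col);
            }
        }
        return 0;
    }
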
From e9ee66627e74686dd017b3446b7cddcfb1b41f90 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Thu, 22 Jul 2021 23:01:05 +0200
Subject: [PATCH 10/25] add reduction kernels

---
 dpcpp/base/helper.hpp                     |   2 +-
 dpcpp/base/kernel_launch_reduction.dp.hpp | 291 ++++++++++++++++++++++
 dpcpp/test/base/kernel_launch.dp.cpp      |  93 +++++++
 3 files changed, 385 insertions(+), 1 deletion(-)
 create mode 100644 dpcpp/base/kernel_launch_reduction.dp.hpp

diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp
index 90ec1cc05fe..f215864e01d 100644
--- a/dpcpp/base/helper.hpp
+++ b/dpcpp/base/helper.hpp
@@ -166,7 +166,7 @@ bool validate(sycl::queue* queue, unsigned workgroup_size,
  * @return the first valid config
  */
 template <typename IterArr, typename Validate>
-std::uint32_t get_first_cfg(IterArr& arr, Validate verify)
+std::uint32_t get_first_cfg(const IterArr& arr, Validate verify)
 {
     for (auto& cfg : arr) {
         if (verify(cfg)) {
diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp
new file mode 100644
index 00000000000..cbf3e3d7158
--- /dev/null
+++ b/dpcpp/base/kernel_launch_reduction.dp.hpp
@@ -0,0 +1,291 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
+#error \
+    "This file can only be used from inside common/base/kernel_launch_reduction.hpp"
+#endif
+
+
+#include "core/synthesizer/implementation_selection.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/components/uninitialized_array.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+
+
+using KCFG_1D = ConfigSet<11, 7>;
+constexpr auto kcfg_1d_list_simple_reduction =
+    syn::value_list<std::uint32_t,
+                    static_cast<std::uint32_t>(KCFG_1D::encode(512, 64)),
+                    static_cast<std::uint32_t>(KCFG_1D::encode(512, 32)),
+                    static_cast<std::uint32_t>(KCFG_1D::encode(512, 16)),
+                    static_cast<std::uint32_t>(KCFG_1D::encode(256, 32)),
+                    static_cast<std::uint32_t>(KCFG_1D::encode(256, 16)),
+                    static_cast<std::uint32_t>(KCFG_1D::encode(256, 8))>();
+
+
+template <std::uint32_t cfg, typename ValueType, typename KernelFunction,
+          typename ReductionOp, typename FinalizeOp, typename... KernelArgs>
+void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size,
+                                 int64 num_workgroups, KernelFunction fn,
+                                 ReductionOp op, FinalizeOp finalize,
+                                 ValueType init, ValueType* storage,
+                                 KernelArgs... args)
+{
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
+    constexpr auto num_partials = wg_size / sg_size;
+    sycl::accessor<UninitializedArray<ValueType, num_partials>, 1,
+                   sycl::access_mode::read_write, sycl::access::target::local>
+        subgroup_partial_acc(sycl::range<1>{1}, cgh);
+    const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size));
+    const auto global_size = num_workgroups * wg_size;
+
+    cgh.parallel_for(
+        range, [=
+    ](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] {
+            auto subgroup_partial = &subgroup_partial_acc[0][0];
+            const auto tidx = thread::get_thread_id_flat<int64>(idx);
+            const auto local_tidx = static_cast<int64>(tidx % wg_size);
+            auto subgroup = group::tiled_partition<sg_size>(
+                group::this_thread_block(idx));
+            auto partial = init;
+            for (int64 i = tidx; i < size; i += global_size) {
+                partial = op(partial, fn(i, args...));
+            }
+            partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op);
+            if (subgroup.thread_rank() == 0) {
+                subgroup_partial[local_tidx / sg_size] = partial;
+            }
+            idx.barrier(sycl::access::fence_space::local_space);
+            if (local_tidx < sg_size) {
+                partial = init;
+                for (int64 i = local_tidx; i < num_partials; i += sg_size) {
+                    partial = op(partial, subgroup_partial[i]);
+                }
+                partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op);
+                if (subgroup.thread_rank() == 0) {
+                    storage[tidx / wg_size] = finalize(partial);
+                }
+            }
+        });
+}
+
+
+template <std::uint32_t cfg, typename ValueType, typename KernelFunction,
+          typename ReductionOp, typename FinalizeOp, typename... KernelArgs>
+void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols,
+                                 int64 num_workgroups, KernelFunction fn,
+                                 ReductionOp op, FinalizeOp finalize,
+                                 ValueType init, ValueType* storage,
+                                 KernelArgs...
args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + constexpr auto num_partials = wg_size / sg_size; + sycl::accessor, 1, + sycl::access_mode::read_write, sycl::access::target::local> + subgroup_partial_acc(sycl::range<1>{1}, cgh); + const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); + const auto global_size = num_workgroups * wg_size; + + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] { + auto subgroup_partial = &subgroup_partial_acc[0][0]; + const auto tidx = thread::get_thread_id_flat(idx); + const auto local_tidx = static_cast(tidx % wg_size); + auto subgroup = + group::tiled_partition(group::this_thread_block(idx)); + auto partial = init; + for (int64 i = tidx; i < rows * cols; i += global_size) { + const auto row = i / cols; + const auto col = i % cols; + partial = op(partial, fn(row, col, args...)); + } + partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op); + if (subgroup.thread_rank() == 0) { + subgroup_partial[local_tidx / sg_size] = partial; + } + idx.barrier(sycl::access::fence_space::local_space); + if (local_tidx < sg_size) { + partial = init; + for (int64 i = local_tidx; i < num_partials; i += sg_size) { + partial = op(partial, subgroup_partial[i]); + } + partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op); + if (subgroup.thread_rank() == 0) { + storage[tidx / wg_size] = finalize(partial); + } + } + }); +} + + +template +void run_kernel_reduction_impl(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type size, + KernelArgs... args) +{ + constexpr auto cfg = static_cast(icfg); + constexpr int oversubscription = 4; + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto num_workgroups = + std::min(ceildiv(size, wg_size), + exec->get_num_computing_units() * oversubscription); + auto queue = exec->get_queue(); + if (num_workgroups > 1) { + Array partial{exec, static_cast(num_workgroups)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(size), num_workgroups, fn, op, + [](auto v) { return v; }, init, partial.get_data(), args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(num_workgroups), 1, + [](auto i, auto v) { return v[i]; }, op, finalize, init, result, + partial.get_const_data()); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d(cgh, static_cast(size), + num_workgroups, fn, op, finalize, init, + result, args...); + }); + } +} + + +template +void run_kernel_reduction_impl(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + KernelArgs... 
args) +{ + constexpr auto cfg = static_cast(icfg); + constexpr int oversubscription = 4; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto flat_size = rows * cols; + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto num_workgroups = + std::min(ceildiv(flat_size, wg_size), + exec->get_num_computing_units() * oversubscription); + auto queue = exec->get_queue(); + if (num_workgroups > 1) { + Array partial{exec, static_cast(num_workgroups)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_2d( + cgh, rows, cols, num_workgroups, fn, op, + [](auto v) { return v; }, init, partial.get_data(), args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(num_workgroups), 1, + [](auto i, auto v) { return v[i]; }, op, finalize, init, result, + partial.get_const_data()); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_2d(cgh, rows, cols, num_workgroups, fn, op, + finalize, init, result, args...); + }); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction, + run_kernel_reduction_impl) + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, KernelArgs&&... args) +{ + const auto desired_icfg = static_cast(get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + })); + select_run_kernel_reduction( + kcfg_1d_list_simple_reduction, + [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), + syn::type_list<>(), exec, fn, op, finalize, init, result, size, + map_to_device(args)...); +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type size, + KernelArgs&&... args) +{ + const auto desired_icfg = static_cast(get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + })); + select_run_kernel_reduction( + kcfg_1d_list_simple_reduction, + [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), + syn::type_list<>(), exec, fn, op, finalize, init, result, size, + map_to_device(args)...); +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/test/base/kernel_launch.dp.cpp b/dpcpp/test/base/kernel_launch.dp.cpp index decd2e8c64a..592ce9b934c 100644 --- a/dpcpp/test/base/kernel_launch.dp.cpp +++ b/dpcpp/test/base/kernel_launch.dp.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
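// Editorial note on the tests that follow (a sketch of the expected values,
// not part of the original patch): the asserted constants are closed-form
// sums. With fn(i) = i + 1 over n elements and finalize(j) = j * 2, the
// result is 2 * (1 + 2 + ... + n) = n * (n + 1), giving
// 100000 * 100001 = 10000100000 for n = 100000 and 100 * 101 = 10100 for
// n = 100. With fn(i, j) = (i + 1) * (j + 1) over R x C elements and
// finalize(j) = j * 4, it is 4 * (R * (R + 1) / 2) * (C * (C + 1) / 2),
// giving 10110100000 for R = 1000, C = 100 and 12100 for R = C = 10.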
#include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -256,4 +257,96 @@ TEST_F(KernelLaunch, Runs2DDense) } +TEST_F(KernelLaunch, Reduction1D) +{ + gko::Array output{exec, 1}; + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100000}, output); + + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100}, output); + + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); +} + + +TEST_F(KernelLaunch, Reduction2D) +{ + gko::Array output{exec, 1}; + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); + + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{10, 10}, output); + + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); +} + + } // namespace From 923913910779d4282ac5001a0ddfe0e30f62470e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 27 Jul 2021 23:50:37 +0200 Subject: [PATCH 11/25] improve simple reductions for CUDA --- cuda/base/kernel_launch_reduction.cuh | 335 +++++++++++++++++--------- cuda/test/base/kernel_launch.cu | 163 ++++++++----- 2 files changed, 333 insertions(+), 165 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index 08849a90a4a..c2560ded334 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -79,7 +79,11 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - partial = reduce(warp, warp_partial[threadIdx.x], op); + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? 
warp_partial[threadIdx.x] + : init, + op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); } @@ -119,7 +123,11 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - partial = reduce(warp, warp_partial[threadIdx.x], op); + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? warp_partial[threadIdx.x] + : init, + op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); } @@ -135,7 +143,7 @@ void run_kernel_reduction(std::shared_ptr exec, ValueType* result, size_type size, KernelArgs&&... args) { - constexpr int oversubscription = 4; + constexpr int oversubscription = 16; gko::cuda::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; const auto num_blocks = std::min( @@ -144,7 +152,7 @@ void run_kernel_reduction(std::shared_ptr exec, Array partial{exec, static_cast(num_blocks)}; generic_kernel_reduction_1d<<>>( static_cast(size), fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(init), + [] GKO_KERNEL(auto v) { return v; }, as_cuda_type(init), as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), @@ -166,7 +174,7 @@ void run_kernel_reduction(std::shared_ptr exec, FinalizeOp finalize, ValueType init, ValueType* result, dim<2> size, KernelArgs&&... args) { - constexpr int oversubscription = 4; + constexpr int oversubscription = 16; gko::cuda::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; const auto rows = static_cast(size[0]); @@ -177,7 +185,7 @@ void run_kernel_reduction(std::shared_ptr exec, if (num_blocks > 1) { Array partial{exec, static_cast(num_blocks)}; generic_kernel_reduction_2d<<>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, + rows, cols, fn, op, [] GKO_KERNEL(auto v) { return v; }, as_cuda_type(init), as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( @@ -197,19 +205,19 @@ template __global__ __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( - int64 rows, int64 cols, int64 col_parts, KernelFunction fn, + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, int64 result_stride, KernelArgs... args) { const auto idx = thread::get_subwarp_id_flat(); const auto row = idx % rows; - const auto col_part = idx / rows; - if (col_part >= col_parts) { + const auto col_block = idx / rows; + if (col_block >= col_blocks) { return; } - const auto cols_per_part = ceildiv(cols, col_parts); - // TODO use boundaries divisible by subwarp_size - const auto begin = cols_per_part * col_part; + const auto cols_per_part = + ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; + const auto begin = cols_per_part * col_block; const auto end = min(begin + cols_per_part, cols); auto subwarp = group::tiled_partition(group::this_thread_block()); @@ -219,58 +227,135 @@ __global__ partial = op(partial, fn(row, col, args...)); } partial = reduce(subwarp, partial, op); - result[(row * col_parts + col_part) * result_stride] = finalize(partial); + result[(row + col_block * rows) * result_stride] = finalize(partial); +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, ValueType* result, + KernelArgs... 
args) +{ + constexpr auto warp_size = config::warp_size; + constexpr auto warps_per_block = default_block_size / warp_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = warps_per_block * subwarp_size; + __shared__ UninitializedArray block_partial; + const auto subwarp_id = thread::get_subwarp_id_flat(); + const auto local_warp_id = threadIdx.x / warp_size; + const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; + const auto subwarp_num = + thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + // + if (threadIdx.x < shared_storage) { + block_partial[threadIdx.x] = init; + } + block.sync(); + // + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto subwarp_rank = warp_rank % subwarp_size; + const auto col = static_cast(subwarp_rank); + auto partial = init; + // accumulate within a thread + if (col < cols) { + for (auto row = subwarp_id; row < rows; row += subwarp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } // store the result to shared memory + if (local_subwarp_id == 0) { + block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; + } + block.sync(); + // in a single thread: accumulate the results + if (local_warp_id == 0) { + partial = init; + // accumulate the partial results within a thread + if (shared_storage >= warp_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + } else if (warp_rank < shared_storage) { + partial = op(partial, block_partial[warp_rank]); + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } + if (warp_rank < cols) { + result[warp_rank + blockIdx.x * cols] = finalize(partial); + } + } } template __global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d( - int64 rows, int64 cols, int64 row_parts, KernelFunction fn, - ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, ValueType* result, KernelArgs... 
args) { - const auto idx = thread::get_thread_id_flat(); - const auto col = idx % cols; - const auto row_part = idx / cols; - if (row_part >= row_parts) { - return; - } - const auto rows_per_part = ceildiv(rows, row_parts); - const auto begin = rows_per_part * row_part; - const auto end = min(begin + rows_per_part, rows); + constexpr auto warp_size = config::warp_size; + __shared__ UninitializedArray block_partial; + const auto warp_id = thread::get_subwarp_id_flat(); + const auto warp_num = thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; auto partial = init; - for (auto row = begin; row < end; row++) { - partial = op(partial, fn(row, col, args...)); + // accumulate within a thread + if (col < cols) { + for (auto row = warp_id; row < rows; row += warp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[threadIdx.x] = partial; + block.sync(); + // in a single warp: accumulate the results + if (threadIdx.x < warp_size) { + partial = init; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < default_block_size; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + if (col < cols) { + result[col + blockIdx.x * cols] = finalize(partial); + } } - result[col * row_parts + row_part] = finalize(partial); } -template +template __global__ __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( - int64 num_results, int64 num_parts, ReductionOp op, FinalizeOp finalize, - ValueType init, const ValueType* input, int64 result_stride, - ValueType* result) + int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType init, const ValueType* input, + int64 result_stride, ValueType* result) { - const auto idx = thread::get_subwarp_id_flat(); + const auto idx = thread::get_thread_id_flat(); if (idx >= num_results) { return; } - auto subwarp = - group::tiled_partition(group::this_thread_block()); auto partial = init; - for (int64 part = subwarp.thread_rank(); part < num_parts; - part += subwarp_size) { - partial = op(partial, input[idx * num_parts + part]); - } - partial = reduce(subwarp, partial, op); - if (subwarp.thread_rank() == 0) { - result[idx * result_stride] = finalize(partial); + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, input[idx + block * num_results]); } + result[idx * result_stride] = finalize(partial); } @@ -280,42 +365,62 @@ namespace { template void run_generic_kernel_row_reduction(syn::value_list, - int64 rows, int64 cols, int64 col_parts, + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, int64 result_stride, KernelArgs... 
args) { - constexpr auto block_size = default_block_size; - const auto num_blocks = ceildiv(rows * cols * subwarp_size, block_size); - generic_kernel_row_reduction_2d<<>>( - rows, cols, col_parts, fn, op, finalize, as_cuda_type(init), - as_cuda_type(result), result_stride, args...); + const auto num_blocks = + ceildiv(rows * col_blocks * subwarp_size, default_block_size); + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, finalize, as_cuda_type(init), + as_cuda_type(result), result_stride, args...); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, run_generic_kernel_row_reduction) -template -void run_kernel_reduction_finalize(syn::value_list, - int64 num_results, int64 num_parts, - ReductionOp op, FinalizeOp finalize, - ValueType init, const ValueType* input, - int64 result_stride, ValueType* result) +template +void run_generic_col_reduction_small(syn::value_list, + int64 max_blocks, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + MappedKernelArgs... args) { - constexpr auto block_size = default_block_size; - const auto num_blocks = ceildiv(num_results * subwarp_size, block_size); - generic_kernel_reduction_finalize_2d - <<>>(num_results, num_parts, op, finalize, - as_cuda_type(init), as_cuda_type(input), - static_cast(result_stride), - as_cuda_type(result)); + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = std::min( + ceildiv(rows * subwarp_size, default_block_size), max_blocks); + if (num_blocks <= 1) { + generic_kernel_col_reduction_2d_small + <<<1, default_block_size>>>(rows, cols, fn, op, finalize, + as_cuda_type(init), + as_cuda_type(result), args...); + } else { + Array tmp_storage{exec, + static_cast(num_blocks * cols)}; + generic_kernel_col_reduction_2d_small + <<>>( + rows, cols, fn, op, [] GKO_KERNEL(auto v) { return v; }, + as_cuda_type(init), as_cuda_type(tmp_storage.get_data()), + args...); + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, num_blocks, op, finalize, as_cuda_type(init), + as_cuda_type(tmp_storage.get_const_data()), 1, + as_cuda_type(result)); + } } -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_finalize, - run_kernel_reduction_finalize) +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); } // namespace @@ -331,33 +436,29 @@ void run_kernel_row_reduction(std::shared_ptr exec, { using subwarp_sizes = syn::value_list; - constexpr int oversubscription = 4; + constexpr int oversubscription = 16; gko::cuda::device_guard guard{exec->get_device_id()}; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto resources = exec->get_num_warps() * oversubscription; - const auto col_parts = 1; // TODO tune - if (col_parts > 1) { + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); Array partial{exec, - static_cast(col_parts * rows)}; - select_run_generic_kernel_row_reduction( - subwarp_sizes(), - [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= cols || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), rows, cols, col_parts, - fn, op, [] __device__(auto i) { return i; }, init, - partial.get_data(), 1, 
map_to_device(args)...); - select_run_kernel_reduction_finalize( - subwarp_sizes(), - [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= col_parts || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), rows, col_parts, op, - finalize, init, partial.get_const_data(), - static_cast(result_stride), result); + static_cast(col_blocks * rows)}; + const auto num_blocks = + ceildiv(rows * col_blocks * config::warp_size, default_block_size); + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, + [] GKO_KERNEL(auto v) { return v; }, as_cuda_type(init), + as_cuda_type(partial.get_data()), 1, map_to_device(args)...); + const auto num_finalize_blocks = ceildiv(rows, default_block_size); + generic_kernel_reduction_finalize_2d<<>>( + rows, col_blocks, op, finalize, as_cuda_type(init), + as_cuda_type(partial.get_const_data()), + static_cast(result_stride), as_cuda_type(result)); } else { select_run_generic_kernel_row_reduction( subwarp_sizes(), @@ -380,37 +481,49 @@ void run_kernel_col_reduction(std::shared_ptr exec, ValueType* result, dim<2> size, KernelArgs&&... args) { - constexpr int oversubscription = 4; + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; gko::cuda::device_guard guard{exec->get_device_id()}; - constexpr auto block_size = default_block_size; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto resources = - exec->get_num_warps() * config::warp_size * oversubscription; - const auto num_blocks = ceildiv(rows * cols, block_size); - const auto row_parts = 1; // TODO tune - if (row_parts > 1) { - Array partial{exec, - static_cast(row_parts * cols)}; - generic_kernel_col_reduction_2d<<>>( - rows, cols, row_parts, fn, op, [] __device__(auto i) { return i; }, - as_cuda_type(init), as_cuda_type(partial.get_data()), - map_to_device(args)...); - using subwarp_sizes = - syn::value_list; - select_run_kernel_reduction_finalize( + const auto max_blocks = exec->get_num_warps() * config::warp_size * + oversubscription / default_block_size; + if (cols <= config::warp_size) { + select_generic_col_reduction_small( subwarp_sizes(), [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= row_parts || + return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), cols, row_parts, op, - finalize, as_cuda_type(init), - as_cuda_type(partial.get_const_data()), 1, as_cuda_type(result)); + syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, + op, finalize, init, result, size, map_to_device(args)...); } else { - generic_kernel_col_reduction_2d<<>>( - rows, cols, 1, fn, op, finalize, as_cuda_type(init), - as_cuda_type(result), map_to_device(args)...); + const auto col_blocks = ceildiv(cols, config::warp_size); + const auto row_blocks = + ceildiv(std::min( + ceildiv(rows * config::warp_size, default_block_size), + max_blocks), + col_blocks); + if (row_blocks <= 1) { + generic_kernel_col_reduction_2d_blocked<<>>( + rows, cols, fn, op, finalize, as_cuda_type(init), + as_cuda_type(result), map_to_device(args)...); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + generic_kernel_col_reduction_2d_blocked<<< + dim3(row_blocks, col_blocks), default_block_size>>>( + rows, cols, fn, op, [] GKO_KERNEL(auto v) { return v; }, + as_cuda_type(init), as_cuda_type(tmp_storage.get_data()), + map_to_device(args)...); + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, 
default_block_size), default_block_size>>>( + cols, row_blocks, op, finalize, as_cuda_type(init), + as_cuda_type(tmp_storage.get_const_data()), 1, + as_cuda_type(result)); + } } } diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index 6a5494e03fa..2df6ee4ade7 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -301,9 +301,16 @@ void run1d_reduction(std::shared_ptr exec) static_assert(is_same::value, "value"); return i + 1; }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100}, output); + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100}, output); ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); } @@ -319,12 +326,20 @@ void run2d_reduction(std::shared_ptr exec) exec, [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, 100}, output); + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); @@ -332,12 +347,20 @@ void run2d_reduction(std::shared_ptr exec) exec, [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{10, 10}, output); + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{10, 10}, output); ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); } @@ -347,30 +370,45 @@ TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } void run2d_row_reduction(std::shared_ptr exec) { - int num_rows = 1000; - int num_cols = 100; - gko::Array host_ref{exec->get_master(), - static_cast(2 * num_rows)}; - std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); - gko::Array output{exec, host_ref}; - for (int i = 0; i < num_rows; i++) { - host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1); - } - - gko::kernels::cuda::run_kernel_row_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto a) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "value"); - return (i + 1) * (j + 1); - }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, - gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}, - output); + for (auto num_rows : {0, 100, 1000, 10000}) { + for (auto num_cols : {0, 10, 100, 1000, 10000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " 
+ + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, host_ref}; + for (int64 i = 0; i < num_rows; i++) { + host_ref.get_data()[2 * i] = + num_cols * (num_cols + 1) * (i + 1); + } - GKO_ASSERT_ARRAY_EQ(host_ref, output); + gko::kernels::cuda::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, + "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } } TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } @@ -378,29 +416,46 @@ TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } void run2d_col_reduction(std::shared_ptr exec) { - int num_rows = 1000; - int num_cols = 100; - gko::Array host_ref{exec->get_master(), - static_cast(num_cols)}; - gko::Array output{exec, static_cast(num_cols)}; - for (int i = 0; i < num_cols; i++) { - host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1); - } - - gko::kernels::cuda::run_kernel_col_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto a) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "value"); - return (i + 1) * (j + 1); - }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}, - output); + // empty, most threads idle, most threads busy, multiple blocks + for (auto num_rows : {0, 10, 100, 1000, 10000}) { + // check different edge cases: subwarp sizes, blocked mode + for (auto num_cols : + {0, 1, 2, 3, 4, 5, 7, 8, 9, 16, 31, 32, 63, 127, 128, 129}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int64 i = 0; i < num_cols; i++) { + host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1); + } - GKO_ASSERT_ARRAY_EQ(host_ref, output); + gko::kernels::cuda::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, + "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } } TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } From 45202f045c2d4874c25332c5475b7e9f8b79c266 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 27 Jul 2021 23:53:02 +0200 Subject: [PATCH 12/25] move CUDA reduction kernels to common entirely --- cuda/matrix/dense_kernels.cu | 127 ----------------------------------- 1 file changed, 127 deletions(-) diff --git 
a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 7a18bb06e39..2b8fb8157c6 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -117,133 +117,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (cublas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::dot(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_dot<<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(y->get_const_values() + col), - y->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sum_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (cublas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::conj_dot(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_conj_dot - <<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(y->get_const_values() + col), - y->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sum_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if 
(cublas::is_supported::value) { - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::norm2(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); - } - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_norm2<<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sqrt_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, From 879c678e4a46eb35d8a9a8a4d6106fd064566ee4 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 28 Jul 2021 00:34:45 +0200 Subject: [PATCH 13/25] add HIP kernels --- cuda/base/kernel_launch_reduction.cuh | 10 +- hip/base/kernel_launch_reduction.hip.hpp | 339 +++++++++++++++-------- hip/matrix/dense_kernels.hip.cpp | 138 --------- 3 files changed, 233 insertions(+), 254 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index c2560ded334..a857e0d5035 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -152,7 +152,7 @@ void run_kernel_reduction(std::shared_ptr exec, Array partial{exec, static_cast(num_blocks)}; generic_kernel_reduction_1d<<>>( static_cast(size), fn, op, - [] GKO_KERNEL(auto v) { return v; }, as_cuda_type(init), + [] __device__(auto v) { return v; }, as_cuda_type(init), as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), @@ -185,7 +185,7 @@ void run_kernel_reduction(std::shared_ptr exec, if (num_blocks > 1) { Array partial{exec, static_cast(num_blocks)}; generic_kernel_reduction_2d<<>>( - rows, cols, fn, op, [] GKO_KERNEL(auto v) { return v; }, + rows, cols, fn, op, [] __device__(auto v) { return v; }, as_cuda_type(init), as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( @@ -408,7 +408,7 @@ void run_generic_col_reduction_small(syn::value_list, static_cast(num_blocks * cols)}; generic_kernel_col_reduction_2d_small <<>>( - rows, cols, fn, op, [] GKO_KERNEL(auto v) { return v; }, + rows, cols, fn, op, [] __device__(auto v) { return v; }, as_cuda_type(init), as_cuda_type(tmp_storage.get_data()), args...); generic_kernel_reduction_finalize_2d<<< @@ -451,7 +451,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, generic_kernel_row_reduction_2d <<>>( rows, cols, col_blocks, fn, op, - [] GKO_KERNEL(auto v) { return v; }, as_cuda_type(init), + [] __device__(auto v) { return v; }, as_cuda_type(init), as_cuda_type(partial.get_data()), 1, map_to_device(args)...); const auto num_finalize_blocks = ceildiv(rows, 
default_block_size); generic_kernel_reduction_finalize_2d<< exec, exec, static_cast(row_blocks * cols)}; generic_kernel_col_reduction_2d_blocked<<< dim3(row_blocks, col_blocks), default_block_size>>>( - rows, cols, fn, op, [] GKO_KERNEL(auto v) { return v; }, + rows, cols, fn, op, [] __device__(auto v) { return v; }, as_cuda_type(init), as_cuda_type(tmp_storage.get_data()), map_to_device(args)...); generic_kernel_reduction_finalize_2d<<< diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index fe4b697bc30..7a875491899 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -79,7 +79,11 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - partial = reduce(warp, warp_partial[threadIdx.x], op); + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? warp_partial[threadIdx.x] + : init, + op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); } @@ -119,8 +123,14 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - storage[blockIdx.x] = - finalize(reduce(warp, warp_partial[threadIdx.x], op)); + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? warp_partial[threadIdx.x] + : init, + op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } } } @@ -133,7 +143,7 @@ void run_kernel_reduction(std::shared_ptr exec, ValueType* result, size_type size, KernelArgs&&... args) { - constexpr int oversubscription = 4; + constexpr int oversubscription = 16; gko::hip::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; const auto num_blocks = std::min( @@ -167,7 +177,7 @@ void run_kernel_reduction(std::shared_ptr exec, FinalizeOp finalize, ValueType init, ValueType* result, dim<2> size, KernelArgs&&... args) { - constexpr int oversubscription = 4; + constexpr int oversubscription = 16; gko::hip::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; const auto rows = static_cast(size[0]); @@ -200,19 +210,19 @@ template __global__ __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( - int64 rows, int64 cols, int64 col_parts, KernelFunction fn, + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, int64 result_stride, KernelArgs... 
args) { const auto idx = thread::get_subwarp_id_flat(); const auto row = idx % rows; - const auto col_part = idx / rows; - if (col_part >= col_parts) { + const auto col_block = idx / rows; + if (col_block >= col_blocks) { return; } - const auto cols_per_part = ceildiv(cols, col_parts); - // TODO use boundaries divisible by subwarp_size - const auto begin = cols_per_part * col_part; + const auto cols_per_part = + ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; + const auto begin = cols_per_part * col_block; const auto end = min(begin + cols_per_part, cols); auto subwarp = group::tiled_partition(group::this_thread_block()); @@ -222,58 +232,135 @@ __global__ partial = op(partial, fn(row, col, args...)); } partial = reduce(subwarp, partial, op); - result[(row * col_parts + col_part) * result_stride] = finalize(partial); + result[(row + col_block * rows) * result_stride] = finalize(partial); +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, ValueType* result, + KernelArgs... args) +{ + constexpr auto warp_size = config::warp_size; + constexpr auto warps_per_block = default_block_size / warp_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = warps_per_block * subwarp_size; + __shared__ UninitializedArray block_partial; + const auto subwarp_id = thread::get_subwarp_id_flat(); + const auto local_warp_id = threadIdx.x / warp_size; + const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; + const auto subwarp_num = + thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + // + if (threadIdx.x < shared_storage) { + block_partial[threadIdx.x] = init; + } + block.sync(); + // + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto subwarp_rank = warp_rank % subwarp_size; + const auto col = static_cast(subwarp_rank); + auto partial = init; + // accumulate within a thread + if (col < cols) { + for (auto row = subwarp_id; row < rows; row += subwarp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } // store the result to shared memory + if (local_subwarp_id == 0) { + block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; + } + block.sync(); + // in a single thread: accumulate the results + if (local_warp_id == 0) { + partial = init; + // accumulate the partial results within a thread + if (shared_storage >= warp_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + } else if (warp_rank < shared_storage) { + partial = op(partial, block_partial[warp_rank]); + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } + if (warp_rank < cols) { + result[warp_rank + blockIdx.x * cols] = finalize(partial); + } + } } template __global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d( - int64 rows, int64 cols, int64 row_parts, KernelFunction fn, - ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, + 
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, ValueType* result, KernelArgs... args) { - const auto idx = thread::get_thread_id_flat(); - const auto col = idx % cols; - const auto row_part = idx / cols; - if (row_part >= row_parts) { - return; - } - const auto rows_per_part = ceildiv(rows, row_parts); - const auto begin = rows_per_part * row_part; - const auto end = min(begin + rows_per_part, rows); + constexpr auto warp_size = config::warp_size; + __shared__ UninitializedArray block_partial; + const auto warp_id = thread::get_subwarp_id_flat(); + const auto warp_num = thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; auto partial = init; - for (auto row = begin; row < end; row++) { - partial = op(partial, fn(row, col, args...)); + // accumulate within a thread + if (col < cols) { + for (auto row = warp_id; row < rows; row += warp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[threadIdx.x] = partial; + block.sync(); + // in a single warp: accumulate the results + if (threadIdx.x < warp_size) { + partial = init; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < default_block_size; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + if (col < cols) { + result[col + blockIdx.x * cols] = finalize(partial); + } } - result[col * row_parts + row_part] = finalize(partial); } -template +template __global__ __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( - int64 num_results, int64 num_parts, ReductionOp op, FinalizeOp finalize, - ValueType init, const ValueType* input, int64 result_stride, - ValueType* result) + int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType init, const ValueType* input, + int64 result_stride, ValueType* result) { - const auto idx = thread::get_subwarp_id_flat(); + const auto idx = thread::get_thread_id_flat(); if (idx >= num_results) { return; } - auto subwarp = - group::tiled_partition(group::this_thread_block()); auto partial = init; - for (int64 part = subwarp.thread_rank(); part < num_parts; - part += subwarp_size) { - partial = op(partial, input[idx * num_parts + part]); - } - partial = reduce(subwarp, partial, op); - if (subwarp.thread_rank() == 0) { - result[idx * result_stride] = finalize(partial); + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, input[idx + block * num_results]); } + result[idx * result_stride] = finalize(partial); } @@ -283,43 +370,65 @@ namespace { template void run_generic_kernel_row_reduction(syn::value_list, - int64 rows, int64 cols, int64 col_parts, + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, int64 result_stride, KernelArgs... 
args) { - constexpr auto block_size = default_block_size; - const auto num_blocks = ceildiv(rows * cols * subwarp_size, block_size); + const auto num_blocks = + ceildiv(rows * col_blocks * subwarp_size, default_block_size); hipLaunchKernelGGL( HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), - num_blocks, block_size, 0, 0, rows, cols, col_parts, fn, op, finalize, - as_hip_type(init), as_hip_type(result), result_stride, args...); + num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, op, + finalize, as_hip_type(init), as_hip_type(result), result_stride, + args...); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, run_generic_kernel_row_reduction) -template -void run_kernel_reduction_finalize(syn::value_list, - int64 num_results, int64 num_parts, - ReductionOp op, FinalizeOp finalize, - ValueType init, const ValueType* input, - int64 result_stride, ValueType* result) +template +void run_generic_col_reduction_small(syn::value_list, + int64 max_blocks, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + MappedKernelArgs... args) { - constexpr auto block_size = default_block_size; - const auto num_blocks = ceildiv(num_results * subwarp_size, block_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(generic_kernel_reduction_finalize_2d), - num_blocks, block_size, 0, 0, num_results, num_parts, op, finalize, - as_hip_type(init), as_hip_type(input), - static_cast(result_stride), as_hip_type(result)); + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = std::min( + ceildiv(rows * subwarp_size, default_block_size), max_blocks); + if (num_blocks <= 1) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + generic_kernel_col_reduction_2d_small), + 1, default_block_size, 0, 0, rows, cols, fn, op, finalize, + as_hip_type(init), as_hip_type(result), args...); + } else { + Array tmp_storage{exec, + static_cast(num_blocks * cols)}; + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + generic_kernel_col_reduction_2d_small), + num_blocks, default_block_size, 0, 0, rows, cols, fn, op, + [] __device__(auto v) { return v; }, as_hip_type(init), + as_hip_type(tmp_storage.get_data()), args...); + hipLaunchKernelGGL( + generic_kernel_reduction_finalize_2d, + ceildiv(cols, default_block_size), default_block_size, 0, 0, cols, + num_blocks, op, finalize, as_hip_type(init), + as_hip_type(tmp_storage.get_const_data()), 1, as_hip_type(result)); + } } -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_finalize, - run_kernel_reduction_finalize) +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); } // namespace @@ -335,33 +444,29 @@ void run_kernel_row_reduction(std::shared_ptr exec, { using subwarp_sizes = syn::value_list; - constexpr int oversubscription = 4; + constexpr int oversubscription = 16; gko::hip::device_guard guard{exec->get_device_id()}; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto resources = exec->get_num_warps() * oversubscription; - const auto col_parts = 1; // TODO tune - if (col_parts > 1) { + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); Array partial{exec, - static_cast(col_parts * rows)}; - select_run_generic_kernel_row_reduction( - subwarp_sizes(), - [&](int compiled_subwarp_size) { 
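/* [Editorial note -- illustrative, not part of the patch; applies to
 * both the removed and the retained dispatch below.] The selection
 * predicate picks the smallest *compiled* subwarp size that still
 * covers all columns, falling back to a full warp: with
 * subwarp_sizes = 1, 2, 4, 8, 16, 32 and cols = 20, the first size
 * satisfying compiled_subwarp_size >= cols is 32, so each row is
 * reduced by a 32-thread subwarp; for cols = 1000 no compiled size
 * covers the row, and the compiled_subwarp_size == config::warp_size
 * fallback selects a full warp that strides over the columns instead. */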
- return compiled_subwarp_size >= cols || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), rows, cols, col_parts, - fn, op, [] __device__(auto i) { return i; }, init, - partial.get_data(), 1, map_to_device(args)...); - select_run_kernel_reduction_finalize( - subwarp_sizes(), - [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= col_parts || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), rows, col_parts, op, - finalize, init, partial.get_const_data(), - static_cast(result_stride), result); + static_cast(col_blocks * rows)}; + const auto num_blocks = + ceildiv(rows * col_blocks * config::warp_size, default_block_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), + num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, + op, [] __device__(auto v) { return v; }, as_hip_type(init), + as_hip_type(partial.get_data()), 1, map_to_device(args)...); + const auto num_finalize_blocks = ceildiv(rows, default_block_size); + hipLaunchKernelGGL( + generic_kernel_reduction_finalize_2d, num_finalize_blocks, + default_block_size, 0, 0, rows, col_blocks, op, finalize, + as_hip_type(init), as_hip_type(partial.get_const_data()), + static_cast(result_stride), as_hip_type(result)); } else { select_run_generic_kernel_row_reduction( subwarp_sizes(), @@ -384,39 +489,51 @@ void run_kernel_col_reduction(std::shared_ptr exec, ValueType* result, dim<2> size, KernelArgs&&... args) { - constexpr int oversubscription = 4; + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; gko::hip::device_guard guard{exec->get_device_id()}; - constexpr auto block_size = default_block_size; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto resources = - exec->get_num_warps() * config::warp_size * oversubscription; - const auto num_blocks = ceildiv(rows * cols, block_size); - const auto row_parts = 1; // TODO tune - if (row_parts > 1) { - Array partial{exec, - static_cast(row_parts * cols)}; - hipLaunchKernelGGL( - generic_kernel_col_reduction_2d, num_blocks, block_size, 0, 0, rows, - cols, row_parts, fn, op, [] __device__(auto i) { return i; }, - as_hip_type(init), as_hip_type(partial.get_data()), - map_to_device(args)...); - using subwarp_sizes = - syn::value_list; - select_run_kernel_reduction_finalize( + const auto max_blocks = exec->get_num_warps() * config::warp_size * + oversubscription / default_block_size; + if (cols <= config::warp_size) { + select_generic_col_reduction_small( subwarp_sizes(), [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= row_parts || + return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), cols, row_parts, op, - finalize, as_hip_type(init), as_hip_type(partial.get_const_data()), - 1, as_hip_type(result)); + syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, + op, finalize, init, result, size, map_to_device(args)...); } else { - hipLaunchKernelGGL(generic_kernel_col_reduction_2d, num_blocks, - block_size, 0, 0, rows, cols, 1, fn, op, finalize, - as_hip_type(init), as_hip_type(result), - map_to_device(args)...); + const auto col_blocks = ceildiv(cols, config::warp_size); + const auto row_blocks = + ceildiv(std::min( + ceildiv(rows * config::warp_size, default_block_size), + max_blocks), + col_blocks); + if (row_blocks <= 1) { + hipLaunchKernelGGL(generic_kernel_col_reduction_2d_blocked, + dim3(1, 
col_blocks), default_block_size, 0, 0, + rows, cols, fn, op, finalize, as_hip_type(init), + as_hip_type(result), map_to_device(args)...); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + hipLaunchKernelGGL( + generic_kernel_col_reduction_2d_blocked, + dim3(row_blocks, col_blocks), default_block_size, 0, 0, rows, + cols, fn, op, [] __device__(auto v) { return v; }, + as_hip_type(init), as_hip_type(tmp_storage.get_data()), + map_to_device(args)...); + hipLaunchKernelGGL(generic_kernel_reduction_finalize_2d, + ceildiv(cols, default_block_size), + default_block_size, 0, 0, cols, row_blocks, op, + finalize, as_hip_type(init), + as_hip_type(tmp_storage.get_const_data()), 1, + as_hip_type(result)); + } } } diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 56ed5c327b9..d4c815c9539 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -120,144 +120,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (hipblas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipblas::dot(exec->get_hipblas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::compute_partial_dot), - dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], - as_hip_type(x->get_const_values() + col), x->get_stride(), - as_hip_type(y->get_const_values() + col), y->get_stride(), - as_hip_type(work.get_data())); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::finalize_sum_reduce_computation), - dim3(1), dim3(block_dim), 0, 0, grid_dim.x, - as_hip_type(work.get_const_data()), - as_hip_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (hipblas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipblas::conj_dot(exec->get_hipblas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 
grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::compute_partial_conj_dot), - dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], - as_hip_type(x->get_const_values() + col), x->get_stride(), - as_hip_type(y->get_const_values() + col), y->get_stride(), - as_hip_type(work.get_data())); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::finalize_sum_reduce_computation), - dim3(1), dim3(block_dim), 0, 0, grid_dim.x, - as_hip_type(work.get_const_data()), - as_hip_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if (hipblas::is_supported::value) { - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipblas::norm2(exec->get_hipblas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); - } - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::compute_partial_norm2), - dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], - as_hip_type(x->get_const_values() + col), x->get_stride(), - as_hip_type(work.get_data())); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::finalize_sqrt_reduce_computation), - dim3(1), dim3(block_dim), 0, 0, grid_dim.x, - as_hip_type(work.get_const_data()), - as_hip_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, From 69068184070ad769fb1fffc17852a8fe123229b0 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 11 Aug 2021 10:00:36 +0200 Subject: [PATCH 14/25] fix overflows in reduction tests --- cuda/test/base/kernel_launch.cu | 5 +++-- omp/test/base/kernel_launch.cpp | 9 ++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index 2df6ee4ade7..1d43293d553 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -380,7 +380,7 @@ void run2d_row_reduction(std::shared_ptr exec) gko::Array output{exec, host_ref}; for (int64 i = 0; i < num_rows; i++) { host_ref.get_data()[2 * i] = - num_cols * (num_cols + 1) * (i + 1); + static_cast(num_cols) * (num_cols + 1) * (i + 1); } gko::kernels::cuda::run_kernel_row_reduction( @@ -427,7 +427,8 @@ void run2d_col_reduction(std::shared_ptr exec) static_cast(num_cols)}; gko::Array output{exec, static_cast(num_cols)}; for (int64 i = 0; i < num_cols; i++) { - host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 
1); + host_ref.get_data()[i] = + static_cast(num_rows) * (num_rows + 1) * (i + 1); } gko::kernels::cuda::run_kernel_col_reduction( diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp index 01c39514cdb..a615c452f64 100644 --- a/omp/test/base/kernel_launch.cpp +++ b/omp/test/base/kernel_launch.cpp @@ -344,7 +344,8 @@ TEST_F(KernelLaunch, ReductionRow2DSmall) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int i = 0; i < num_rows; i++) { - host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1); + host_ref.get_data()[2 * i] = + static_cast(num_cols) * (num_cols + 1) * (i + 1); } gko::kernels::omp::run_kernel_row_reduction( @@ -373,7 +374,8 @@ TEST_F(KernelLaunch, ReductionRow2D) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int i = 0; i < num_rows; i++) { - host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1); + host_ref.get_data()[2 * i] = + static_cast(num_cols) * (num_cols + 1) * (i + 1); } gko::kernels::omp::run_kernel_row_reduction( @@ -402,7 +404,8 @@ TEST_F(KernelLaunch, ReductionCol2D) static_cast(num_cols)}; gko::Array output{exec, static_cast(num_cols)}; for (int i = 0; i < num_cols; i++) { - host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1); + host_ref.get_data()[i] = + static_cast(num_rows) * (num_rows + 1) * (i + 1); } gko::kernels::omp::run_kernel_col_reduction( From ddbe7fd9c2c6c169d32dbc42a2034d095168d8b1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 11 Aug 2021 10:03:10 +0200 Subject: [PATCH 15/25] avoid duplicate writes in reductions --- cuda/base/kernel_launch_reduction.cuh | 4 +++- hip/base/kernel_launch_reduction.hip.hpp | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index a857e0d5035..49a6ca95f87 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -227,7 +227,9 @@ __global__ partial = op(partial, fn(row, col, args...)); } partial = reduce(subwarp, partial, op); - result[(row + col_block * rows) * result_stride] = finalize(partial); + if (subwarp.thread_rank() == 0) { + result[(row + col_block * rows) * result_stride] = finalize(partial); + } } diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 7a875491899..aa3f3384ca6 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -232,7 +232,9 @@ __global__ partial = op(partial, fn(row, col, args...)); } partial = reduce(subwarp, partial, op); - result[(row + col_block * rows) * result_stride] = finalize(partial); + if (subwarp.thread_rank() == 0) { + result[(row + col_block * rows) * result_stride] = finalize(partial); + } } From 7103627befece41b0580100d0c5a356f985ab6c5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 11 Aug 2021 10:03:44 +0200 Subject: [PATCH 16/25] add DPC++ reduction kernels --- dpcpp/base/kernel_launch_reduction.dp.hpp | 478 +++++++++++++++++++++- dpcpp/matrix/dense_kernels.dp.cpp | 416 ------------------- dpcpp/test/base/kernel_launch.dp.cpp | 70 ++++ 3 files changed, 526 insertions(+), 438 deletions(-) diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index cbf3e3d7158..ca82a897269 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -30,12 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, 
STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ #error \ "This file can only be used from inside common/base/kernel_launch_reduction.hpp" #endif +#include + + #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" @@ -60,14 +63,14 @@ constexpr auto kcfg_1d_list_simple_reduction = static_cast(KCFG_1D::encode(256, 8))>(); -template +template void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, int64 num_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* storage, - KernelArgs... args) + MappedKernelArgs... args) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -109,14 +112,14 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, } -template +template void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, int64 num_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* storage, - KernelArgs... args) + MappedKernelArgs... args) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -161,13 +164,14 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, template + typename ReductionOp, typename FinalizeOp, + typename... MappedKernelArgs> void run_kernel_reduction_impl(syn::value_list, std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, size_type size, - KernelArgs... args) + MappedKernelArgs... args) { constexpr auto cfg = static_cast(icfg); constexpr int oversubscription = 4; @@ -180,34 +184,35 @@ void run_kernel_reduction_impl(syn::value_list, if (num_workgroups > 1) { Array partial{exec, static_cast(num_workgroups)}; queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( + generic_kernel_reduction_1d( cgh, static_cast(size), num_workgroups, fn, op, [](auto v) { return v; }, init, partial.get_data(), args...); }); queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( + generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, [](auto i, auto v) { return v[i]; }, op, finalize, init, result, partial.get_const_data()); }); } else { queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d(cgh, static_cast(size), - num_workgroups, fn, op, finalize, init, - result, args...); + generic_kernel_reduction_1d(cgh, static_cast(size), + num_workgroups, fn, op, finalize, + init, result, args...); }); } } template + typename ReductionOp, typename FinalizeOp, + typename... MappedKernelArgs> void run_kernel_reduction_impl(syn::value_list, std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, dim<2> size, - KernelArgs... args) + MappedKernelArgs... 
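/* [Editorial sketch -- not part of the patch; hypothetical values.] The
 * 1D launcher above illustrates the general two-pass scheme: pass 1
 * reduces the input to one partial value per workgroup using the
 * *identity* as finalize (finalize must be applied exactly once, at the
 * very end), pass 2 reduces those partials with the real finalize. A
 * host-side model, assuming a sum reduction with finalize(x) = 2 * x:
 *
 *   #include <cassert>
 *   #include <numeric>
 *   #include <vector>
 *
 *   int main()
 *   {
 *       const std::vector<int> input{1, 2, 3, 4, 5, 6, 7, 8};
 *       const int num_workgroups = 2;  // pretend each handles 4 elements
 *       std::vector<int> partial(num_workgroups);
 *       for (int wg = 0; wg < num_workgroups; wg++) {
 *           // pass 1: partial reduction, identity finalize
 *           partial[wg] = std::accumulate(input.begin() + 4 * wg,
 *                                         input.begin() + 4 * (wg + 1), 0);
 *       }
 *       // pass 2: reduce the partials and apply finalize exactly once
 *       const int result =
 *           2 * std::accumulate(partial.begin(), partial.end(), 0);
 *       assert(result == 72);  // 2 * (1 + 2 + ... + 8)
 *   }
 */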
args) { constexpr auto cfg = static_cast(icfg); constexpr int oversubscription = 4; @@ -223,20 +228,21 @@ void run_kernel_reduction_impl(syn::value_list, if (num_workgroups > 1) { Array partial{exec, static_cast(num_workgroups)}; queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_2d( + generic_kernel_reduction_2d( cgh, rows, cols, num_workgroups, fn, op, [](auto v) { return v; }, init, partial.get_data(), args...); }); queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( + generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, [](auto i, auto v) { return v[i]; }, op, finalize, init, result, partial.get_const_data()); }); } else { queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_2d(cgh, rows, cols, num_workgroups, fn, op, - finalize, init, result, args...); + generic_kernel_reduction_2d(cgh, rows, cols, num_workgroups, + fn, op, finalize, init, result, + args...); }); } } @@ -286,6 +292,434 @@ void run_kernel_reduction(std::shared_ptr exec, } +namespace { + + +template +void generic_kernel_row_reduction_2d(syn::value_list, + std::shared_ptr exec, + int64 rows, int64 cols, int64 col_blocks, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, int64 result_stride, + MappedKernelArgs... args) +{ + constexpr auto wg_size = + KCFG_1D::decode<0>(static_cast(icfg)); + constexpr auto sg_size = + KCFG_1D::decode<1>(static_cast(icfg)); + static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); + const auto num_workgroups = ceildiv(rows * col_blocks * ssg_size, wg_size); + const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] { + const auto idx = + thread::get_subwarp_id_flat(id); + const auto row = idx % rows; + const auto col_block = idx / rows; + auto partial = init; + auto subgroup = group::tiled_partition( + group::this_thread_block(id)); + auto ssg_rank = + static_cast(subgroup.thread_rank() % ssg_size); + if (col_block < col_blocks) { + const auto cols_per_part = + ceildiv(ceildiv(cols, ssg_size), col_blocks) * ssg_size; + const auto begin = cols_per_part * col_block; + const auto end = min(begin + cols_per_part, cols); + for (auto col = begin + ssg_rank; col < end; + col += ssg_size) { + partial = op(partial, fn(row, col, args...)); + } + } +// since we do a sub-subgroup reduction, we can't use reduce +#pragma unroll + for (int i = 1; i < ssg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + if (col_block < col_blocks && ssg_rank == 0) { + result[(row + col_block * rows) * result_stride] = + finalize(partial); + } + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_kernel_row_reduction_2d, + generic_kernel_row_reduction_2d); + + +template +void generic_kernel_col_reduction_2d_small(sycl::handler& cgh, int64 rows, + int64 cols, int64 row_blocks, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, + MappedKernelArgs... 
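/* [Editorial note -- illustrative, not part of the patch.] The
 * cols_per_part computation in the row-reduction kernel above resolves
 * the old "TODO use boundaries divisible by subwarp_size" from the
 * CUDA/HIP kernels: rounding the per-block column count up to a
 * multiple of ssg_size keeps every block boundary aligned, so no
 * sub-subgroup ever straddles two column blocks. For example, with
 * cols = 100, ssg_size = 8 and col_blocks = 3:
 *   ceildiv(100, 8) = 13 eight-column slices,
 *   ceildiv(13, 3)  = 5 slices per block,
 *   cols_per_part   = 5 * 8 = 40,
 * giving column ranges [0, 40), [40, 80) and [80, 100) -- the first two
 * aligned to 8, the last clipped by min(begin + cols_per_part, cols). */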
args) +{ + constexpr auto wg_size = + KCFG_1D::decode<0>(static_cast(icfg)); + constexpr auto sg_size = + KCFG_1D::decode<1>(static_cast(icfg)); + static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); + constexpr auto subgroups_per_workgroup = wg_size / sg_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = subgroups_per_workgroup * ssg_size; + sycl::accessor, 1, + sycl::access_mode::read_write, sycl::access::target::local> + block_partial_acc(sycl::range<1>{1}, cgh); + const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size)); + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] { + auto block_partial = &block_partial_acc[0][0]; + const auto ssg_id = + thread::get_subwarp_id_flat(id); + const auto local_sg_id = id.get_local_id(2) / sg_size; + const auto local_ssg_id = id.get_local_id(2) % sg_size / ssg_size; + const auto ssg_num = + thread::get_subwarp_num_flat(id); + const auto workgroup = group::this_thread_block(id); + // TODO remove + if (id.get_local_id(2) < shared_storage) { + block_partial[id.get_local_id(2)] = init; + } + workgroup.sync(); + // TODO end + const auto subgroup = group::tiled_partition(workgroup); + const auto sg_rank = subgroup.thread_rank(); + const auto ssg_rank = sg_rank % ssg_size; + const auto col = static_cast(ssg_rank); + auto partial = init; + // accumulate within a thread + if (col < cols) { + for (auto row = ssg_id; row < rows; row += ssg_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subsubgroups in the subgroup +#pragma unroll + for (unsigned i = ssg_size; i < sg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + // store the result to shared memory + if (local_ssg_id == 0) { + block_partial[local_sg_id * ssg_size + ssg_rank] = partial; + } + workgroup.sync(); + // in a single thread: accumulate the results + if (local_sg_id == 0) { + partial = init; + // accumulate the partial results within a thread + if (shared_storage >= sg_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += sg_size) { + partial = op(partial, block_partial[i + sg_rank]); + } + } else if (sg_rank < shared_storage) { + partial = op(partial, block_partial[sg_rank]); + } + // accumulate between all subsubgroups in the subgroup +#pragma unroll + for (unsigned i = ssg_size; i < sg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + if (sg_rank < cols) { + result[sg_rank + id.get_group(2) * cols] = + finalize(partial); + } + } + }); +} + + +template +void generic_kernel_col_reduction_2d_blocked( + sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, + int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, + ValueType init, ValueType* result, MappedKernelArgs... 
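/* [Editorial sketch -- not part of the patch; hypothetical sizes.] The
 * shfl_xor loop above is a butterfly reduction across sub-subgroups:
 * starting at stride ssg_size and doubling, it only flips lane-id bits
 * at or above log2(ssg_size), so lanes holding the same column (same
 * lane id modulo ssg_size) are combined while distinct columns never
 * mix. A host-side model of the step sequence, assuming a sum over
 * sg_size = 8 lanes with ssg_size = 2:
 *
 *   #include <array>
 *   #include <cassert>
 *
 *   int main()
 *   {
 *       std::array<int, 8> lane{0, 1, 2, 3, 4, 5, 6, 7};
 *       for (unsigned stride = 2; stride < 8; stride *= 2) {
 *           std::array<int, 8> next{};
 *           for (unsigned id = 0; id < 8; id++) {
 *               // every lane reads its XOR partner, like shfl_xor
 *               next[id] = lane[id] + lane[id ^ stride];
 *           }
 *           lane = next;
 *       }
 *       // even lanes hold the sum of all even entries, odd lanes of all odd
 *       assert(lane[0] == 0 + 2 + 4 + 6 && lane[1] == 1 + 3 + 5 + 7);
 *   }
 */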
args) +{ + constexpr auto cfg = static_cast(icfg); + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto range = + sycl_nd_range(dim3(row_blocks, col_blocks), dim3(wg_size)); + sycl::accessor, 1, + sycl::access_mode::read_write, sycl::access::target::local> + block_partial_acc(sycl::range<1>{1}, cgh); + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] { + const auto sg_id = thread::get_subwarp_id_flat(id); + const auto sg_num = + thread::get_subwarp_num_flat(id); + const auto workgroup = group::this_thread_block(id); + const auto subgroup = group::tiled_partition(workgroup); + const auto sg_rank = subgroup.thread_rank(); + const auto col = + sg_rank + static_cast(id.get_group(1)) * sg_size; + auto block_partial = &block_partial_acc[0][0]; + auto partial = init; + // accumulate within a thread + if (col < cols) { + for (auto row = sg_id; row < rows; row += sg_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[id.get_local_id(2)] = partial; + workgroup.sync(); + // in a single warp: accumulate the results + if (id.get_local_id(2) < sg_size) { + partial = init; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < wg_size; i += sg_size) { + partial = op(partial, block_partial[i + sg_rank]); + } + if (col < cols) { + result[col + id.get_group(2) * cols] = finalize(partial); + } + } + }); +} + + +template +void generic_kernel_reduction_finalize_2d(sycl::handler& cgh, int64 num_results, + int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType init, + const ValueType* input, + int64 result_stride, + ValueType* result) +{ + cgh.parallel_for(sycl::range<1>{static_cast(num_results)}, + [=](sycl::id<1> id) { + auto partial = init; + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, + input[id[0] + block * num_results]); + } + result[id[0] * result_stride] = finalize(partial); + }); +} + + +template +void run_generic_col_reduction_small(syn::value_list, + std::shared_ptr exec, + int64 max_workgroups, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, + ValueType init, ValueType* result, + dim<2> size, MappedKernelArgs... 
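/* [Editorial sketch -- not part of the patch; hypothetical sizes.] In
 * the blocked kernel above, every thread stages its partial in shared
 * memory and the first subgroup then folds the wg_size entries in
 * strides of sg_size, so lane r accumulates block_partial[r],
 * block_partial[r + sg_size], ... -- exactly the partials produced for
 * its own column by the other subgroups. A host-side model, assuming
 * wg_size = 8, sg_size = 2 and a sum reduction:
 *
 *   #include <array>
 *   #include <cassert>
 *
 *   int main()
 *   {
 *       constexpr int wg_size = 8, sg_size = 2;
 *       // partials staged by 4 subgroups of 2 lanes each
 *       const std::array<int, wg_size> block_partial{1, 2, 3, 4, 5, 6, 7, 8};
 *       for (int rank = 0; rank < sg_size; rank++) {
 *           int partial = 0;
 *           for (int i = 0; i < wg_size; i += sg_size) {
 *               partial += block_partial[i + rank];
 *           }
 *           assert(partial == (rank == 0 ? 1 + 3 + 5 + 7 : 2 + 4 + 6 + 8));
 *       }
 *   }
 */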
args) +{ + constexpr auto wg_size = + KCFG_1D::decode<0>(static_cast(icfg)); + constexpr auto sg_size = + KCFG_1D::decode<1>(static_cast(icfg)); + static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto row_blocks = + std::min(ceildiv(rows * ssg_size, wg_size), max_workgroups); + auto queue = exec->get_queue(); + if (row_blocks <= 1) { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_small( + cgh, rows, cols, 1, fn, op, finalize, init, result, args...); + }); + } else { + Array tmp_storage{exec, + static_cast(row_blocks * cols)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_small( + cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; }, + init, tmp_storage.get_data(), args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, cols, row_blocks, op, finalize, init, + tmp_storage.get_const_data(), 1, result); + }); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); + + +template +void run_kernel_row_reduction_stage1(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type result_stride, + dim<2> size, MappedKernelArgs... args) +{ + constexpr auto wg_size = + KCFG_1D::decode<0>(static_cast(icfg)); + constexpr auto sg_size = + KCFG_1D::decode<1>(static_cast(icfg)); + using subsubgroup_sizes = + syn::value_list(16, sg_size), + std::min(32, sg_size), sg_size>; + constexpr int oversubscription = 16; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_computing_units() * sg_size * oversubscription; + auto queue = exec->get_queue(); + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); + Array partial{exec, + static_cast(col_blocks * rows)}; + generic_kernel_row_reduction_2d( + syn::value_list{}, exec, rows, cols, col_blocks, fn, + op, [](auto v) { return v; }, init, partial.get_data(), 1, args...); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, rows, col_blocks, op, finalize, init, + partial.get_const_data(), static_cast(result_stride), + result); + }); + } else { + select_generic_kernel_row_reduction_2d( + subsubgroup_sizes(), + [&](int compiled_ssg_size) { + return compiled_ssg_size >= cols || + compiled_ssg_size == sg_size; + }, + syn::value_list(), syn::type_list<>(), exec, rows, cols, + 1, fn, op, finalize, init, result, + static_cast(result_stride), args...); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_kernel_row_reduction_stage1, + run_kernel_row_reduction_stage1); + + +template +void run_kernel_col_reduction_stage1(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + MappedKernelArgs... 
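/* [Editorial note -- illustrative, not part of the patch; the device
 * numbers are made up.] The branch in run_kernel_row_reduction_stage1
 * above targets the awkward "few long rows" shape: splitting columns
 * into col_blocks only pays off when the input is large enough to fill
 * the oversubscribed device (rows * cols > resources) *and* there are
 * fewer rows than columns, so one subgroup per row alone would leave
 * most of the device idle. Plugging in round numbers: with 64 computing
 * units, sg_size = 32 and oversubscription = 16, resources = 32768; a
 * 10 x 100000 input then uses
 * col_blocks = ceildiv(10 * 100000, 32768) = 31 partial results per
 * row, which generic_kernel_reduction_finalize_2d folds afterwards,
 * while a tall 100000 x 10 input takes the one-pass else-branch. */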
args) +{ + constexpr auto wg_size = + KCFG_1D::decode<0>(static_cast(icfg)); + constexpr auto sg_size = + KCFG_1D::decode<1>(static_cast(icfg)); + using subsubgroup_sizes = + syn::value_list(16, sg_size), + std::min(32, sg_size), sg_size>; + constexpr int oversubscription = 16; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto max_blocks = + exec->get_num_computing_units() * sg_size * oversubscription / wg_size; + if (cols <= sg_size) { + select_generic_col_reduction_small( + subsubgroup_sizes(), + [&](int compiled_ssg_size) { + return compiled_ssg_size >= cols || + compiled_ssg_size == sg_size; + }, + syn::value_list(), syn::type_list<>(), exec, max_blocks, + fn, op, finalize, init, result, size, args...); + } else { + const auto col_blocks = ceildiv(cols, sg_size); + const auto row_blocks = ceildiv( + std::min(ceildiv(rows * sg_size, wg_size), max_blocks), + col_blocks); + auto queue = exec->get_queue(); + if (row_blocks <= 1) { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_blocked( + cgh, rows, cols, 1, col_blocks, fn, op, finalize, init, + result, args...); + }); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_blocked( + cgh, rows, cols, row_blocks, col_blocks, fn, op, + [](auto v) { return v; }, init, tmp_storage.get_data(), + args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, cols, row_blocks, op, finalize, init, + tmp_storage.get_const_data(), 1, result); + }); + } + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_kernel_col_reduction_stage1, + run_kernel_col_reduction_stage1); + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + const auto desired_icfg = static_cast(get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + })); + select_kernel_row_reduction_stage1( + kcfg_1d_list_simple_reduction, + [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), + syn::type_list<>(), exec, fn, op, finalize, init, result, result_stride, + size, map_to_device(args)...); +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + KernelArgs&&... 
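/* [Editorial note -- illustrative, not part of the patch; example sizes
 * are hypothetical.] The column dispatch above mirrors the CUDA/HIP
 * logic: narrow outputs (cols <= sg_size) take the "small" kernel,
 * where a sub-subgroup per group of rows keeps all lanes busy; wider
 * outputs get one subgroup per sg_size-wide column block. With
 * sg_size = 32, wg_size = 512, max_blocks = 1024, rows = 10000 and
 * cols = 100:
 *   col_blocks = ceildiv(100, 32) = 4,
 *   row_blocks = ceildiv(min(ceildiv(10000 * 32, 512), 1024), 4)
 *              = ceildiv(min(625, 1024), 4) = 157,
 * so a 157 x 4 grid of workgroups writes 157 partials per column into
 * tmp_storage, and generic_kernel_reduction_finalize_2d folds them
 * into the final result. */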
args) +{ + const auto desired_icfg = static_cast(get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + })); + select_kernel_col_reduction_stage1( + kcfg_1d_list_simple_reduction, + [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), + syn::type_list<>(), exec, fn, op, finalize, init, result, size, + map_to_device(args)...); +} + + } // namespace dpcpp } // namespace kernels } // namespace gko diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 9a86ab9cd15..7873a687e4b 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -84,284 +84,6 @@ constexpr int default_block_size = 256; namespace kernel { -template -void compute_partial_reduce( - size_type num_rows, OutType* __restrict__ work, CallableGetValue get_value, - CallableReduce reduce_op, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - - constexpr auto warps_per_block = wg_size / sg_size; - - const auto num_blocks = item_ct1.get_group_range(2); - const auto local_id = thread::get_local_thread_id(item_ct1); - const auto global_id = - thread::get_thread_id(item_ct1); - - OutType* tmp_work_array = tmp_work; - auto tmp = zero(); - for (auto i = global_id; i < num_rows; i += wg_size * num_blocks) { - tmp = reduce_op(tmp, get_value(i)); - } - - tmp_work_array[local_id] = tmp; - - ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, reduce_op); - - if (local_id == 0) { - work[thread::get_block_id(item_ct1)] = tmp_work_array[0]; - } -} - - -template -void finalize_reduce_computation( - size_type size, const ValueType* work, ValueType* result, - CallableReduce reduce_op, CallableFinalize finalize_op, - sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto local_id = thread::get_local_thread_id(item_ct1); - - ValueType tmp = zero(); - for (auto i = local_id; i < size; i += wg_size) { - tmp = reduce_op(tmp, work[i]); - } - ValueType* tmp_work_array = tmp_work; - tmp_work_array[local_id] = tmp; - - ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, reduce_op); - - if (local_id == 0) { - *result = finalize_op(tmp_work_array[0]); - } -} - - -template -void compute_partial_dot( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - const ValueType* __restrict__ y, size_type stride_y, - ValueType* __restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - compute_partial_reduce( - num_rows, work, - [x, stride_x, y, stride_y](size_type i) { - return x[i * stride_x] * y[i * stride_y]; - }, - [](const ValueType& x, const ValueType& y) { return x + y; }, item_ct1, - tmp_work); -} - -template -void compute_partial_dot(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, - const ValueType* x, size_type stride_x, - const ValueType* y, size_type stride_y, - ValueType* work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), 
[=](sycl::nd_item<3> item_ct1) { - compute_partial_dot(num_rows, x, stride_x, y, stride_y, - work, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_dot, - compute_partial_dot) -GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, - kcfg_1d_list) - - -template -void compute_partial_conj_dot( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - const ValueType* __restrict__ y, size_type stride_y, - ValueType* __restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - compute_partial_reduce( - num_rows, work, - [x, stride_x, y, stride_y](size_type i) { - return conj(x[i * stride_x]) * y[i * stride_y]; - }, - [](const ValueType& x, const ValueType& y) { return x + y; }, item_ct1, - tmp_work); -} - -template -void compute_partial_conj_dot(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, - const ValueType* x, size_type stride_x, - const ValueType* y, size_type stride_y, - ValueType* work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_conj_dot(num_rows, x, stride_x, y, - stride_y, work, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_conj_dot, - compute_partial_conj_dot) -GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_conj_dot_call, - compute_partial_conj_dot, kcfg_1d_list) - - -template -void finalize_sum_reduce_computation( - size_type size, const ValueType* work, ValueType* result, - sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - finalize_reduce_computation( - size, work, result, - [](const ValueType& x, const ValueType& y) { return x + y; }, - [](const ValueType& x) { return x; }, item_ct1, tmp_work); -} - -template -void finalize_sum_reduce_computation(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, size_type size, - const ValueType* work, ValueType* result) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - finalize_sum_reduce_computation( - size, work, result, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sum_reduce_computation, - finalize_sum_reduce_computation) -GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sum_reduce_computation_call, - finalize_sum_reduce_computation, kcfg_1d_list) - - -template -void compute_partial_norm2( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - remove_complex* __restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray, KCFG_1D::decode<0>(cfg)>& - tmp_work) -{ - using norm_type = remove_complex; - compute_partial_reduce( - num_rows, work, - [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); }, - [](const norm_type& x, const norm_type& y) { return x + y; }, item_ct1, - tmp_work); -} - -template -void compute_partial_norm2(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, 
const ValueType* x, - size_type stride_x, remove_complex* work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, wg_size>, - 0, sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_norm2(num_rows, x, stride_x, work, - item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_norm2, - compute_partial_norm2) -GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, - compute_partial_norm2, kcfg_1d_list) - - -template -void finalize_sqrt_reduce_computation( - size_type size, const ValueType* work, ValueType* result, - sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - finalize_reduce_computation( - size, work, result, - [](const ValueType& x, const ValueType& y) { return x + y; }, - [](const ValueType& x) { return std::sqrt(x); }, item_ct1, tmp_work); -} - -template -void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, size_type size, - const ValueType* work, ValueType* result) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - finalize_sqrt_reduce_computation( - size, work, result, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sqrt_reduce_computation, - finalize_sqrt_reduce_computation) -GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sqrt_reduce_computation_call, - finalize_sqrt_reduce_computation, kcfg_1d_list) - - template void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, const size_type* __restrict__ row_ptrs, @@ -812,144 +534,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1) { - // TODO: write a custom kernel which does this more efficiently - onemkl::dot(*exec->get_queue(), x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), y->get_stride(), - result->get_values()); - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - const auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_dot_call( - cfg, grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() 
+ col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_sum_reduce_computation_call( - cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1) { - // TODO: write a custom kernel which does this more efficiently - onemkl::conj_dot(*exec->get_queue(), x->get_size()[0], - x->get_const_values(), x->get_stride(), - y->get_const_values(), y->get_stride(), - result->get_values()); - - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_conj_dot_call( - cfg, grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_sum_reduce_computation_call( - cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if (x->get_size()[1] == 1) { - oneapi::mkl::blas::row_major::nrm2( - *exec->get_queue(), x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_norm2_call( - cfg, grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() + col, x->get_stride(), - work.get_data()); - kernel::finalize_sqrt_reduce_computation_call( - 
cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, diff --git a/dpcpp/test/base/kernel_launch.dp.cpp b/dpcpp/test/base/kernel_launch.dp.cpp index 592ce9b934c..8a70d9bc1cb 100644 --- a/dpcpp/test/base/kernel_launch.dp.cpp +++ b/dpcpp/test/base/kernel_launch.dp.cpp @@ -349,4 +349,74 @@ TEST_F(KernelLaunch, Reduction2D) } +TEST_F(KernelLaunch, ReductionRow2D) +{ + for (auto num_rows : {0, 1, 10, 100, 1000, 10000}) { + for (auto num_cols : {0, 1, 10, 100, 1000, 10000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, host_ref}; + for (int i = 0; i < num_rows; i++) { + host_ref.get_data()[2 * i] = + static_cast(num_cols) * (num_cols + 1) * (i + 1); + } + + gko::kernels::dpcpp::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return 2 * j; }, int64{}, + output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } +} + + +TEST_F(KernelLaunch, ReductionCol2D) +{ + for (int num_rows : {0, 1, 10, 100, 1000, 10000}) { + for (int num_cols : + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 100, 1000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int i = 0; i < num_cols; i++) { + host_ref.get_data()[i] = + static_cast(num_rows) * (num_rows + 1) * (i + 1); + } + + gko::kernels::dpcpp::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, + output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } +} + + } // namespace From 7eec0c51aeb1e7d260519362eab49dc2037297e9 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 11 Aug 2021 10:42:55 +0200 Subject: [PATCH 17/25] fix include guards for kernel launch reduction --- common/unified/base/kernel_launch_reduction.hpp | 2 +- cuda/base/kernel_launch_reduction.cuh | 2 +- hip/base/kernel_launch_reduction.hip.hpp | 2 +- omp/base/kernel_launch_reduction.hpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/common/unified/base/kernel_launch_reduction.hpp b/common/unified/base/kernel_launch_reduction.hpp index 78de06466aa..9eb65216416 100644 --- a/common/unified/base/kernel_launch_reduction.hpp +++ b/common/unified/base/kernel_launch_reduction.hpp @@ -48,4 +48,4 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#endif // GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#endif // GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index 49a6ca95f87..a083a92eac5 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ #error \ "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" #endif diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index aa3f3384ca6..502a87cc3fd 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ #error \ "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" #endif diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index 84758549918..bcab938449f 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ #error \ "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" #endif From 53ba972021dace92fae73063650f747d7e549be2 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 11 Aug 2021 10:43:13 +0200 Subject: [PATCH 18/25] run header formatting on common kernels --- .github/bot-pr-format-base.sh | 4 ++-- dev_tools/scripts/config | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/bot-pr-format-base.sh b/.github/bot-pr-format-base.sh index e72539a9d61..10be68353b9 100644 --- a/.github/bot-pr-format-base.sh +++ b/.github/bot-pr-format-base.sh @@ -3,8 +3,8 @@ source .github/bot-pr-base.sh EXTENSION_REGEX='\.(cuh?|hpp|hpp\.inc?|cpp)$' -FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp)/' -FORMAT_REGEX='^(common|examples|test)/' +FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp|common/unified)/' +FORMAT_REGEX='^(common/cuda_hip|examples|test)/' echo "Retrieving PR file list" PR_FILES=$(bot_get_all_changed_files ${PR_URL}) diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config index 696d886d489..a8b0cb3841a 100644 --- a/dev_tools/scripts/config +++ b/dev_tools/scripts/config @@ -30,6 +30,9 @@ - FixInclude: "common/unified/base/kernel_launch.hpp" - "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\." 
- FixInclude: "common/unified/base/kernel_launch_solver.hpp" +- "common/unified/.*.cpp" + - PathIgnore: "2" + - PathPrefix: "core" - "core/test/base/(extended_float|iterator_factory)" - RemoveTest: "true" - "core/test/base/allocator" From 5889bbc23205f87a839de1dbf19d733e0287191a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 11 Aug 2021 11:57:17 +0200 Subject: [PATCH 19/25] work around Intel compiler bug icpc has issues with some combination of pragma omp parallel for and lambdas called inside the loop: internal error: assertion failed: find_assoc_pragma: pragma not found Putting the entire scope into an immediately evaluated lambda helps --- omp/base/kernel_launch_reduction.hpp | 70 ++++++++++++++-------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index bcab938449f..0c5acf0ebe2 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -74,9 +74,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, auto local_partial = init; for (auto i = begin; i < end; i++) { - local_partial = op(local_partial, [&]() { - return fn(i, map_to_device(args)...); - }()); + local_partial = op(local_partial, fn(i, map_to_device(args)...)); } partial.get_data()[thread_id] = local_partial; } @@ -118,9 +116,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, for (auto row = begin; row < end; row++) { #pragma unroll for (int64 col = 0; col < local_cols; col++) { - local_partial = op(local_partial, [&]() { - return fn(row, col, args...); - }()); + local_partial = op(local_partial, fn(row, col, args...)); } } } else { @@ -131,16 +127,14 @@ void run_kernel_reduction_sized_impl(syn::value_list, base_col += block_size) { #pragma unroll for (int64 i = 0; i < block_size; i++) { - local_partial = op(local_partial, [&]() { - return fn(row, base_col + i, args...); - }()); + local_partial = + op(local_partial, fn(row, base_col + i, args...)); } } #pragma unroll for (int64 i = 0; i < remainder_cols; i++) { - local_partial = op(local_partial, [&]() { - return fn(row, rounded_cols + i, args...); - }()); + local_partial = + op(local_partial, fn(row, rounded_cols + i, args...)); } } } @@ -217,12 +211,13 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, cols < rows) { #pragma omp parallel for for (int64 row = 0; row < rows; row++) { - auto partial = init; - for (int64 col = 0; col < cols; col++) { - partial = - op(partial, [&]() { return fn(row, col, args...); }()); - } - result[result_stride * row] = finalize(partial); + [&]() { + auto partial = init; + for (int64 col = 0; col < cols; col++) { + partial = op(partial, fn(row, col, args...)); + } + result[result_stride * row] = finalize(partial); + }(); } } else { // small number of rows and large reduction sizes: do partial sum first @@ -248,13 +243,17 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, // then accumulate the partial sums and write to result #pragma omp parallel for for (int64 row = 0; row < rows; row++) { - auto local_partial = init; - for (int64 thread_id = 0; thread_id < num_threads; thread_id++) { - local_partial = - op(local_partial, - partial.get_const_data()[row * num_threads + thread_id]); - } - result[row * result_stride] = finalize(local_partial); + [&] { + auto local_partial = init; + for (int64 thread_id = 0; thread_id < num_threads; + thread_id++) { + local_partial = op( + local_partial, + partial + .get_const_data()[row * num_threads + thread_id]); + } + result[row * 
result_stride] = finalize(local_partial); + }(); } } } @@ -273,9 +272,8 @@ void run_kernel_col_reduction_sized_block_impl( for (auto row = row_begin; row < row_end; row++) { #pragma unroll for (int64 rel_col = 0; rel_col < local_cols; rel_col++) { - partial[rel_col] = op(partial[rel_col], [&]() { - return fn(row, base_col + rel_col, args...); - }()); + partial[rel_col] = + op(partial[rel_col], fn(row, base_col + rel_col, args...)); } } #pragma unroll @@ -343,12 +341,16 @@ void run_kernel_col_reduction_sized_impl( } #pragma omp parallel for for (int64 col = 0; col < cols; col++) { - auto total = init; - for (int64 row_block = 0; row_block < reduction_size; row_block++) { - total = - op(total, partial.get_const_data()[col + cols * row_block]); - } - result[col] = finalize(total); + [&] { + auto total = init; + for (int64 row_block = 0; row_block < reduction_size; + row_block++) { + total = + op(total, + partial.get_const_data()[col + cols * row_block]); + } + result[col] = finalize(total); + }(); } } } From 00e61103095f4ad966f0281ddf68eb1650c35632 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 4 Oct 2021 12:23:40 +0200 Subject: [PATCH 20/25] review updates * remove unnecessary shmem init * add test comments --- cuda/base/kernel_launch_reduction.cuh | 6 ------ cuda/test/base/kernel_launch.cu | 16 ++++++++++++---- dpcpp/base/kernel_launch_reduction.dp.hpp | 6 ------ dpcpp/test/base/kernel_launch.dp.cpp | 16 ++++++++++++---- hip/base/kernel_launch_reduction.hip.hpp | 6 ------ hip/test/base/kernel_launch.hip.cpp | 16 ++++++++++++---- omp/test/base/kernel_launch.cpp | 22 +++++++++++++++++----- 7 files changed, 53 insertions(+), 35 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index a083a92eac5..d98bb878672 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -252,12 +252,6 @@ __global__ const auto subwarp_num = thread::get_subwarp_num_flat(); const auto block = group::this_thread_block(); - // - if (threadIdx.x < shared_storage) { - block_partial[threadIdx.x] = init; - } - block.sync(); - // const auto warp = group::tiled_partition(block); const auto warp_rank = warp.thread_rank(); const auto subwarp_rank = warp_rank % subwarp_size; diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index 1d43293d553..66fc3d9e94d 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -292,7 +292,8 @@ void run1d_reduction(std::shared_ptr exec) [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), size_type{100000}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); + // 2 * sum i=0...99999 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); gko::kernels::cuda::run_kernel_reduction( exec, @@ -312,7 +313,8 @@ void run1d_reduction(std::shared_ptr exec) }, int64{}, output.get_data(), size_type{100}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); + // 2 * sum i=0...99 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); } TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } @@ -341,7 +343,8 @@ void run2d_reduction(std::shared_ptr exec) }, int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); 
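
The constants asserted in these tests have simple closed forms: the kernels sum (i+1) in the 1D case and (i+1)*(j+1) in the 2D case, and the finalize step multiplies by 2 or 4. A minimal standalone sketch checking the two values above at compile time; the helper sum_1_to is introduced only for this illustration and is not part of the patch:

#include <cstdint>

// closed form for 1 + 2 + ... + n
constexpr std::int64_t sum_1_to(std::int64_t n) { return n * (n + 1) / 2; }

// 1D case: 2 * sum i=0...99999 (i+1) = 2 * sum_1_to(100000)
static_assert(2 * sum_1_to(100000) == 10000100000LL, "1D reduction value");
// 2D case: 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1)
//          = 4 * sum_1_to(1000) * sum_1_to(100) = 4 * 500500 * 5050
static_assert(4 * sum_1_to(1000) * sum_1_to(100) == 10110100000LL,
              "2D reduction value");
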
gko::kernels::cuda::run_kernel_reduction( exec, @@ -362,7 +365,8 @@ void run2d_reduction(std::shared_ptr exec) }, int64{}, output.get_data(), gko::dim<2>{10, 10}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); + // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); } TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } @@ -379,6 +383,8 @@ void run2d_row_reduction(std::shared_ptr exec) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int64 i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j<num_cols} (i+1)*(j+1) + // = (i+1) * num_cols * (num_cols+1) host_ref.get_data()[2 * i] = static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1); } @@ -427,6 +433,8 @@ void run2d_col_reduction(std::shared_ptr exec) static_cast(num_cols)}; gko::Array output{exec, static_cast(num_cols)}; for (int64 i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j<num_rows} (i+1)*(j+1) + // = (i+1) * num_rows * (num_rows+1) host_ref.get_data()[i] = static_cast<int64>(num_rows) * (num_rows + 1) * (i + 1); } diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp index ca82a897269..4bda0422178 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -386,12 +386,6 @@ void generic_kernel_col_reduction_2d_small(sycl::handler& cgh, int64 rows, const auto ssg_num = thread::get_subwarp_num_flat(id); const auto workgroup = group::this_thread_block(id); - // TODO remove - if (id.get_local_id(2) < shared_storage) { - block_partial[id.get_local_id(2)] = init; - } - workgroup.sync(); - // TODO end const auto subgroup = group::tiled_partition(workgroup); const auto sg_rank = subgroup.thread_rank(); const auto ssg_rank = sg_rank % ssg_size; diff --git a/dpcpp/test/base/kernel_launch.dp.cpp index 8a70d9bc1cb..2c7a08cdb36 100644 --- a/dpcpp/test/base/kernel_launch.dp.cpp +++ b/dpcpp/test/base/kernel_launch.dp.cpp @@ -279,7 +279,8 @@ TEST_F(KernelLaunch, Reduction1D) }, int64{}, output.get_data(), size_type{100000}, output); - EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); + // 2 * sum i=0...99999 (i+1) + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); gko::kernels::dpcpp::run_kernel_reduction( exec, @@ -299,7 +300,8 @@ TEST_F(KernelLaunch, Reduction1D) }, int64{}, output.get_data(), size_type{100}, output); - EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); + // 2 * sum i=0...99 (i+1) + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); } @@ -325,7 +327,8 @@ TEST_F(KernelLaunch, Reduction2D) }, int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); - EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); gko::kernels::dpcpp::run_kernel_reduction( exec, @@ -345,7 +348,8 @@ TEST_F(KernelLaunch, Reduction2D) }, int64{}, output.get_data(), gko::dim<2>{10, 10}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); + // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); } @@ -360,6 +364,8 @@ TEST_F(KernelLaunch, ReductionRow2D) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j<num_cols} (i+1)*(j+1) + // = (i+1) * num_cols * (num_cols+1) host_ref.get_data()[2 * i] = static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1); } @@ -395,6 +401,8 @@ TEST_F(KernelLaunch, ReductionCol2D) static_cast(num_cols)}; gko::Array
output{exec, static_cast(num_cols)}; for (int i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j<num_rows} (i+1)*(j+1) + // = (i+1) * num_rows * (num_rows+1) host_ref.get_data()[i] = static_cast<int64>(num_rows) * (num_rows + 1) * (i + 1); } diff --git a/hip/base/kernel_launch_reduction.hip.hpp index 502a87cc3fd..47b33f411ac 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -257,12 +257,6 @@ __global__ const auto subwarp_num = thread::get_subwarp_num_flat(); const auto block = group::this_thread_block(); - // - if (threadIdx.x < shared_storage) { - block_partial[threadIdx.x] = init; - } - block.sync(); - // const auto warp = group::tiled_partition(block); const auto warp_rank = warp.thread_rank(); const auto subwarp_rank = warp_rank % subwarp_size; diff --git a/hip/test/base/kernel_launch.hip.cpp index 755f8b3834d..c7add9ddca8 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -291,7 +291,8 @@ void run1d_reduction(std::shared_ptr exec) [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), size_type{100000}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); + // 2 * sum i=0...99999 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); gko::kernels::hip::run_kernel_reduction( exec, @@ -304,7 +305,8 @@ void run1d_reduction(std::shared_ptr exec) [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), size_type{100}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); + // 2 * sum i=0...99 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); } TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } @@ -325,7 +327,8 @@ void run2d_reduction(std::shared_ptr exec) [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); gko::kernels::hip::run_kernel_reduction( exec, @@ -338,7 +341,8 @@ void run2d_reduction(std::shared_ptr exec) [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), gko::dim<2>{10, 10}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); + // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); } TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } @@ -353,6 +357,8 @@ void run2d_row_reduction(std::shared_ptr exec) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j exec) static_cast(num_cols)}; gko::Array output{exec, static_cast(num_cols)}; for (int i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j{10, cols}, output); - ASSERT_EQ(*output.get_const_data(), 110ll * cols * (cols + 1)); + // 4 * sum i=0...9 sum j=0...cols-1 of (i+1)*(j+1) + ASSERT_EQ(*output.get_const_data(), 110LL * cols * (cols + 1)); } } @@ -309,7 +312,8 @@ TEST_F(KernelLaunch, Reduction2DLargeRows) [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), gko::dim<2>{1000, cols}, output); - ASSERT_EQ(*output.get_const_data(), 1001000ll * cols * (cols + 1)); + // 4 * sum i=0...999 sum j=0...cols-1 of (i+1)*(j+1) + ASSERT_EQ(*output.get_const_data(), 1001000LL * cols * (cols + 1)); } } @@
-329,7 +333,9 @@ TEST_F(KernelLaunch, Reduction2D) [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); - ASSERT_EQ(*output.get_const_data(), 10110100000ll); + + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + ASSERT_EQ(*output.get_const_data(), 10110100000LL); } @@ -344,6 +350,8 @@ TEST_F(KernelLaunch, ReductionRow2DSmall) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j<num_cols} (i+1)*(j+1) + // = (i+1) * num_cols * (num_cols+1) host_ref.get_data()[2 * i] = static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1); } @@ -374,6 +382,8 @@ TEST_F(KernelLaunch, ReductionRow2D) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j<num_cols} (i+1)*(j+1) + // = (i+1) * num_cols * (num_cols+1) host_ref.get_data()[2 * i] = static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1); } @@ -404,6 +414,8 @@ TEST_F(KernelLaunch, ReductionCol2D) static_cast(num_cols)}; gko::Array output{exec, static_cast(num_cols)}; for (int i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j<num_rows} (i+1)*(j+1) + // = (i+1) * num_rows * (num_rows+1) host_ref.get_data()[i] = static_cast<int64>(num_rows) * (num_rows + 1) * (i + 1); } From c0093c8ca9003dab88848b8046820f7d98e9f8a5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 5 Oct 2021 12:07:48 +0200 Subject: [PATCH 21/25] rename init to identity in reduction interface --- cuda/base/kernel_launch_reduction.cuh | 80 +++++++-------- dpcpp/base/kernel_launch_reduction.dp.hpp | 116 +++++++++++----------- hip/base/kernel_launch_reduction.hip.hpp | 83 ++++++++-------- omp/base/kernel_launch_reduction.hpp | 56 ++++++----- 4 files changed, 169 insertions(+), 166 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh index d98bb878672..30f7fa1ba96 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -56,7 +56,7 @@ __global__ __launch_bounds__( KernelFunction fn, ReductionOp op, FinalizeOp finalize, - ValueType init, + ValueType identity, ValueType* storage, KernelArgs... args) { @@ -69,7 +69,7 @@ __global__ __launch_bounds__( auto grid_size = thread::get_thread_num_flat(); auto warp = group::tiled_partition(group::this_thread_block()); - auto partial = init; + auto partial = identity; for (int64 i = tidx; i < size; i += grid_size) { partial = op(partial, fn(i, args...)); } @@ -82,7 +82,7 @@ __global__ __launch_bounds__( partial = reduce(warp, threadIdx.x < default_block_size / config::warp_size ? warp_partial[threadIdx.x] - : init, + : identity, op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); @@ -98,7 +98,7 @@ __global__ __launch_bounds__( KernelFunction fn, ReductionOp op, FinalizeOp finalize, - ValueType init, + ValueType identity, ValueType* storage, KernelArgs... args) { @@ -111,7 +111,7 @@ __global__ __launch_bounds__( auto grid_size = thread::get_thread_num_flat(); auto warp = group::tiled_partition(group::this_thread_block()); - auto partial = init; + auto partial = identity; for (int64 i = tidx; i < rows * cols; i += grid_size) { const auto row = i / cols; const auto col = i % cols; @@ -126,7 +126,7 @@ __global__ __launch_bounds__( partial = reduce(warp, threadIdx.x < default_block_size / config::warp_size ? warp_partial[threadIdx.x] - : init, + : identity, op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); @@ -139,7 +139,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, KernelArgs&&...
args) { @@ -152,16 +152,16 @@ void run_kernel_reduction(std::shared_ptr exec, Array partial{exec, static_cast(num_blocks)}; generic_kernel_reduction_1d<<>>( static_cast(size), fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(init), + [] __device__(auto v) { return v; }, as_cuda_type(identity), as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_cuda_type(init), as_cuda_type(result), + as_cuda_type(identity), as_cuda_type(result), as_cuda_type(partial.get_const_data())); } else { generic_kernel_reduction_1d<<<1, block_size>>>( - static_cast(size), fn, op, finalize, as_cuda_type(init), + static_cast(size), fn, op, finalize, as_cuda_type(identity), as_cuda_type(result), map_to_device(args)...); } } @@ -171,7 +171,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { constexpr int oversubscription = 16; @@ -186,16 +186,16 @@ void run_kernel_reduction(std::shared_ptr exec, Array partial{exec, static_cast(num_blocks)}; generic_kernel_reduction_2d<<>>( rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(init), as_cuda_type(partial.get_data()), + as_cuda_type(identity), as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_cuda_type(init), as_cuda_type(result), + as_cuda_type(identity), as_cuda_type(result), as_cuda_type(partial.get_const_data())); } else { generic_kernel_reduction_2d<<<1, block_size>>>( - rows, cols, fn, op, finalize, as_cuda_type(init), + rows, cols, fn, op, finalize, as_cuda_type(identity), as_cuda_type(result), map_to_device(args)...); } } @@ -206,8 +206,8 @@ template (); const auto row = idx % rows; @@ -221,7 +221,7 @@ __global__ const auto end = min(begin + cols_per_part, cols); auto subwarp = group::tiled_partition(group::this_thread_block()); - auto partial = init; + auto partial = identity; for (auto col = begin + subwarp.thread_rank(); col < end; col += subwarp_size) { partial = op(partial, fn(row, col, args...)); @@ -238,7 +238,7 @@ template (subwarp_rank); - auto partial = init; + auto partial = identity; // accumulate within a thread if (col < cols) { for (auto row = subwarp_id; row < rows; row += subwarp_num) { @@ -274,7 +274,7 @@ __global__ block.sync(); // in a single thread: accumulate the results if (local_warp_id == 0) { - partial = init; + partial = identity; // accumulate the partial results within a thread if (shared_storage >= warp_size) { #pragma unroll @@ -301,7 +301,7 @@ template (block); const auto warp_rank = warp.thread_rank(); const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; - auto partial = init; + auto partial = identity; // accumulate within a thread if (col < cols) { for (auto row = warp_id; row < rows; row += warp_num) { @@ -323,7 +323,7 @@ __global__ block.sync(); // in a single warp: accumulate the results if (threadIdx.x < warp_size) { - partial = init; + partial = identity; // accumulate the partial results within a thread #pragma unroll for (int i = 0; i < default_block_size; i += warp_size) { @@ -340,14 +340,14 @@ template __global__ __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( int64 
num_results, int64 num_blocks, ReductionOp op, - FinalizeOp finalize, ValueType init, const ValueType* input, + FinalizeOp finalize, ValueType identity, const ValueType* input, int64 result_stride, ValueType* result) { const auto idx = thread::get_thread_id_flat(); if (idx >= num_results) { return; } - auto partial = init; + auto partial = identity; for (int64 block = 0; block < num_blocks; block++) { partial = op(partial, input[idx + block * num_results]); } @@ -363,7 +363,7 @@ template , int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, int64 result_stride, KernelArgs... args) { @@ -371,7 +371,7 @@ void run_generic_kernel_row_reduction(syn::value_list, ceildiv(rows * col_blocks * subwarp_size, default_block_size); generic_kernel_row_reduction_2d <<>>( - rows, cols, col_blocks, fn, op, finalize, as_cuda_type(init), + rows, cols, col_blocks, fn, op, finalize, as_cuda_type(identity), as_cuda_type(result), result_stride, args...); } @@ -386,7 +386,7 @@ void run_generic_col_reduction_small(syn::value_list, int64 max_blocks, std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) { @@ -397,7 +397,7 @@ void run_generic_col_reduction_small(syn::value_list, if (num_blocks <= 1) { generic_kernel_col_reduction_2d_small <<<1, default_block_size>>>(rows, cols, fn, op, finalize, - as_cuda_type(init), + as_cuda_type(identity), as_cuda_type(result), args...); } else { Array tmp_storage{exec, @@ -405,11 +405,11 @@ void run_generic_col_reduction_small(syn::value_list, generic_kernel_col_reduction_2d_small <<>>( rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(init), as_cuda_type(tmp_storage.get_data()), + as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), args...); generic_kernel_reduction_finalize_2d<<< ceildiv(cols, default_block_size), default_block_size>>>( - cols, num_blocks, op, finalize, as_cuda_type(init), + cols, num_blocks, op, finalize, as_cuda_type(identity), as_cuda_type(tmp_storage.get_const_data()), 1, as_cuda_type(result)); } @@ -426,7 +426,7 @@ template void run_kernel_row_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, dim<2> size, KernelArgs&&... 
args) { @@ -447,12 +447,12 @@ void run_kernel_row_reduction(std::shared_ptr exec, generic_kernel_row_reduction_2d <<>>( rows, cols, col_blocks, fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(init), + [] __device__(auto v) { return v; }, as_cuda_type(identity), as_cuda_type(partial.get_data()), 1, map_to_device(args)...); const auto num_finalize_blocks = ceildiv(rows, default_block_size); generic_kernel_reduction_finalize_2d<<>>( - rows, col_blocks, op, finalize, as_cuda_type(init), + rows, col_blocks, op, finalize, as_cuda_type(identity), as_cuda_type(partial.get_const_data()), static_cast(result_stride), as_cuda_type(result)); } else { @@ -463,7 +463,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, compiled_subwarp_size == config::warp_size; }, syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, init, result, static_cast(result_stride), + finalize, identity, result, static_cast(result_stride), map_to_device(args)...); } } @@ -473,7 +473,7 @@ template void run_kernel_col_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { @@ -493,7 +493,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, compiled_subwarp_size == config::warp_size; }, syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, init, result, size, map_to_device(args)...); + op, finalize, identity, result, size, map_to_device(args)...); } else { const auto col_blocks = ceildiv(cols, config::warp_size); const auto row_blocks = @@ -504,7 +504,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, if (row_blocks <= 1) { generic_kernel_col_reduction_2d_blocked<<>>( - rows, cols, fn, op, finalize, as_cuda_type(init), + rows, cols, fn, op, finalize, as_cuda_type(identity), as_cuda_type(result), map_to_device(args)...); } else { Array tmp_storage{ @@ -512,11 +512,11 @@ void run_kernel_col_reduction(std::shared_ptr exec, generic_kernel_col_reduction_2d_blocked<<< dim3(row_blocks, col_blocks), default_block_size>>>( rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(init), as_cuda_type(tmp_storage.get_data()), + as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), map_to_device(args)...); generic_kernel_reduction_finalize_2d<<< ceildiv(cols, default_block_size), default_block_size>>>( - cols, row_blocks, op, finalize, as_cuda_type(init), + cols, row_blocks, op, finalize, as_cuda_type(identity), as_cuda_type(tmp_storage.get_const_data()), 1, as_cuda_type(result)); } diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 4bda0422178..4b29a5af55e 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -69,7 +69,7 @@ template (cfg); @@ -89,7 +89,7 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, const auto local_tidx = static_cast(tidx % wg_size); auto subgroup = group::tiled_partition(group::this_thread_block(idx)); - auto partial = init; + auto partial = identity; for (int64 i = tidx; i < size; i += global_size) { partial = op(partial, fn(i, args...)); } @@ -99,7 +99,7 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, } idx.barrier(sycl::access::fence_space::local_space); if (local_tidx < sg_size) { - partial = init; + partial = identity; for (int64 i = local_tidx; i < num_partials; i += sg_size) { partial = op(partial, subgroup_partial[i]); } @@ -118,7 
+118,7 @@ template (cfg); @@ -138,7 +138,7 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, const auto local_tidx = static_cast(tidx % wg_size); auto subgroup = group::tiled_partition(group::this_thread_block(idx)); - auto partial = init; + auto partial = identity; for (int64 i = tidx; i < rows * cols; i += global_size) { const auto row = i / cols; const auto col = i % cols; @@ -150,7 +150,7 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, } idx.barrier(sycl::access::fence_space::local_space); if (local_tidx < sg_size) { - partial = init; + partial = identity; for (int64 i = local_tidx; i < num_partials; i += sg_size) { partial = op(partial, subgroup_partial[i]); } @@ -169,7 +169,7 @@ template , std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, MappedKernelArgs... args) { @@ -186,19 +186,20 @@ void run_kernel_reduction_impl(syn::value_list, queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(size), num_workgroups, fn, op, - [](auto v) { return v; }, init, partial.get_data(), args...); + [](auto v) { return v; }, identity, partial.get_data(), + args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, - [](auto i, auto v) { return v[i]; }, op, finalize, init, result, - partial.get_const_data()); + [](auto i, auto v) { return v[i]; }, op, finalize, identity, + result, partial.get_const_data()); }); } else { queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d(cgh, static_cast(size), num_workgroups, fn, op, finalize, - init, result, args...); + identity, result, args...); }); } } @@ -210,7 +211,7 @@ template , std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) { @@ -230,18 +231,19 @@ void run_kernel_reduction_impl(syn::value_list, queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_2d( cgh, rows, cols, num_workgroups, fn, op, - [](auto v) { return v; }, init, partial.get_data(), args...); + [](auto v) { return v; }, identity, partial.get_data(), + args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, - [](auto i, auto v) { return v[i]; }, op, finalize, init, result, - partial.get_const_data()); + [](auto i, auto v) { return v[i]; }, op, finalize, identity, + result, partial.get_const_data()); }); } else { queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_2d(cgh, rows, cols, num_workgroups, - fn, op, finalize, init, result, + fn, op, finalize, identity, result, args...); }); } @@ -255,7 +257,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... 
args) { const auto desired_icfg = static_cast(get_first_cfg( @@ -266,7 +268,7 @@ void run_kernel_reduction(std::shared_ptr exec, select_run_kernel_reduction( kcfg_1d_list_simple_reduction, [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), - syn::type_list<>(), exec, fn, op, finalize, init, result, size, + syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } @@ -275,7 +277,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, KernelArgs&&... args) { @@ -287,7 +289,7 @@ void run_kernel_reduction(std::shared_ptr exec, select_run_kernel_reduction( kcfg_1d_list_simple_reduction, [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), - syn::type_list<>(), exec, fn, op, finalize, init, result, size, + syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } @@ -302,7 +304,7 @@ void generic_kernel_row_reduction_2d(syn::value_list, std::shared_ptr exec, int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, int64 result_stride, MappedKernelArgs... args) { @@ -321,7 +323,7 @@ void generic_kernel_row_reduction_2d(syn::value_list, thread::get_subwarp_id_flat(id); const auto row = idx % rows; const auto col_block = idx / rows; - auto partial = init; + auto partial = identity; auto subgroup = group::tiled_partition( group::this_thread_block(id)); auto ssg_rank = @@ -356,12 +358,10 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_kernel_row_reduction_2d, template -void generic_kernel_col_reduction_2d_small(sycl::handler& cgh, int64 rows, - int64 cols, int64 row_blocks, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, - ValueType* result, - MappedKernelArgs... args) +void generic_kernel_col_reduction_2d_small( + sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, + KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, MappedKernelArgs... 
args) { constexpr auto wg_size = KCFG_1D::decode<0>(static_cast(icfg)); @@ -390,7 +390,7 @@ void generic_kernel_col_reduction_2d_small(sycl::handler& cgh, int64 rows, const auto sg_rank = subgroup.thread_rank(); const auto ssg_rank = sg_rank % ssg_size; const auto col = static_cast(ssg_rank); - auto partial = init; + auto partial = identity; // accumulate within a thread if (col < cols) { for (auto row = ssg_id; row < rows; row += ssg_num) { @@ -409,7 +409,7 @@ void generic_kernel_col_reduction_2d_small(sycl::handler& cgh, int64 rows, workgroup.sync(); // in a single thread: accumulate the results if (local_sg_id == 0) { - partial = init; + partial = identity; // accumulate the partial results within a thread if (shared_storage >= sg_size) { #pragma unroll @@ -439,7 +439,7 @@ template (icfg); constexpr auto wg_size = KCFG_1D::decode<0>(cfg); @@ -461,7 +461,7 @@ void generic_kernel_col_reduction_2d_blocked( const auto col = sg_rank + static_cast(id.get_group(1)) * sg_size; auto block_partial = &block_partial_acc[0][0]; - auto partial = init; + auto partial = identity; // accumulate within a thread if (col < cols) { for (auto row = sg_id; row < rows; row += sg_num) { @@ -472,7 +472,7 @@ void generic_kernel_col_reduction_2d_blocked( workgroup.sync(); // in a single warp: accumulate the results if (id.get_local_id(2) < sg_size) { - partial = init; + partial = identity; // accumulate the partial results within a thread #pragma unroll for (int i = 0; i < wg_size; i += sg_size) { @@ -487,16 +487,14 @@ void generic_kernel_col_reduction_2d_blocked( template -void generic_kernel_reduction_finalize_2d(sycl::handler& cgh, int64 num_results, - int64 num_blocks, ReductionOp op, - FinalizeOp finalize, ValueType init, - const ValueType* input, - int64 result_stride, - ValueType* result) +void generic_kernel_reduction_finalize_2d( + sycl::handler& cgh, int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType identity, const ValueType* input, + int64 result_stride, ValueType* result) { cgh.parallel_for(sycl::range<1>{static_cast(num_results)}, [=](sycl::id<1> id) { - auto partial = init; + auto partial = identity; for (int64 block = 0; block < num_blocks; block++) { partial = op(partial, input[id[0] + block * num_results]); @@ -513,7 +511,7 @@ void run_generic_col_reduction_small(syn::value_list, std::shared_ptr exec, int64 max_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, - ValueType init, ValueType* result, + ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... 
args) { constexpr auto wg_size = @@ -529,7 +527,8 @@ void run_generic_col_reduction_small(syn::value_list, if (row_blocks <= 1) { queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_small( - cgh, rows, cols, 1, fn, op, finalize, init, result, args...); + cgh, rows, cols, 1, fn, op, finalize, identity, result, + args...); }); } else { Array tmp_storage{exec, @@ -537,11 +536,11 @@ void run_generic_col_reduction_small(syn::value_list, queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_small( cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; }, - init, tmp_storage.get_data(), args...); + identity, tmp_storage.get_data(), args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( - cgh, cols, row_blocks, op, finalize, init, + cgh, cols, row_blocks, op, finalize, identity, tmp_storage.get_const_data(), 1, result); }); } @@ -557,7 +556,7 @@ template , std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, dim<2> size, MappedKernelArgs... args) { @@ -580,10 +579,11 @@ void run_kernel_row_reduction_stage1(syn::value_list, static_cast(col_blocks * rows)}; generic_kernel_row_reduction_2d( syn::value_list{}, exec, rows, cols, col_blocks, fn, - op, [](auto v) { return v; }, init, partial.get_data(), 1, args...); + op, [](auto v) { return v; }, identity, partial.get_data(), 1, + args...); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( - cgh, rows, col_blocks, op, finalize, init, + cgh, rows, col_blocks, op, finalize, identity, partial.get_const_data(), static_cast(result_stride), result); }); @@ -595,7 +595,7 @@ void run_kernel_row_reduction_stage1(syn::value_list, compiled_ssg_size == sg_size; }, syn::value_list(), syn::type_list<>(), exec, rows, cols, - 1, fn, op, finalize, init, result, + 1, fn, op, finalize, identity, result, static_cast(result_stride), args...); } } @@ -610,7 +610,7 @@ template , std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... 
args) { @@ -634,7 +634,7 @@ void run_kernel_col_reduction_stage1(syn::value_list, compiled_ssg_size == sg_size; }, syn::value_list(), syn::type_list<>(), exec, max_blocks, - fn, op, finalize, init, result, size, args...); + fn, op, finalize, identity, result, size, args...); } else { const auto col_blocks = ceildiv(cols, sg_size); const auto row_blocks = ceildiv( @@ -644,7 +644,7 @@ void run_kernel_col_reduction_stage1(syn::value_list, if (row_blocks <= 1) { queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_blocked( - cgh, rows, cols, 1, col_blocks, fn, op, finalize, init, + cgh, rows, cols, 1, col_blocks, fn, op, finalize, identity, result, args...); }); } else { @@ -653,12 +653,12 @@ void run_kernel_col_reduction_stage1(syn::value_list, queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_blocked( cgh, rows, cols, row_blocks, col_blocks, fn, op, - [](auto v) { return v; }, init, tmp_storage.get_data(), + [](auto v) { return v; }, identity, tmp_storage.get_data(), args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( - cgh, cols, row_blocks, op, finalize, init, + cgh, cols, row_blocks, op, finalize, identity, tmp_storage.get_const_data(), 1, result); }); } @@ -676,7 +676,7 @@ template void run_kernel_row_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, dim<2> size, KernelArgs&&... args) { @@ -688,8 +688,8 @@ void run_kernel_row_reduction(std::shared_ptr exec, select_kernel_row_reduction_stage1( kcfg_1d_list_simple_reduction, [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), - syn::type_list<>(), exec, fn, op, finalize, init, result, result_stride, - size, map_to_device(args)...); + syn::type_list<>(), exec, fn, op, finalize, identity, result, + result_stride, size, map_to_device(args)...); } @@ -697,7 +697,7 @@ template void run_kernel_col_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { @@ -709,7 +709,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, select_kernel_col_reduction_stage1( kcfg_1d_list_simple_reduction, [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), - syn::type_list<>(), exec, fn, op, finalize, init, result, size, + syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 47b33f411ac..40e4268dccb 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -56,7 +56,7 @@ __global__ __launch_bounds__( KernelFunction fn, ReductionOp op, FinalizeOp finalize, - ValueType init, + ValueType identity, ValueType* storage, KernelArgs... args) { @@ -69,7 +69,7 @@ __global__ __launch_bounds__( auto grid_size = thread::get_thread_num_flat(); auto warp = group::tiled_partition(group::this_thread_block()); - auto partial = init; + auto partial = identity; for (int64 i = tidx; i < size; i += grid_size) { partial = op(partial, fn(i, args...)); } @@ -82,7 +82,7 @@ __global__ __launch_bounds__( partial = reduce(warp, threadIdx.x < default_block_size / config::warp_size ? 
warp_partial[threadIdx.x] - : init, + : identity, op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); @@ -98,7 +98,7 @@ __global__ __launch_bounds__( KernelFunction fn, ReductionOp op, FinalizeOp finalize, - ValueType init, + ValueType identity, ValueType* storage, KernelArgs... args) { @@ -111,7 +111,7 @@ __global__ __launch_bounds__( auto grid_size = thread::get_thread_num_flat(); auto warp = group::tiled_partition(group::this_thread_block()); - auto partial = init; + auto partial = identity; for (int64 i = tidx; i < rows * cols; i += grid_size) { const auto row = i / cols; const auto col = i % cols; @@ -126,7 +126,7 @@ __global__ __launch_bounds__( partial = reduce(warp, threadIdx.x < default_block_size / config::warp_size ? warp_partial[threadIdx.x] - : init, + : identity, op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); @@ -139,7 +139,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, KernelArgs&&... args) { @@ -153,18 +153,18 @@ void run_kernel_reduction(std::shared_ptr exec, hipLaunchKernelGGL( generic_kernel_reduction_1d, num_blocks, block_size, 0, 0, static_cast(size), fn, op, - [] __device__(auto v) { return v; }, as_hip_type(init), + [] __device__(auto v) { return v; }, as_hip_type(identity), as_hip_type(partial.get_data()), map_to_device(args)...); hipLaunchKernelGGL( generic_kernel_reduction_1d, 1, block_size, 0, 0, static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_hip_type(init), as_hip_type(result), + as_hip_type(identity), as_hip_type(result), as_hip_type(partial.get_const_data())); } else { hipLaunchKernelGGL(generic_kernel_reduction_1d, 1, block_size, 0, 0, static_cast(size), fn, op, finalize, - as_hip_type(init), as_hip_type(result), + as_hip_type(identity), as_hip_type(result), map_to_device(args)...); } } @@ -174,7 +174,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... 
args) { constexpr int oversubscription = 16; @@ -190,17 +190,17 @@ void run_kernel_reduction(std::shared_ptr exec, hipLaunchKernelGGL( generic_kernel_reduction_2d, num_blocks, block_size, 0, 0, rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_hip_type(init), as_hip_type(partial.get_data()), + as_hip_type(identity), as_hip_type(partial.get_data()), map_to_device(args)...); hipLaunchKernelGGL( generic_kernel_reduction_1d, 1, block_size, 0, 0, static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_hip_type(init), as_hip_type(result), + as_hip_type(identity), as_hip_type(result), as_hip_type(partial.get_const_data())); } else { hipLaunchKernelGGL(generic_kernel_reduction_2d, 1, block_size, 0, 0, - rows, cols, fn, op, finalize, as_hip_type(init), + rows, cols, fn, op, finalize, as_hip_type(identity), as_hip_type(result), map_to_device(args)...); } } @@ -211,8 +211,8 @@ template (); const auto row = idx % rows; @@ -226,7 +226,7 @@ __global__ const auto end = min(begin + cols_per_part, cols); auto subwarp = group::tiled_partition(group::this_thread_block()); - auto partial = init; + auto partial = identity; for (auto col = begin + subwarp.thread_rank(); col < end; col += subwarp_size) { partial = op(partial, fn(row, col, args...)); @@ -243,7 +243,7 @@ template (subwarp_rank); - auto partial = init; + auto partial = identity; // accumulate within a thread if (col < cols) { for (auto row = subwarp_id; row < rows; row += subwarp_num) { @@ -279,7 +279,7 @@ __global__ block.sync(); // in a single thread: accumulate the results if (local_warp_id == 0) { - partial = init; + partial = identity; // accumulate the partial results within a thread if (shared_storage >= warp_size) { #pragma unroll @@ -306,7 +306,7 @@ template (block); const auto warp_rank = warp.thread_rank(); const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; - auto partial = init; + auto partial = identity; // accumulate within a thread if (col < cols) { for (auto row = warp_id; row < rows; row += warp_num) { @@ -328,7 +328,7 @@ __global__ block.sync(); // in a single warp: accumulate the results if (threadIdx.x < warp_size) { - partial = init; + partial = identity; // accumulate the partial results within a thread #pragma unroll for (int i = 0; i < default_block_size; i += warp_size) { @@ -345,14 +345,14 @@ template __global__ __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( int64 num_results, int64 num_blocks, ReductionOp op, - FinalizeOp finalize, ValueType init, const ValueType* input, + FinalizeOp finalize, ValueType identity, const ValueType* input, int64 result_stride, ValueType* result) { const auto idx = thread::get_thread_id_flat(); if (idx >= num_results) { return; } - auto partial = init; + auto partial = identity; for (int64 block = 0; block < num_blocks; block++) { partial = op(partial, input[idx + block * num_results]); } @@ -368,7 +368,7 @@ template , int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, int64 result_stride, KernelArgs... 
args) { @@ -377,7 +377,7 @@ void run_generic_kernel_row_reduction(syn::value_list, hipLaunchKernelGGL( HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, op, - finalize, as_hip_type(init), as_hip_type(result), result_stride, + finalize, as_hip_type(identity), as_hip_type(result), result_stride, args...); } @@ -392,7 +392,7 @@ void run_generic_col_reduction_small(syn::value_list, int64 max_blocks, std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) { @@ -405,7 +405,7 @@ void run_generic_col_reduction_small(syn::value_list, HIP_KERNEL_NAME( generic_kernel_col_reduction_2d_small), 1, default_block_size, 0, 0, rows, cols, fn, op, finalize, - as_hip_type(init), as_hip_type(result), args...); + as_hip_type(identity), as_hip_type(result), args...); } else { Array tmp_storage{exec, static_cast(num_blocks * cols)}; @@ -413,12 +413,12 @@ void run_generic_col_reduction_small(syn::value_list, HIP_KERNEL_NAME( generic_kernel_col_reduction_2d_small), num_blocks, default_block_size, 0, 0, rows, cols, fn, op, - [] __device__(auto v) { return v; }, as_hip_type(init), + [] __device__(auto v) { return v; }, as_hip_type(identity), as_hip_type(tmp_storage.get_data()), args...); hipLaunchKernelGGL( generic_kernel_reduction_finalize_2d, ceildiv(cols, default_block_size), default_block_size, 0, 0, cols, - num_blocks, op, finalize, as_hip_type(init), + num_blocks, op, finalize, as_hip_type(identity), as_hip_type(tmp_storage.get_const_data()), 1, as_hip_type(result)); } } @@ -434,7 +434,7 @@ template void run_kernel_row_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, dim<2> size, KernelArgs&&... args) { @@ -455,13 +455,13 @@ void run_kernel_row_reduction(std::shared_ptr exec, hipLaunchKernelGGL( HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, - op, [] __device__(auto v) { return v; }, as_hip_type(init), + op, [] __device__(auto v) { return v; }, as_hip_type(identity), as_hip_type(partial.get_data()), 1, map_to_device(args)...); const auto num_finalize_blocks = ceildiv(rows, default_block_size); hipLaunchKernelGGL( generic_kernel_reduction_finalize_2d, num_finalize_blocks, default_block_size, 0, 0, rows, col_blocks, op, finalize, - as_hip_type(init), as_hip_type(partial.get_const_data()), + as_hip_type(identity), as_hip_type(partial.get_const_data()), static_cast(result_stride), as_hip_type(result)); } else { select_run_generic_kernel_row_reduction( @@ -471,7 +471,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, compiled_subwarp_size == config::warp_size; }, syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, init, result, static_cast(result_stride), + finalize, identity, result, static_cast(result_stride), map_to_device(args)...); } } @@ -481,7 +481,7 @@ template void run_kernel_col_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... 
args) { @@ -501,7 +501,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, compiled_subwarp_size == config::warp_size; }, syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, init, result, size, map_to_device(args)...); + op, finalize, identity, result, size, map_to_device(args)...); } else { const auto col_blocks = ceildiv(cols, config::warp_size); const auto row_blocks = @@ -512,8 +512,9 @@ void run_kernel_col_reduction(std::shared_ptr exec, if (row_blocks <= 1) { hipLaunchKernelGGL(generic_kernel_col_reduction_2d_blocked, dim3(1, col_blocks), default_block_size, 0, 0, - rows, cols, fn, op, finalize, as_hip_type(init), - as_hip_type(result), map_to_device(args)...); + rows, cols, fn, op, finalize, + as_hip_type(identity), as_hip_type(result), + map_to_device(args)...); } else { Array tmp_storage{ exec, static_cast(row_blocks * cols)}; @@ -521,12 +522,12 @@ void run_kernel_col_reduction(std::shared_ptr exec, generic_kernel_col_reduction_2d_blocked, dim3(row_blocks, col_blocks), default_block_size, 0, 0, rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_hip_type(init), as_hip_type(tmp_storage.get_data()), + as_hip_type(identity), as_hip_type(tmp_storage.get_data()), map_to_device(args)...); hipLaunchKernelGGL(generic_kernel_reduction_finalize_2d, ceildiv(cols, default_block_size), default_block_size, 0, 0, cols, row_blocks, op, - finalize, as_hip_type(init), + finalize, as_hip_type(identity), as_hip_type(tmp_storage.get_const_data()), 1, as_hip_type(result)); } diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index 0c5acf0ebe2..4f9e8267633 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -58,7 +58,7 @@ template void run_kernel_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, MappedKernelArgs... args) { @@ -72,7 +72,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, const auto begin = thread_id * work_per_thread; const auto end = std::min(ssize, begin + work_per_thread); - auto local_partial = init; + auto local_partial = identity; for (auto i = begin; i < end; i++) { local_partial = op(local_partial, fn(i, map_to_device(args)...)); } @@ -80,7 +80,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, } *result = finalize(std::accumulate(partial.get_const_data(), partial.get_const_data() + num_threads, - init, op)); + identity, op)); } @@ -90,7 +90,7 @@ template , std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... 
args) { @@ -108,7 +108,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, const auto begin = thread_id * work_per_thread; const auto end = std::min(rows, begin + work_per_thread); - auto local_partial = init; + auto local_partial = identity; if (rounded_cols == 0 || cols == block_size) { // we group all sizes <= block_size here and unroll explicitly constexpr auto local_cols = @@ -142,7 +142,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, } *result = finalize(std::accumulate(partial.get_const_data(), partial.get_const_data() + num_threads, - init, op)); + identity, op)); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, @@ -156,11 +156,11 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, KernelArgs&&... args) { - run_kernel_reduction_impl(exec, fn, op, finalize, init, result, size, + run_kernel_reduction_impl(exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } @@ -169,7 +169,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { const auto cols = static_cast(size[1]); @@ -177,14 +177,14 @@ void run_kernel_reduction(std::shared_ptr exec, using remainders = syn::as_list>; if (cols <= 0) { - *result = init; + *result = identity; return; } select_run_kernel_reduction_sized( remainders(), [&](int remainder) { return remainder == cols % block_size; }, syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, init, result, size, map_to_device(args)...); + finalize, identity, result, size, map_to_device(args)...); } @@ -195,7 +195,7 @@ template void run_kernel_row_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, dim<2> size, MappedKernelArgs... args) { @@ -212,7 +212,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, #pragma omp parallel for for (int64 row = 0; row < rows; row++) { [&]() { - auto partial = init; + auto partial = identity; for (int64 col = 0; col < cols; col++) { partial = op(partial, fn(row, col, args...)); } @@ -230,7 +230,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, const auto begin = thread_id * work_per_thread; const auto end = std::min(begin + work_per_thread, cols); for (int64 row = 0; row < rows; row++) { - auto local_partial = init; + auto local_partial = identity; for (int64 col = begin; col < end; col++) { local_partial = op(local_partial, [&]() { return fn(row, col, args...); @@ -244,7 +244,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, #pragma omp parallel for for (int64 row = 0; row < rows; row++) { [&] { - auto local_partial = init; + auto local_partial = identity; for (int64 thread_id = 0; thread_id < num_threads; thread_id++) { local_partial = op( @@ -263,12 +263,12 @@ template void run_kernel_col_reduction_sized_block_impl( - KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, + KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, int64 row_begin, int64 row_end, int64 base_col, MappedKernelArgs... 
args) { std::array partial; - partial.fill(init); + partial.fill(identity); for (auto row = row_begin; row < row_end; row++) { #pragma unroll for (int64 rel_col = 0; rel_col < local_cols; rel_col++) { @@ -289,7 +289,7 @@ template , std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, ValueType* result, dim<2> size, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) { const auto rows = static_cast(size[0]); @@ -306,10 +306,12 @@ void run_kernel_col_reduction_sized_impl( const auto base_col = col_block * block_size; if (base_col + block_size <= cols) { run_kernel_col_reduction_sized_block_impl( - fn, op, finalize, init, result, 0, rows, base_col, args...); + fn, op, finalize, identity, result, 0, rows, base_col, + args...); } else { run_kernel_col_reduction_sized_block_impl( - fn, op, finalize, init, result, 0, rows, base_col, args...); + fn, op, finalize, identity, result, 0, rows, base_col, + args...); } } } else { @@ -329,12 +331,12 @@ void run_kernel_col_reduction_sized_impl( const auto identity = [](auto i) { return i; }; if (base_col + block_size <= cols) { run_kernel_col_reduction_sized_block_impl( - fn, op, identity, init, + fn, op, identity, identity, partial.get_data() + cols * row_block, begin, end, base_col, args...); } else { run_kernel_col_reduction_sized_block_impl( - fn, op, identity, init, + fn, op, identity, identity, partial.get_data() + cols * row_block, begin, end, base_col, args...); } @@ -342,7 +344,7 @@ void run_kernel_col_reduction_sized_impl( #pragma omp parallel for for (int64 col = 0; col < cols; col++) { [&] { - auto total = init; + auto total = identity; for (int64 row_block = 0; row_block < reduction_size; row_block++) { total = @@ -366,11 +368,11 @@ template void run_kernel_row_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, dim<2> size, MappedKernelArgs... args) { - run_kernel_row_reduction_impl(exec, fn, op, finalize, init, result, + run_kernel_row_reduction_impl(exec, fn, op, finalize, identity, result, result_stride, size, map_to_device(args)...); } @@ -379,7 +381,7 @@ template void run_kernel_col_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... 
args) { @@ -394,7 +396,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, remainders(), [&](int remainder) { return remainder == cols % block_size; }, syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, init, result, size, map_to_device(args)...); + finalize, identity, result, size, map_to_device(args)...); } From d076d5fa8aa7b84574130c4175f910399df1295e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 5 Oct 2021 14:13:32 +0200 Subject: [PATCH 22/25] simplify DPCPP kernel selection --- dpcpp/base/kernel_launch_reduction.dp.hpp | 148 ++++++++++------------ 1 file changed, 67 insertions(+), 81 deletions(-) diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 4b29a5af55e..c4b8d32642a 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -55,12 +55,10 @@ namespace dpcpp { using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list_simple_reduction = - syn::value_list(KCFG_1D::encode(512, 64)), - static_cast(KCFG_1D::encode(512, 32)), - static_cast(KCFG_1D::encode(512, 16)), - static_cast(KCFG_1D::encode(256, 32)), - static_cast(KCFG_1D::encode(256, 16)), - static_cast(KCFG_1D::encode(256, 8))>(); + syn::value_list(); template -void run_kernel_reduction_impl(syn::value_list, - std::shared_ptr exec, +void run_kernel_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, MappedKernelArgs... args) { - constexpr auto cfg = static_cast(icfg); constexpr int oversubscription = 4; constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -205,17 +201,15 @@ void run_kernel_reduction_impl(syn::value_list, } -template -void run_kernel_reduction_impl(syn::value_list, - std::shared_ptr exec, +void run_kernel_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) { - constexpr auto cfg = static_cast(icfg); constexpr int oversubscription = 4; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -249,8 +243,8 @@ void run_kernel_reduction_impl(syn::value_list, } } -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction, - run_kernel_reduction_impl) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_run_kernel_reduction, + run_kernel_reduction_impl) template exec, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { - const auto desired_icfg = static_cast(get_first_cfg( + const auto desired_cfg = get_first_cfg( as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); - })); + }); select_run_kernel_reduction( kcfg_1d_list_simple_reduction, - [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), - syn::type_list<>(), exec, fn, op, finalize, identity, result, size, - map_to_device(args)...); + [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + syn::value_list(), syn::value_list(), + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, size, map_to_device(args)...); } @@ -281,24 +276,25 @@ void run_kernel_reduction(std::shared_ptr exec, ValueType* result, size_type size, KernelArgs&&... 
                           args)
 {
-    const auto desired_icfg = static_cast(get_first_cfg(
+    const auto desired_cfg = get_first_cfg(
         as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
             return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg),
                             KCFG_1D::decode<1>(cfg));
-        }));
+        });
     select_run_kernel_reduction(
         kcfg_1d_list_simple_reduction,
-        [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(),
-        syn::type_list<>(), exec, fn, op, finalize, identity, result, size,
-        map_to_device(args)...);
+        [&](std::uint32_t cfg) { return cfg == desired_cfg; },
+        syn::value_list(), syn::value_list(),
+        syn::value_list(), syn::type_list<>(), exec, fn, op,
+        finalize, identity, result, size, map_to_device(args)...);
 }

 namespace {

-template
 void generic_kernel_row_reduction_2d(syn::value_list,
                                      std::shared_ptr exec,
@@ -308,10 +304,8 @@ void generic_kernel_row_reduction_2d(syn::value_list,
                                      ValueType* result, int64 result_stride,
                                      MappedKernelArgs... args)
 {
-    constexpr auto wg_size =
-        KCFG_1D::decode<0>(static_cast(icfg));
-    constexpr auto sg_size =
-        KCFG_1D::decode<1>(static_cast(icfg));
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     static_assert(ssg_size <= sg_size, "ssg must be smaller than sg");
     const auto num_workgroups = ceildiv(rows * col_blocks * ssg_size, wg_size);
     const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size));
@@ -355,18 +349,16 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_kernel_row_reduction_2d,
                                     generic_kernel_row_reduction_2d);

-template
 void generic_kernel_col_reduction_2d_small(
     sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks,
     KernelFunction fn, ReductionOp op, FinalizeOp finalize,
     ValueType identity, ValueType* result, MappedKernelArgs... args)
 {
-    constexpr auto wg_size =
-        KCFG_1D::decode<0>(static_cast(icfg));
-    constexpr auto sg_size =
-        KCFG_1D::decode<1>(static_cast(icfg));
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     static_assert(ssg_size <= sg_size, "ssg must be smaller than sg");
     constexpr auto subgroups_per_workgroup = wg_size / sg_size;
     // stores the subwarp_size partial sums from each warp, grouped by warp
@@ -433,7 +425,7 @@ void generic_kernel_col_reduction_2d_small(
 }

-template
 void generic_kernel_col_reduction_2d_blocked(
@@ -441,7 +433,6 @@ void generic_kernel_col_reduction_2d_blocked(
     int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize,
     ValueType identity, ValueType* result, MappedKernelArgs... args)
 {
-    constexpr auto cfg = static_cast(icfg);
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
     constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     const auto range =
@@ -504,8 +495,8 @@ void generic_kernel_reduction_finalize_2d(
 }

-template
 void run_generic_col_reduction_small(syn::value_list,
                                      std::shared_ptr exec,
@@ -514,10 +505,8 @@ void run_generic_col_reduction_small(syn::value_list,
                                      ValueType identity, ValueType* result,
                                      dim<2> size, MappedKernelArgs... args)
 {
-    constexpr auto wg_size =
-        KCFG_1D::decode<0>(static_cast(icfg));
-    constexpr auto sg_size =
-        KCFG_1D::decode<1>(static_cast(icfg));
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     static_assert(ssg_size <= sg_size, "ssg must be smaller than sg");
     const auto rows = static_cast<int64>(size[0]);
     const auto cols = static_cast<int64>(size[1]);
@@ -526,7 +515,7 @@ void run_generic_col_reduction_small(syn::value_list,
     auto queue = exec->get_queue();
     if (row_blocks <= 1) {
         queue->submit([&](sycl::handler& cgh) {
-            generic_kernel_col_reduction_2d_small(
+            generic_kernel_col_reduction_2d_small(
                 cgh, rows, cols, 1, fn, op, finalize, identity, result,
                 args...);
         });
     } else {
         Array<ValueType> tmp_storage{
             exec, static_cast<size_type>(row_blocks * cols)};
         queue->submit([&](sycl::handler& cgh) {
-            generic_kernel_col_reduction_2d_small(
+            generic_kernel_col_reduction_2d_small(
                 cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; },
                 identity, tmp_storage.get_data(), args...);
         });
@@ -550,20 +539,17 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small,
                                     run_generic_col_reduction_small);

-template
-void run_kernel_row_reduction_stage1(syn::value_list,
-                                     std::shared_ptr exec,
+void run_kernel_row_reduction_stage1(std::shared_ptr exec,
                                      KernelFunction fn, ReductionOp op,
                                      FinalizeOp finalize, ValueType identity,
                                      ValueType* result, size_type result_stride,
                                      dim<2> size, MappedKernelArgs... args)
 {
-    constexpr auto wg_size =
-        KCFG_1D::decode<0>(static_cast(icfg));
-    constexpr auto sg_size =
-        KCFG_1D::decode<1>(static_cast(icfg));
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     using subsubgroup_sizes =
         syn::value_list(16, sg_size),
                         std::min(32, sg_size), sg_size>;
@@ -577,7 +563,7 @@ void run_kernel_row_reduction_stage1(syn::value_list,
         const auto col_blocks = ceildiv(rows * cols, resources);
         Array<ValueType> partial{
             exec, static_cast<size_type>(col_blocks * rows)};
-        generic_kernel_row_reduction_2d(
+        generic_kernel_row_reduction_2d(
             syn::value_list{}, exec, rows, cols, col_blocks, fn, op,
             [](auto v) { return v; }, identity, partial.get_data(), 1,
             args...);
@@ -594,30 +580,27 @@ void run_kernel_row_reduction_stage1(syn::value_list,
                 return compiled_ssg_size >= cols ||
                        compiled_ssg_size == sg_size;
             },
-            syn::value_list(), syn::type_list<>(), exec, rows, cols,
+            syn::value_list(), syn::type_list<>(), exec, rows, cols,
             1, fn, op, finalize, identity, result,
             static_cast<int64>(result_stride), args...);
     }
 }

-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_kernel_row_reduction_stage1,
-                                    run_kernel_row_reduction_stage1);
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_row_reduction_stage1,
+                                           run_kernel_row_reduction_stage1);

-template
-void run_kernel_col_reduction_stage1(syn::value_list,
-                                     std::shared_ptr exec,
+void run_kernel_col_reduction_stage1(std::shared_ptr exec,
                                      KernelFunction fn, ReductionOp op,
                                      FinalizeOp finalize, ValueType identity,
                                      ValueType* result, dim<2> size,
                                      MappedKernelArgs...
                                      args)
 {
-    constexpr auto wg_size =
-        KCFG_1D::decode<0>(static_cast(icfg));
-    constexpr auto sg_size =
-        KCFG_1D::decode<1>(static_cast(icfg));
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     using subsubgroup_sizes =
         syn::value_list(16, sg_size),
                         std::min(32, sg_size), sg_size>;
@@ -633,7 +616,7 @@ void run_kernel_col_reduction_stage1(syn::value_list,
                 return compiled_ssg_size >= cols ||
                        compiled_ssg_size == sg_size;
             },
-            syn::value_list(), syn::type_list<>(), exec, max_blocks,
+            syn::value_list(), syn::type_list<>(), exec, max_blocks,
             fn, op, finalize, identity, result, size, args...);
     } else {
         const auto col_blocks = ceildiv(cols, sg_size);
@@ -643,7 +626,7 @@ void run_kernel_col_reduction_stage1(syn::value_list,
         auto queue = exec->get_queue();
         if (row_blocks <= 1) {
             queue->submit([&](sycl::handler& cgh) {
-                generic_kernel_col_reduction_2d_blocked(
+                generic_kernel_col_reduction_2d_blocked(
                     cgh, rows, cols, 1, col_blocks, fn, op, finalize,
                     identity, result, args...);
             });
         } else {
             Array<ValueType> tmp_storage{
                 exec, static_cast<size_type>(row_blocks * cols)};
             queue->submit([&](sycl::handler& cgh) {
-                generic_kernel_col_reduction_2d_blocked(
+                generic_kernel_col_reduction_2d_blocked(
                     cgh, rows, cols, row_blocks, col_blocks, fn, op,
                     [](auto v) { return v; }, identity,
                     tmp_storage.get_data(), args...);
@@ -665,8 +648,8 @@ void run_kernel_col_reduction_stage1(syn::value_list,
         }
     }
 }

-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_kernel_col_reduction_stage1,
-                                    run_kernel_col_reduction_stage1);
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_col_reduction_stage1,
+                                           run_kernel_col_reduction_stage1);

 }  // namespace

@@ -680,16 +663,18 @@ void run_kernel_row_reduction(std::shared_ptr exec,
                               ValueType* result, size_type result_stride,
                               dim<2> size, KernelArgs&&... args)
 {
-    const auto desired_icfg = static_cast(get_first_cfg(
+    const auto desired_cfg = get_first_cfg(
         as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
             return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg),
                             KCFG_1D::decode<1>(cfg));
-        }));
+        });
     select_kernel_row_reduction_stage1(
         kcfg_1d_list_simple_reduction,
-        [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(),
-        syn::type_list<>(), exec, fn, op, finalize, identity, result,
-        result_stride, size, map_to_device(args)...);
+        [&](std::uint32_t cfg) { return cfg == desired_cfg; },
+        syn::value_list(), syn::value_list(),
+        syn::value_list(), syn::type_list<>(), exec, fn, op,
+        finalize, identity, result, result_stride, size,
+        map_to_device(args)...);
 }

@@ -701,16 +686,17 @@ void run_kernel_col_reduction(std::shared_ptr exec,
                               ValueType* result, dim<2> size,
                               KernelArgs&&... args)
 {
-    const auto desired_icfg = static_cast(get_first_cfg(
+    const auto desired_cfg = get_first_cfg(
         as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
             return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg),
                             KCFG_1D::decode<1>(cfg));
-        }));
+        });
     select_kernel_col_reduction_stage1(
         kcfg_1d_list_simple_reduction,
-        [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(),
-        syn::type_list<>(), exec, fn, op, finalize, identity, result, size,
-        map_to_device(args)...);
+        [&](std::uint32_t cfg) { return cfg == desired_cfg; },
+        syn::value_list(), syn::value_list(),
+        syn::value_list(), syn::type_list<>(), exec, fn, op,
+        finalize, identity, result, size, map_to_device(args)...);
 }

From ceaa76b7b45234c3a19928a185f249972ce93fc2 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Tue, 5 Oct 2021 14:36:12 +0200
Subject: [PATCH 23/25] simplify dpcpp local memory usage

---
 dpcpp/base/kernel_launch_reduction.dp.hpp | 24 +++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp
index c4b8d32642a..5ebf06b0f71 100644
--- a/dpcpp/base/kernel_launch_reduction.dp.hpp
+++ b/dpcpp/base/kernel_launch_reduction.dp.hpp
@@ -73,16 +73,16 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size,
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
     constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     constexpr auto num_partials = wg_size / sg_size;
-    sycl::accessor, 1,
+    sycl::accessor, 0,
                    sycl::access_mode::read_write, sycl::access::target::local>
-        subgroup_partial_acc(sycl::range<1>{1}, cgh);
+        subgroup_partial_acc(cgh);
     const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size));
     const auto global_size = num_workgroups * wg_size;

     cgh.parallel_for(
         range,
         [=](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] {
-            auto subgroup_partial = &subgroup_partial_acc[0][0];
+            auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0];
             const auto tidx = thread::get_thread_id_flat(idx);
             const auto local_tidx = static_cast<int64>(tidx % wg_size);
             auto subgroup =
@@ -122,16 +122,16 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols,
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
     constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     constexpr auto num_partials = wg_size / sg_size;
-    sycl::accessor, 1,
+    sycl::accessor, 0,
                    sycl::access_mode::read_write, sycl::access::target::local>
-        subgroup_partial_acc(sycl::range<1>{1}, cgh);
+        subgroup_partial_acc(cgh);
     const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size));
     const auto global_size = num_workgroups * wg_size;

     cgh.parallel_for(
         range,
         [=](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] {
-            auto subgroup_partial = &subgroup_partial_acc[0][0];
+            auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0];
             const auto tidx = thread::get_thread_id_flat(idx);
             const auto local_tidx = static_cast<int64>(tidx % wg_size);
             auto subgroup =
@@ -363,14 +363,14 @@ void generic_kernel_col_reduction_2d_small(
     constexpr auto subgroups_per_workgroup = wg_size / sg_size;
     // stores the subwarp_size partial sums from each warp, grouped by warp
     constexpr auto shared_storage = subgroups_per_workgroup * ssg_size;
-    sycl::accessor, 1,
+    sycl::accessor, 0,
                    sycl::access_mode::read_write, sycl::access::target::local>
-        block_partial_acc(sycl::range<1>{1}, cgh);
+        block_partial_acc(cgh);
     const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size));
     cgh.parallel_for(
         range,
         [=](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] {
-            auto block_partial = &block_partial_acc[0][0];
+            auto block_partial = &(*block_partial_acc.get_pointer())[0];
             const auto ssg_id = thread::get_subwarp_id_flat(id);
             const auto local_sg_id = id.get_local_id(2) / sg_size;
@@ -437,9 +437,9 @@ void generic_kernel_col_reduction_2d_blocked(
     constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     const auto range =
         sycl_nd_range(dim3(row_blocks, col_blocks), dim3(wg_size));
-    sycl::accessor, 1,
+    sycl::accessor, 0,
                    sycl::access_mode::read_write, sycl::access::target::local>
-        block_partial_acc(sycl::range<1>{1}, cgh);
+        block_partial_acc(cgh);
     cgh.parallel_for(
         range,
         [=](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] {
@@ -451,7 +451,7 @@ void generic_kernel_col_reduction_2d_blocked(
             const auto sg_rank = subgroup.thread_rank();
             const auto col =
                 sg_rank + static_cast<int64>(id.get_group(1)) * sg_size;
-            auto block_partial = &block_partial_acc[0][0];
+            auto block_partial = &(*block_partial_acc.get_pointer())[0];
             auto partial = identity;
             // accumulate within a thread
             if (col < cols) {

From 817d1d2d24de3926d46c18a5ff425c5173d0b0f3 Mon Sep 17 00:00:00 2001
From: ginkgo-bot
Date: Tue, 5 Oct 2021 12:43:55 +0000
Subject: [PATCH 24/25] Format files

Co-authored-by: Tobias Ribizel
---
 cuda/test/base/kernel_launch.cu | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu
index 66fc3d9e94d..e2f6583f930 100644
--- a/cuda/test/base/kernel_launch.cu
+++ b/cuda/test/base/kernel_launch.cu
@@ -285,7 +285,7 @@ void run1d_reduction(std::shared_ptr exec)
         exec,
         [] GKO_KERNEL(auto i, auto a) {
             static_assert(is_same::value, "index");
-            static_assert(is_same::value, "value");
+            static_assert(is_same::value, "value");
             return i + 1;
         },
         [] GKO_KERNEL(auto i, auto j) { return i + j; },
@@ -299,7 +299,7 @@ void run1d_reduction(std::shared_ptr exec)
         exec,
         [] GKO_KERNEL(auto i, auto a) {
             static_assert(is_same::value, "index");
-            static_assert(is_same::value, "value");
+            static_assert(is_same::value, "value");
             return i + 1;
         },
         [] GKO_KERNEL(auto i, auto j) {
@@ -329,7 +329,7 @@ void run2d_reduction(std::shared_ptr exec)
         [] GKO_KERNEL(auto i, auto j, auto a) {
             static_assert(is_same::value, "index");
             static_assert(is_same::value, "index");
-            static_assert(is_same::value, "value");
+            static_assert(is_same::value, "value");
             return (i + 1) * (j + 1);
         },
         [] GKO_KERNEL(auto i, auto j) {
@@ -351,7 +351,7 @@ void run2d_reduction(std::shared_ptr exec)
         [] GKO_KERNEL(auto i, auto j, auto a) {
             static_assert(is_same::value, "index");
             static_assert(is_same::value, "index");
-            static_assert(is_same::value, "value");
+            static_assert(is_same::value, "value");
             return (i + 1) * (j + 1);
         },
         [] GKO_KERNEL(auto i, auto j) {
@@ -394,8 +394,7 @@ void run2d_row_reduction(std::shared_ptr exec)
         [] GKO_KERNEL(auto i, auto j, auto a) {
             static_assert(is_same::value, "index");
             static_assert(is_same::value, "index");
-            static_assert(is_same::value,
-                          "value");
+            static_assert(is_same::value, "value");
             return (i + 1) * (j + 1);
         },
         [] GKO_KERNEL(auto i, auto j) {
@@ -444,8 +443,7 @@ void run2d_col_reduction(std::shared_ptr exec)
         [] GKO_KERNEL(auto i, auto j, auto a) {
             static_assert(is_same::value, "index");
             static_assert(is_same::value, "index");
-            static_assert(is_same::value,
-                          "value");
+            static_assert(is_same::value, "value");
             return (i + 1) * (j + 1);
         },
         [] GKO_KERNEL(auto i, auto j) {

From dfee616e12102434e6b293d5efcd0ac2278a0b51 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Wed, 6 Oct 2021 08:15:03 +0200
Subject: [PATCH 25/25] fix omp name collision

---
 omp/base/kernel_launch_reduction.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp
index 4f9e8267633..030da93c245 100644
--- a/omp/base/kernel_launch_reduction.hpp
+++ b/omp/base/kernel_launch_reduction.hpp
@@ -328,15 +328,15 @@ void run_kernel_col_reduction_sized_impl(
             const auto begin = row_block * rows_per_thread;
             const auto end = std::min(begin + rows_per_thread, rows);
             const auto base_col = col_block * block_size;
-            const auto identity = [](auto i) { return i; };
+            const auto identity_fn = [](auto i) { return i; };
             if (base_col + block_size <= cols) {
                 run_kernel_col_reduction_sized_block_impl(
-                    fn, op, identity, identity,
+                    fn, op, identity_fn, identity,
                     partial.get_data() + cols * row_block, begin, end,
                     base_col, args...);
             } else {
                 run_kernel_col_reduction_sized_block_impl(
-                    fn, op, identity, identity,
+                    fn, op, identity_fn, identity,
                     partial.get_data() + cols * row_block, begin, end,
                     base_col, args...);
             }
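A note on the collision fixed in PATCH 25: after PATCH 21 renamed the reduction's neutral-element parameter from init to identity, the local pass-through lambda const auto identity = [](auto i) { return i; }; shadowed that parameter inside run_kernel_col_reduction_sized_impl, so the argument list fn, op, identity, identity passed the lambda both as the finalize functor and as the ValueType value. A minimal standalone sketch of the failure mode and the fix follows; the names reduce_stub and caller are hypothetical and are not part of the Ginkgo sources.

    #include <iostream>

    template <typename FinalizeOp, typename ValueType>
    ValueType reduce_stub(FinalizeOp finalize, ValueType identity)
    {
        // stand-in for the real reduction: just finalize the identity element
        return finalize(identity);
    }

    double caller(double identity)  // reduction identity, e.g. 0.0 for sums
    {
        // BAD: `const auto identity = [](auto i) { return i; };` would shadow
        // the parameter, so `reduce_stub(identity, identity)` would deduce
        // ValueType as the lambda type and fail to compile.
        const auto identity_fn = [](auto i) { return i; };  // PATCH 25's rename
        return reduce_stub(identity_fn, identity);
    }

    int main()
    {
        std::cout << caller(0.0) << '\n';  // prints 0
    }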
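For background on the cfg values that PATCH 22 threads through the selection macros: KCFG_1D = ConfigSet<11, 7> packs a work-group size and a sub-group size into a single std::uint32_t, and get_first_cfg picks the first entry of kcfg_1d_list_simple_reduction that validate accepts for the device's queue. Below is a rough sketch of that packing and selection, assuming the <11, 7> parameters are the bit widths of the two fields with the work-group size in the high bits; the real ConfigSet and get_first_cfg are more general, so this is illustrative only.

    #include <array>
    #include <cstdint>
    #include <stdexcept>

    // Assumed layout: an 11-bit work-group-size field above a 7-bit
    // sub-group-size field, mirroring KCFG_1D::encode/decode usage above.
    struct kcfg_1d_sketch {
        static constexpr std::uint32_t encode(std::uint32_t wg, std::uint32_t sg)
        {
            return (wg << 7) | sg;
        }
        template <int field>
        static constexpr std::uint32_t decode(std::uint32_t cfg)
        {
            return field == 0 ? (cfg >> 7) & 0x7ffu  // work-group size
                              : cfg & 0x7fu;         // sub-group size
        }
    };

    static_assert(kcfg_1d_sketch::decode<0>(kcfg_1d_sketch::encode(512, 64)) ==
                      512,
                  "work-group size survives a round trip");
    static_assert(kcfg_1d_sketch::decode<1>(kcfg_1d_sketch::encode(512, 64)) ==
                      64,
                  "sub-group size survives a round trip");

    // Sketch of the selection idiom: return the first encoded config whose
    // decoded sizes the device accepts, like the validate(queue, wg, sg)
    // calls in the patch.
    template <typename Validator>
    std::uint32_t first_valid_cfg(const std::array<std::uint32_t, 6>& cfgs,
                                  Validator validate)
    {
        for (auto cfg : cfgs) {
            if (validate(kcfg_1d_sketch::decode<0>(cfg),
                         kcfg_1d_sketch::decode<1>(cfg))) {
                return cfg;
            }
        }
        throw std::runtime_error("no valid configuration found");
    }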
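PATCH 23's local-memory change replaces a one-element 1D local accessor, which needed a sycl::range<1>{1} constructor argument and a [0] dereference in the kernel, with a 0-dimensional accessor that wraps exactly one object and is reached through get_pointer(). A reduced sketch of the same pattern outside Ginkgo follows; the plain_array wrapper stands in for UninitializedArray, and wg_size and the USM-accessible result pointer are illustrative assumptions.

    #include <sycl/sycl.hpp>

    constexpr int wg_size = 64;

    template <typename T, int size>
    struct plain_array {  // minimal stand-in for gko::UninitializedArray
        T& operator[](int i) { return data[i]; }
        T data[size];
    };

    void local_partial_sums(sycl::queue& queue, float* result)
    {
        queue.submit([&](sycl::handler& cgh) {
            // 0-dimensional local accessor: exactly one object, no range needed
            sycl::accessor<plain_array<float, wg_size>, 0,
                           sycl::access_mode::read_write,
                           sycl::access::target::local>
                partial_acc(cgh);
            cgh.parallel_for(
                sycl::nd_range<1>{wg_size, wg_size}, [=](sycl::nd_item<1> id) {
                    // get_pointer() yields a pointer to the single wrapped
                    // object, matching the &(*acc.get_pointer())[0] idiom
                    // introduced by the patch
                    auto partial = &(*partial_acc.get_pointer())[0];
                    const auto i = static_cast<int>(id.get_local_linear_id());
                    partial[i] = static_cast<float>(i);
                    id.barrier(sycl::access::fence_space::local_space);
                    if (i == 0) {
                        float sum = 0.0f;
                        for (int j = 0; j < wg_size; j++) {
                            sum += partial[j];
                        }
                        *result = sum;  // result must be USM device-accessible
                    }
                });
        });
        queue.wait();
    }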