From 22c318448b8fe41a83ed4d468e8029d9df2f7994 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Mon, 21 Mar 2022 15:47:01 +0100
Subject: [PATCH] add tuning to reduction kernels

---
 benchmark/blas/blas.cpp                   | 105 ++++++++++++++++++++--
 core/device_hooks/common_kernels.inc.cpp  |   5 +-
 core/matrix/dense_kernels.hpp             |  15 ++++
 cuda/base/kernel_launch_reduction.cuh     |   5 ++
 cuda/matrix/dense_kernels.cu              |  42 +++++++++
 dpcpp/base/kernel_launch_reduction.dp.hpp |  21 +++++
 dpcpp/matrix/dense_kernels.dp.cpp         |  20 +++++
 hip/base/kernel_launch_reduction.hip.hpp  |   5 ++
 hip/matrix/dense_kernels.hip.cpp          |  43 +++++++++
 omp/base/kernel_launch_reduction.hpp      |  27 ++++--
 omp/matrix/dense_kernels.cpp              |  20 +++++
 reference/matrix/dense_kernels.cpp        |  20 +++++
 12 files changed, 310 insertions(+), 18 deletions(-)
diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp
index 6dfe2414a9f..39d47dc9339 100644
--- a/benchmark/blas/blas.cpp
+++ b/benchmark/blas/blas.cpp
@@ -46,6 +46,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "benchmark/utils/loggers.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
+#include "core/matrix/dense_kernels.hpp"
+
+
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+#include "benchmark/utils/tuning_variables.hpp"
+#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
+
+
+GKO_REGISTER_OPERATION(compute_sparselib_dot, dense::compute_sparselib_dot);
+GKO_REGISTER_OPERATION(compute_sparselib_norm, dense::compute_sparselib_norm2);
 
 
 // Command-line arguments
@@ -57,8 +67,10 @@ DEFINE_string(
     "   multiaxpy (like axpy, but a has one entry per column),\n"
     "   scal (y = a * y),\n"
     "   multiscal (like scal, but a has one entry per column),\n"
-    "   dot (a = x' * y),"
+    "   dot (a = x' * y),\n"
+    "   sparselib_dot (like dot, but using vendor libraries),\n"
     "   norm (a = sqrt(x' * x)),\n"
+    "   sparselib_norm (like norm, but using vendor libraries),\n"
     "   mm (C = A * B),\n"
     "   gemm (C = a * A * B + b * C)\n"
     "where A has dimensions n x k, B has dimensions k x m,\n"
@@ -73,6 +85,7 @@ class BenchmarkOperation {
     virtual gko::size_type get_memory() const = 0;
     virtual void prepare(){};
     virtual void run() = 0;
+    virtual bool is_tunable() { return false; }
 };
 
 
@@ -205,20 +218,36 @@ class DotOperation : public BenchmarkOperation {
 
     void run() override { x_->compute_dot(lend(y_), lend(alpha_)); }
 
-private:
+    bool is_tunable() override { return true; }
+
+protected:
     std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
     std::unique_ptr<gko::matrix::Dense<etype>> x_;
     std::unique_ptr<gko::matrix::Dense<etype>> y_;
 };
 
 
+class SparselibDotOperation : public DotOperation {
+    using DotOperation::DotOperation;
+
+    void run() override
+    {
+        auto exec = alpha_->get_executor();
+        exec->run(make_compute_sparselib_dot(x_.get(), y_.get(), alpha_.get()));
+    }
+
+    bool is_tunable() override { return false; }
+};
+
+
 class NormOperation : public BenchmarkOperation {
 public:
     NormOperation(std::shared_ptr<const gko::Executor> exec,
                   gko::size_type rows, gko::size_type cols,
                   gko::size_type stride)
     {
-        alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, cols});
+        alpha_ = gko::matrix::Dense<gko::remove_complex<etype>>::create(
+            exec, gko::dim<2>{1, cols});
         y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
                                                stride);
         y_->fill(1);
@@ -236,12 +265,27 @@ class NormOperation : public BenchmarkOperation {
 
     void run() override { y_->compute_norm2(lend(alpha_)); }
 
-private:
-    std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
+    bool is_tunable() override { return true; }
+
+protected:
+    std::unique_ptr<gko::matrix::Dense<gko::remove_complex<etype>>> alpha_;
     std::unique_ptr<gko::matrix::Dense<etype>> y_;
 };
 
 
+class SparselibNormOperation : public NormOperation {
+    using NormOperation::NormOperation;
+
+    void run() override
+    {
+        auto exec = alpha_->get_executor();
+        exec->run(make_compute_sparselib_norm(y_.get(), alpha_.get()));
+    }
+
+    bool is_tunable() override { return false; }
+};
+
+
 class ApplyOperation : public BenchmarkOperation {
 public:
     ApplyOperation(std::shared_ptr<const gko::Executor> exec, gko::size_type n,
@@ -407,11 +451,21 @@ std::map<std::string, std::function<std::unique_ptr<BenchmarkOperation>(
              return std::make_unique<DotOperation>(
                  exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
          }},
+        {"sparselib_dot",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<SparselibDotOperation>(
+                 exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
+         }},
         {"norm",
          [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
              return std::make_unique<NormOperation>(exec, dims.n, dims.r,
                                                     dims.stride_y);
          }},
+        {"sparselib_norm",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<SparselibNormOperation>(
+                 exec, dims.n, dims.r, dims.stride_y);
+         }},
         {"mm",
          [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
              return std::make_unique<ApplyOperation>(
@@ -466,6 +520,44 @@ void apply_blas(const char* operation_name, std::shared_ptr<gko::Executor> exec,
         add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
                           allocator);
 
+        // tuning run
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+        if (op->is_tunable()) {
+            if (!blas_case[operation_name].HasMember("tuning")) {
+                blas_case[operation_name].AddMember(
+                    "tuning", rapidjson::Value(rapidjson::kObjectType),
+                    allocator);
+            }
+            auto& tuning_case = blas_case[operation_name]["tuning"];
+            add_or_set_member(tuning_case, "time",
+                              rapidjson::Value(rapidjson::kArrayType),
+                              allocator);
+            add_or_set_member(tuning_case, "values",
+                              rapidjson::Value(rapidjson::kArrayType),
+                              allocator);
+
+            // Enable tuning for this portion of code
+            gko::_tuning_flag = true;
+            // Select some values we want to tune.
+            std::vector<gko::size_type> tuning_values(32);
+            std::iota(tuning_values.begin(), tuning_values.end(), 0);
+            for (auto val : tuning_values) {
+                gko::_tuned_value = val;
+                IterationControl ic_tuning{timer};
+                op->prepare();
+                for (auto _ : ic_tuning.run()) {
+                    op->run();
+                }
+                tuning_case["time"].PushBack(ic_tuning.compute_average_time(),
+                                             allocator);
+                tuning_case["values"].PushBack(val, allocator);
+            }
+            // We put back the flag to false to use the default (non-tuned)
+            // values for the following
+            gko::_tuning_flag = false;
+        }
+#endif
+
         // compute and write benchmark data
         add_or_set_member(blas_case[operation_name], "completed", true,
                           allocator);
@@ -504,7 +596,8 @@ int main(int argc, char* argv[])
                          "  ]\n\n";
     initialize_argument_parsing(&argc, &argv, header, format);
 
-    std::string extra_information = "The operations are " + FLAGS_operations;
+    std::string extra_information =
+        "The operations are " + FLAGS_operations + "\n";
     print_general_information(extra_information);
 
     auto exec = executor_factory.at(FLAGS_executor)();
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index a89d1f3c140..d63e75684f8 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -256,12 +256,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
index 3cf15052826..89bb9f7042d 100644
--- a/core/matrix/dense_kernels.hpp
+++ b/core/matrix/dense_kernels.hpp
@@ -146,6 +146,17 @@ namespace kernels {
                        matrix::Dense<remove_complex<_type>>* result, \
                        Array<char>& tmp)
 
+#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(_type)               \
+    void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec, \
+                               const matrix::Dense<_type>* x,               \
+                               const matrix::Dense<_type>* y,               \
+                               matrix::Dense<_type>* result)
+
+#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(_type)               \
+    void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec, \
+                                 const matrix::Dense<_type>* x,               \
+                                 matrix::Dense<remove_complex<_type>>* result)
+
 #define GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(_type, _prec)         \
     void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,  \
                              const device_matrix_data<_type, _prec>& data, \
@@ -339,6 +350,10 @@ namespace kernels {
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType);             \
     template <typename ValueType>                                           \
     GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType);                      \
+    template <typename ValueType>                                           \
+    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(ValueType);              \
+    template <typename ValueType>                                           \
+    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(ValueType);            \
     template <typename ValueType, typename IndexType>                       \
     GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType);     \
     template <typename ValueType, typename IndexType>                       \
diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh
index e6da25a1706..32783ed0951 100644
--- a/cuda/base/kernel_launch_reduction.cuh
+++ b/cuda/base/kernel_launch_reduction.cuh
@@ -43,6 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/components/thread_ids.cuh"
 
 
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+#include "benchmark/utils/tuning_variables.hpp"
+#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
+
+
 namespace gko {
 namespace kernels {
 namespace cuda {
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 7eeebd82357..2866c40a792 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -122,6 +122,28 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 
 
+template <typename ValueType>
+void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec,
+                           const matrix::Dense<ValueType>* x,
+                           const matrix::Dense<ValueType>* y,
+                           matrix::Dense<ValueType>* result)
+{
+    if (cublas::is_supported<ValueType>::value) {
+        auto handle = exec->get_cublas_handle();
+        for (size_type col = 0; col < x->get_size()[1]; col++) {
+            cublas::dot(handle, x->get_size()[0], x->get_const_values() + col,
+                        x->get_stride(), y->get_const_values() + col,
+                        y->get_stride(), result->get_values() + col);
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);
+
+
 template <typename ValueType>
 void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                             const matrix::Dense<ValueType>* x,
@@ -145,6 +167,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 
 
+template <typename ValueType>
+void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec,
+                             const matrix::Dense<ValueType>* x,
+                             matrix::Dense<remove_complex<ValueType>>* result)
+{
+    if (cublas::is_supported<ValueType>::value) {
+        auto handle = exec->get_cublas_handle();
+        for (size_type col = 0; col < x->get_size()[1]; col++) {
+            cublas::norm2(handle, x->get_size()[0], x->get_const_values() + col,
+                          x->get_stride(), result->get_values() + col);
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp
index 601527c6cae..30f0548ff02 100644
--- a/dpcpp/base/kernel_launch_reduction.dp.hpp
+++ b/dpcpp/base/kernel_launch_reduction.dp.hpp
@@ -48,6 +48,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "dpcpp/components/uninitialized_array.hpp"
 
 
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+#include "benchmark/utils/tuning_variables.hpp"
+#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
+
+
 namespace gko {
 namespace kernels {
 namespace dpcpp {
@@ -170,7 +175,11 @@ void run_kernel_reduction_impl(std::shared_ptr<const DpcppExecutor> exec,
                                ValueType* result, size_type size,
                                Array<char>& tmp, MappedKernelArgs... args)
 {
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+    const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 4;
+#else
     constexpr int oversubscription = 4;
+#endif
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
     constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     const auto num_workgroups =
@@ -214,7 +223,11 @@ void run_kernel_reduction_impl(std::shared_ptr<const DpcppExecutor> exec,
                                ValueType* result, dim<2> size, Array<char>& tmp,
                                MappedKernelArgs... args)
 {
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+    const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 4;
+#else
     constexpr int oversubscription = 4;
+#endif
     const auto rows = static_cast<int64>(size[0]);
     const auto cols = static_cast<int64>(size[1]);
     const auto flat_size = rows * cols;
@@ -566,7 +579,11 @@ void run_kernel_row_reduction_stage1(std::shared_ptr<const DpcppExecutor> exec,
     using subsubgroup_sizes =
         syn::value_list<int, 1, 2, 4, 8, std::min<int>(16, sg_size),
                         std::min<int>(32, sg_size), sg_size>;
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+    const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
+#else
     constexpr int oversubscription = 16;
+#endif
     const auto rows = static_cast<int64>(size[0]);
     const auto cols = static_cast<int64>(size[1]);
     const auto resources =
@@ -619,7 +636,11 @@ void run_kernel_col_reduction_stage1(std::shared_ptr<const DpcppExecutor> exec,
     using subsubgroup_sizes =
         syn::value_list<int, 1, 2, 4, 8, std::min<int>(16, sg_size),
                         std::min<int>(32, sg_size), sg_size>;
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+    const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
+#else
     constexpr int oversubscription = 16;
+#endif
     const auto rows = static_cast<int64>(size[0]);
     const auto cols = static_cast<int64>(size[1]);
     const auto max_blocks =
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 0503b490e47..c88a992d54d 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -232,6 +232,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 
 
+template <typename ValueType>
+void compute_sparselib_dot(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Dense<ValueType>* x, const matrix::Dense<ValueType>* y,
+    matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec,
+                             const matrix::Dense<ValueType>* x,
+                             matrix::Dense<remove_complex<ValueType>>* result)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp
index 575d772855b..cc15836cfc3 100644
--- a/hip/base/kernel_launch_reduction.hip.hpp
+++ b/hip/base/kernel_launch_reduction.hip.hpp
@@ -43,6 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "hip/components/thread_ids.hip.hpp"
 
 
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+#include "benchmark/utils/tuning_variables.hpp"
+#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
+
+
 namespace gko {
 namespace kernels {
 namespace hip {
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index 0f1d2c455da..a0ad620eedf 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -125,6 +125,28 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 
 
+template <typename ValueType>
+void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec,
+                           const matrix::Dense<ValueType>* x,
+                           const matrix::Dense<ValueType>* y,
+                           matrix::Dense<ValueType>* result)
+{
+    if (hipblas::is_supported<ValueType>::value) {
+        auto handle = exec->get_hipblas_handle();
+        for (size_type col = 0; col < x->get_size()[1]; col++) {
+            hipblas::dot(handle, x->get_size()[0], x->get_const_values() + col,
+                         x->get_stride(), y->get_const_values() + col,
+                         y->get_stride(), result->get_values() + col);
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);
+
+
 template <typename ValueType>
 void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                             const matrix::Dense<ValueType>* x,
@@ -148,6 +170,27 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 
 
+template <typename ValueType>
+void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec,
+                             const matrix::Dense<ValueType>* x,
+                             matrix::Dense<remove_complex<ValueType>>* result)
+{
+    if (hipblas::is_supported<ValueType>::value) {
+        auto handle = exec->get_hipblas_handle();
+        for (size_type col = 0; col < x->get_size()[1]; col++) {
+            hipblas::norm2(handle, x->get_size()[0],
+                           x->get_const_values() + col, x->get_stride(),
+                           result->get_values() + col);
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp
index c95a9caa5de..8cb37efd314 100644
--- a/omp/base/kernel_launch_reduction.hpp
+++ b/omp/base/kernel_launch_reduction.hpp
@@ -42,15 +42,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <omp.h>
 
 
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+#include "benchmark/utils/tuning_variables.hpp"
+#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
+
+
 namespace gko {
 namespace kernels {
 namespace omp {
 
 
-// how many more reduction tasks we launch relative to the number of threads
-constexpr int reduction_kernel_oversubscription = 4;
-
-
 namespace {
 
 
@@ -207,6 +208,11 @@ void run_kernel_row_reduction_impl(std::shared_ptr<const OmpExecutor> exec,
                                    dim<2> size, Array<char>& tmp,
                                    MappedKernelArgs... args)
 {
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+    const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 4;
+#else
+    constexpr int oversubscription = 4;
+#endif
     constexpr int block_size = 8;
     const auto rows = static_cast<int64>(size[0]);
     const auto cols = static_cast<int64>(size[1]);
@@ -215,8 +221,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr<const OmpExecutor> exec,
         return;
     }
     // enough work to keep all threads busy or only very small reduction sizes
-    if (rows >= reduction_kernel_oversubscription * num_threads ||
-        cols < rows) {
+    if (rows >= oversubscription * num_threads || cols < rows) {
 #pragma omp parallel for
         for (int64 row = 0; row < rows; row++) {
             [&]() {
@@ -300,6 +305,11 @@ void run_kernel_col_reduction_sized_impl(
     FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size,
     Array<char>& tmp, MappedKernelArgs... args)
 {
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+    const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 4;
+#else
+    constexpr int oversubscription = 4;
+#endif
     const auto rows = static_cast<int64>(size[0]);
     const auto cols = static_cast<int64>(size[1]);
     const auto num_threads = static_cast<int64>(omp_get_max_threads());
@@ -307,8 +317,7 @@ void run_kernel_col_reduction_sized_impl(
     GKO_ASSERT(cols % block_size == remainder_cols);
     const auto num_col_blocks = ceildiv(cols, block_size);
     // enough work to keep all threads busy or only very small reduction sizes
-    if (cols >= reduction_kernel_oversubscription * num_threads ||
-        rows < cols) {
+    if (cols >= oversubscription * num_threads || rows < cols) {
 #pragma omp parallel for
         for (int64 col_block = 0; col_block < num_col_blocks; col_block++) {
             const auto base_col = col_block * block_size;
@@ -325,7 +334,7 @@ void run_kernel_col_reduction_sized_impl(
     } else {
         // number of blocks that need to be reduced afterwards
         const auto reduction_size =
-            ceildiv(reduction_kernel_oversubscription * num_threads, cols);
+            ceildiv(oversubscription * num_threads, cols);
         const auto rows_per_thread = ceildiv(rows, reduction_size);
         const auto required_storage = sizeof(ValueType) * rows * reduction_size;
         if (tmp.get_num_elems() < required_storage) {
diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index e09c54b2522..f4ba9651ffd 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -109,6 +109,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 
 
+template <typename ValueType>
+void compute_sparselib_dot(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Dense<ValueType>* x, const matrix::Dense<ValueType>* y,
+    matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec,
+                             const matrix::Dense<ValueType>* x,
+                             matrix::Dense<remove_complex<ValueType>>* result)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index 317b3599ae3..20374d75cb8 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -397,6 +397,26 @@ void compute_norm1(std::shared_ptr<const ReferenceExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
 
 
+template <typename ValueType>
+void compute_sparselib_dot(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Dense<ValueType>* x, const matrix::Dense<ValueType>* y,
+    matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec,
+                             const matrix::Dense<ValueType>* x,
+                             matrix::Dense<remove_complex<ValueType>>* result)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void fill_in_matrix_data(std::shared_ptr<const ReferenceExecutor> exec,
                          const device_matrix_data<ValueType, IndexType>& data,