Add tuning to reduction kernels and improve tuning #991

Closed · wants to merge 1 commit
105 changes: 99 additions & 6 deletions benchmark/blas/blas.cpp
@@ -46,6 +46,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "benchmark/utils/loggers.hpp"
#include "benchmark/utils/timer.hpp"
#include "benchmark/utils/types.hpp"
#include "core/matrix/dense_kernels.hpp"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING
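
Note: benchmark/utils/tuning_variables.hpp itself is not part of this diff; judging from the symbols used throughout (gko::_tuning_flag, gko::_tuned_value), it presumably declares something like the following hedged sketch:

namespace gko {

// Global tuning knobs: set by the benchmark driver, read by tunable kernels.
extern bool _tuning_flag;
extern size_type _tuned_value;

}  // namespace gko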


GKO_REGISTER_OPERATION(compute_sparselib_dot, dense::compute_sparselib_dot);
GKO_REGISTER_OPERATION(compute_sparselib_norm, dense::compute_sparselib_norm2);
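
Note: GKO_REGISTER_OPERATION wraps a kernel entry point in an Operation functor and generates a make_<name> helper, so a backend-agnostic call site can dispatch the kernel through an executor. A minimal call-side sketch using the names registered above:

// The executor selects the matching backend implementation at run time.
exec->run(make_compute_sparselib_dot(x.get(), y.get(), alpha.get()));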


// Command-line arguments
@@ -57,8 +67,10 @@ DEFINE_string(
" multiaxpy (like axpy, but a has one entry per column),\n"
" scal (y = a * y),\n"
" multiscal (like scal, but a has one entry per column),\n"
" dot (a = x' * y),"
" dot (a = x' * y),\n"
" sparselib_dot (like dot, but using vendor libraries),\n"
" norm (a = sqrt(x' * x)),\n"
" sparselib_norm (like norm, but using vendor libraries),\n"
" mm (C = A * B),\n"
" gemm (C = a * A * B + b * C)\n"
"where A has dimensions n x k, B has dimensions k x m,\n"
@@ -73,6 +85,7 @@ class BenchmarkOperation {
virtual gko::size_type get_memory() const = 0;
virtual void prepare(){};
virtual void run() = 0;
virtual bool is_tunable() { return false; }
};


@@ -205,20 +218,36 @@ class DotOperation : public BenchmarkOperation {

void run() override { x_->compute_dot(lend(y_), lend(alpha_)); }

private:
bool is_tunable() override { return true; }

protected:
std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
std::unique_ptr<gko::matrix::Dense<etype>> x_;
std::unique_ptr<gko::matrix::Dense<etype>> y_;
};


class SparselibDotOperation : public DotOperation {
using DotOperation::DotOperation;

void run() override
{
auto exec = alpha_->get_executor();
exec->run(make_compute_sparselib_dot(x_.get(), y_.get(), alpha_.get()));
}

bool is_tunable() override { return false; }
};
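
Note: the backend kernels behind make_compute_sparselib_dot are outside this excerpt. A hedged sketch of what a CUDA implementation could look like for double precision, assuming a cuBLAS handle is available and that Dense uses row-major storage (so one column's entries are spaced by the row stride); handle acquisition and error handling are elided, and this is illustrative, not the actual Ginkgo kernel:

#include <cublas_v2.h>

// Hedged sketch: column-wise dot products x(:,j)' * y(:,j) into result[j].
void sparselib_dot_sketch(cublasHandle_t handle, int rows, int cols,
                          const double* x, int x_stride, const double* y,
                          int y_stride, double* result)
{
    for (int col = 0; col < cols; ++col) {
        // Column col starts at offset col; consecutive entries of the
        // column are one row stride apart in row-major storage.
        cublasDdot(handle, rows, x + col, x_stride, y + col, y_stride,
                   result + col);
    }
}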


class NormOperation : public BenchmarkOperation {
public:
NormOperation(std::shared_ptr<const gko::Executor> exec,
gko::size_type rows, gko::size_type cols,
gko::size_type stride)
{
alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, cols});
alpha_ = gko::matrix::Dense<gko::remove_complex<etype>>::create(
exec, gko::dim<2>{1, cols});
y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
stride);
y_->fill(1);
@@ -236,12 +265,27 @@ class NormOperation : public BenchmarkOperation {

void run() override { y_->compute_norm2(lend(alpha_)); }

private:
std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
bool is_tunable() override { return true; }

protected:
std::unique_ptr<gko::matrix::Dense<gko::remove_complex<etype>>> alpha_;
std::unique_ptr<gko::matrix::Dense<etype>> y_;
};
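
Note: the change of alpha_ to Dense<gko::remove_complex<etype>> reflects that a 2-norm is real-valued even for complex element types, so compute_norm2 writes into a vector of the underlying real type. A hedged illustration:

#include <complex>
#include <type_traits>

// gko::remove_complex maps a complex element type to its real counterpart.
static_assert(
    std::is_same<gko::remove_complex<std::complex<double>>, double>::value,
    "norm2 results are stored using the real type");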


class SparselibNormOperation : public NormOperation {
using NormOperation::NormOperation;

void run() override
{
auto exec = alpha_->get_executor();
exec->run(make_compute_sparselib_norm(y_.get(), alpha_.get()));
}

bool is_tunable() override { return false; }
};


class ApplyOperation : public BenchmarkOperation {
public:
ApplyOperation(std::shared_ptr<const gko::Executor> exec, gko::size_type n,
@@ -407,11 +451,21 @@ std::map<std::string, std::function<std::unique_ptr<BenchmarkOperation>(
return std::make_unique<DotOperation>(
exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
}},
{"sparselib_dot",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<SparselibDotOperation>(
exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
}},
{"norm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<NormOperation>(exec, dims.n, dims.r,
dims.stride_y);
}},
{"sparselib_norm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<SparselibNormOperation>(
exec, dims.n, dims.r, dims.stride_y);
}},
{"mm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<ApplyOperation>(
@@ -466,6 +520,44 @@ void apply_blas(const char* operation_name, std::shared_ptr<gko::Executor> exec,
add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
allocator);

// tuning run
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
if (op->is_tunable()) {
if (!blas_case[operation_name].HasMember("tuning")) {
blas_case[operation_name].AddMember(
"tuning", rapidjson::Value(rapidjson::kObjectType),
allocator);
}
auto& tuning_case = blas_case[operation_name]["tuning"];
add_or_set_member(tuning_case, "time",
rapidjson::Value(rapidjson::kArrayType),
allocator);
add_or_set_member(tuning_case, "values",
rapidjson::Value(rapidjson::kArrayType),
allocator);

// Enable tuning for this portion of code
gko::_tuning_flag = true;
// Select the values to sweep for the tuned parameter (here 0 through 31).
std::vector<gko::size_type> tuning_values(32);
std::iota(tuning_values.begin(), tuning_values.end(), 0);
for (auto val : tuning_values) {
gko::_tuned_value = val;
IterationControl ic_tuning{timer};
op->prepare();
for (auto _ : ic_tuning.run()) {
op->run();
}
tuning_case["time"].PushBack(ic_tuning.compute_average_time(),
allocator);
tuning_case["values"].PushBack(val, allocator);
}
// Reset the flag so the remaining (non-tuning) benchmark runs use
// the default values
gko::_tuning_flag = false;
}
#endif
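
Note: with tuning enabled, each tunable case in the JSON output gains a "tuning" member holding one average time per tried value. An illustrative shape only; the timings below are made up:

"dot": {
    "tuning": {
        "time": [2.1e-05, 1.8e-05, ...],
        "values": [0, 1, ...]
    },
    "repetitions": ...,
    "completed": true
}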

// compute and write benchmark data
add_or_set_member(blas_case[operation_name], "completed", true,
allocator);
Expand Down Expand Up @@ -504,7 +596,8 @@ int main(int argc, char* argv[])
" ]\n\n";
initialize_argument_parsing(&argc, &argv, header, format);

std::string extra_information = "The operations are " + FLAGS_operations;
std::string extra_information =
"The operations are " + FLAGS_operations + "\n";
print_general_information(extra_information);

auto exec = executor_factory.at(FLAGS_executor)();
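
Note: a usage sketch, assuming the binary is built as blas, that the flag names are the DEFINE_* flags in this file, and that stdin carries the JSON array of dimension objects described by the header printed above:

./blas --executor=cuda --operations="dot,sparselib_dot,norm,sparselib_norm" \
    < dims.json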
48 changes: 32 additions & 16 deletions common/cuda_hip/base/kernel_launch_reduction.hpp.inc
@@ -124,7 +128,11 @@ void run_kernel_reduction_cached(std::shared_ptr<const DefaultExecutor> exec,
ValueType* result, size_type size,
Array<char>& tmp, KernelArgs&&... args)
{
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
constexpr auto block_size = default_block_size;
const auto num_blocks = std::min<int64>(
ceildiv(size, block_size), exec->get_num_warps() * oversubscription);
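
Note: this cap is exactly what the sweep in apply_blas exercises, since num_blocks = min(ceildiv(size, block_size), num_warps * oversubscription). A worked example with assumed numbers:

// Assumed hardware/problem numbers, for illustration only:
//   exec->get_num_warps() = 5120, block_size = 512, size = 1e7
//   ceildiv(1e7, 512)       = 19532
//   5120 * 16 (default)     = 81920  -> num_blocks = 19532 (uncapped)
//   5120 * 2  (tuned value) = 10240  -> num_blocks = 10240 (capped),
// i.e. smaller oversubscription makes each block reduce more elements
// sequentially before the cross-block finalize step.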
@@ -161,7 +165,11 @@ void run_kernel_reduction_cached(std::shared_ptr<const DefaultExecutor> exec,
ValueType* result, dim<2> size,
Array<char>& tmp, KernelArgs&&... args)
{
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
constexpr auto block_size = default_block_size;
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
@@ -195,10 +203,10 @@ void run_kernel_reduction_cached(std::shared_ptr<const DefaultExecutor> exec,
template <int subwarp_size, typename ValueType, typename KernelFunction,
typename ReductionOp, typename FinalizeOp, typename... KernelArgs>
__global__
__launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d(
int64 rows, int64 cols, int64 col_blocks, KernelFunction fn,
ReductionOp op, FinalizeOp finalize, ValueType identity,
ValueType* result, int64 result_stride, KernelArgs... args)
__launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d(
int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
int64 result_stride, KernelArgs... args)
{
const auto idx = thread::get_subwarp_id_flat<subwarp_size, int64>();
const auto row = idx % rows;
@@ -227,10 +235,10 @@ __global__
template <int subwarp_size, typename ValueType, typename KernelFunction,
typename ReductionOp, typename FinalizeOp, typename... KernelArgs>
__global__
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small(
int64 rows, int64 cols, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
KernelArgs... args)
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small(
int64 rows, int64 cols, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
KernelArgs... args)
{
constexpr auto warp_size = config::warp_size;
constexpr auto warps_per_block = default_block_size / warp_size;
@@ -290,10 +298,10 @@ __global__
template <typename ValueType, typename KernelFunction, typename ReductionOp,
typename FinalizeOp, typename... KernelArgs>
__global__
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked(
int64 rows, int64 cols, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
KernelArgs... args)
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked(
int64 rows, int64 cols, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
KernelArgs... args)
{
constexpr auto warp_size = config::warp_size;
__shared__ UninitializedArray<ValueType, default_block_size> block_partial;
@@ -329,10 +337,10 @@ __global__

template <typename ValueType, typename ReductionOp, typename FinalizeOp>
__global__
__launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d(
int64 num_results, int64 num_blocks, ReductionOp op,
FinalizeOp finalize, ValueType identity, const ValueType* input,
int64 result_stride, ValueType* result)
__launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d(
int64 num_results, int64 num_blocks, ReductionOp op, FinalizeOp finalize,
ValueType identity, const ValueType* input, int64 result_stride,
ValueType* result)
{
const auto idx = thread::get_thread_id_flat<int64>();
if (idx >= num_results) {
@@ -430,7 +438,11 @@ void run_kernel_row_reduction_cached(
{
using subwarp_sizes =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
const auto resources =
@@ -480,7 +492,11 @@ void run_kernel_col_reduction_cached(
{
using subwarp_sizes =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
const auto max_blocks = exec->get_num_warps() * config::warp_size *
5 changes: 2 additions & 3 deletions core/device_hooks/common_kernels.inc.cpp
@@ -256,12 +256,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
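
Note: the GKO_STUB_* lines above give each device-hook module a placeholder definition that throws when that backend is not compiled in. Roughly, and as a hedged sketch rather than the exact macro output, the generated shape for the new dot stub is:

template <typename ValueType>
void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec,
                           const matrix::Dense<ValueType>* x,
                           const matrix::Dense<ValueType>* y,
                           matrix::Dense<ValueType>* result)
    GKO_NOT_COMPILED(GKO_HOOK_MODULE);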
15 changes: 15 additions & 0 deletions core/matrix/dense_kernels.hpp
@@ -146,6 +146,17 @@ namespace kernels {
matrix::Dense<remove_complex<_type>>* result, \
Array<char>& tmp)

#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(_type) \
void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
const matrix::Dense<_type>* y, \
matrix::Dense<_type>* result)

#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(_type) \
void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
matrix::Dense<remove_complex<_type>>* result)
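
Note: for a concrete element type the first macro expands to a plain declaration; GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(double), modulo whitespace, is:

void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec,
                           const matrix::Dense<double>* x,
                           const matrix::Dense<double>* y,
                           matrix::Dense<double>* result)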

#define GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(_type, _prec) \
void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec, \
const device_matrix_data<_type, _prec>& data, \
@@ -339,6 +350,10 @@ namespace kernels {
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
5 changes: 5 additions & 0 deletions cuda/base/kernel_launch_reduction.cuh
@@ -43,6 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda/components/thread_ids.cuh"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING


namespace gko {
namespace kernels {
namespace cuda {