Skip to content

Commit

Permalink
add tuning to reduction kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
upsj committed Apr 11, 2022
1 parent 029e0f8 commit 22c3184
Show file tree
Hide file tree
Showing 12 changed files with 310 additions and 18 deletions.
105 changes: 99 additions & 6 deletions benchmark/blas/blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "benchmark/utils/loggers.hpp"
#include "benchmark/utils/timer.hpp"
#include "benchmark/utils/types.hpp"
#include "core/matrix/dense_kernels.hpp"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING


GKO_REGISTER_OPERATION(compute_sparselib_dot, dense::compute_sparselib_dot);
GKO_REGISTER_OPERATION(compute_sparselib_norm, dense::compute_sparselib_norm2);


// Command-line arguments
Expand All @@ -57,8 +67,10 @@ DEFINE_string(
" multiaxpy (like axpy, but a has one entry per column),\n"
" scal (y = a * y),\n"
" multiscal (like scal, but a has one entry per column),\n"
" dot (a = x' * y),"
" dot (a = x' * y),\n"
" sparselib_dot (like dot, but using vendor libraries),\n"
" norm (a = sqrt(x' * x)),\n"
" sparselib_norm (like norm, but using vendor libraries),\n"
" mm (C = A * B),\n"
" gemm (C = a * A * B + b * C)\n"
"where A has dimensions n x k, B has dimensions k x m,\n"
Expand All @@ -73,6 +85,7 @@ class BenchmarkOperation {
virtual gko::size_type get_memory() const = 0;
virtual void prepare(){};
virtual void run() = 0;
virtual bool is_tunable() { return false; }
};


Expand Down Expand Up @@ -205,20 +218,36 @@ class DotOperation : public BenchmarkOperation {

void run() override { x_->compute_dot(lend(y_), lend(alpha_)); }

private:
bool is_tunable() override { return true; }

protected:
std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
std::unique_ptr<gko::matrix::Dense<etype>> x_;
std::unique_ptr<gko::matrix::Dense<etype>> y_;
};


// Variant of DotOperation that computes the dot product through the
// vendor BLAS library kernel instead of Ginkgo's own reduction kernels.
class SparselibDotOperation : public DotOperation {
    // reuse the base class constructors unchanged
    using DotOperation::DotOperation;

    void run() override
    {
        // dispatch the vendor-library kernel on the executor owning alpha_
        alpha_->get_executor()->run(
            make_compute_sparselib_dot(x_.get(), y_.get(), alpha_.get()));
    }

    // vendor-library kernels take no Ginkgo tuning parameter, so this
    // operation is excluded from the tuning loop
    bool is_tunable() override { return false; }
};


class NormOperation : public BenchmarkOperation {
public:
NormOperation(std::shared_ptr<const gko::Executor> exec,
gko::size_type rows, gko::size_type cols,
gko::size_type stride)
{
alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, cols});
alpha_ = gko::matrix::Dense<gko::remove_complex<etype>>::create(
exec, gko::dim<2>{1, cols});
y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
stride);
y_->fill(1);
Expand All @@ -236,12 +265,27 @@ class NormOperation : public BenchmarkOperation {

void run() override { y_->compute_norm2(lend(alpha_)); }

private:
std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
bool is_tunable() override { return true; }

protected:
std::unique_ptr<gko::matrix::Dense<gko::remove_complex<etype>>> alpha_;
std::unique_ptr<gko::matrix::Dense<etype>> y_;
};


// Variant of NormOperation backed by the vendor BLAS library norm2 kernel.
class SparselibNormOperation : public NormOperation {
    // reuse the base class constructors unchanged
    using NormOperation::NormOperation;

    void run() override
    {
        // dispatch the vendor-library kernel on the executor owning alpha_
        alpha_->get_executor()->run(
            make_compute_sparselib_norm(y_.get(), alpha_.get()));
    }

    // vendor-library kernels take no Ginkgo tuning parameter, so this
    // operation is excluded from the tuning loop
    bool is_tunable() override { return false; }
};


class ApplyOperation : public BenchmarkOperation {
public:
ApplyOperation(std::shared_ptr<const gko::Executor> exec, gko::size_type n,
Expand Down Expand Up @@ -407,11 +451,21 @@ std::map<std::string, std::function<std::unique_ptr<BenchmarkOperation>(
return std::make_unique<DotOperation>(
exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
}},
{"sparselib_dot",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<SparselibDotOperation>(
exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
}},
{"norm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<NormOperation>(exec, dims.n, dims.r,
dims.stride_y);
}},
{"sparselib_norm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<SparselibNormOperation>(
exec, dims.n, dims.r, dims.stride_y);
}},
{"mm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<ApplyOperation>(
Expand Down Expand Up @@ -466,6 +520,44 @@ void apply_blas(const char* operation_name, std::shared_ptr<gko::Executor> exec,
add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
allocator);

// tuning run
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
if (op->is_tunable()) {
if (!blas_case[operation_name].HasMember("tuning")) {
blas_case[operation_name].AddMember(
"tuning", rapidjson::Value(rapidjson::kObjectType),
allocator);
}
auto& tuning_case = blas_case[operation_name]["tuning"];
add_or_set_member(tuning_case, "time",
rapidjson::Value(rapidjson::kArrayType),
allocator);
add_or_set_member(tuning_case, "values",
rapidjson::Value(rapidjson::kArrayType),
allocator);

// Enable tuning for this portion of code
gko::_tuning_flag = true;
// Select some values we want to tune.
std::vector<gko::size_type> tuning_values(32);
std::iota(tuning_values.begin(), tuning_values.end(), 0);
for (auto val : tuning_values) {
gko::_tuned_value = val;
IterationControl ic_tuning{timer};
op->prepare();
for (auto _ : ic_tuning.run()) {
op->run();
}
tuning_case["time"].PushBack(ic_tuning.compute_average_time(),
allocator);
tuning_case["values"].PushBack(val, allocator);
}
// We put back the flag to false to use the default (non-tuned)
// values for the following
gko::_tuning_flag = false;
}
#endif

// compute and write benchmark data
add_or_set_member(blas_case[operation_name], "completed", true,
allocator);
Expand Down Expand Up @@ -504,7 +596,8 @@ int main(int argc, char* argv[])
" ]\n\n";
initialize_argument_parsing(&argc, &argv, header, format);

std::string extra_information = "The operations are " + FLAGS_operations;
std::string extra_information =
"The operations are " + FLAGS_operations + "\n";
print_general_information(extra_information);

auto exec = executor_factory.at(FLAGS_executor)();
Expand Down
5 changes: 2 additions & 3 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,12 +256,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
Expand Down
15 changes: 15 additions & 0 deletions core/matrix/dense_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,17 @@ namespace kernels {
matrix::Dense<remove_complex<_type>>* result, \
Array<char>& tmp)

// Declares the vendor-library (e.g. cuBLAS) column-wise dot product kernel:
// result(0, j) = x(:, j)' * y(:, j) for every column j.
#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(_type) \
void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
const matrix::Dense<_type>* y, \
matrix::Dense<_type>* result)

// Declares the vendor-library (e.g. cuBLAS) column-wise Euclidean norm
// kernel; the result value type is real even for complex input.
#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(_type) \
void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
matrix::Dense<remove_complex<_type>>* result)

#define GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(_type, _prec) \
void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec, \
const device_matrix_data<_type, _prec>& data, \
Expand Down Expand Up @@ -339,6 +350,10 @@ namespace kernels {
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
Expand Down
5 changes: 5 additions & 0 deletions cuda/base/kernel_launch_reduction.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda/components/thread_ids.cuh"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING


namespace gko {
namespace kernels {
namespace cuda {
Expand Down
42 changes: 42 additions & 0 deletions cuda/matrix/dense_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,28 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);


/**
 * Computes the column-wise dot products result(0, j) = x(:, j)' * y(:, j)
 * using cuBLAS, issuing one cublas::dot call per column with the matrix
 * stride as the element increment.
 *
 * Raises a not-implemented error for value types cuBLAS does not support.
 */
template <typename ValueType>
void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec,
                           const matrix::Dense<ValueType>* x,
                           const matrix::Dense<ValueType>* y,
                           matrix::Dense<ValueType>* result)
{
    if (!cublas::is_supported<ValueType>::value) {
        GKO_NOT_IMPLEMENTED;
    }
    auto handle = exec->get_cublas_handle();
    const auto num_rows = x->get_size()[0];
    const auto num_cols = x->get_size()[1];
    for (size_type j = 0; j < num_cols; ++j) {
        // column j starts at offset j; consecutive entries are stride apart
        cublas::dot(handle, num_rows, x->get_const_values() + j,
                    x->get_stride(), y->get_const_values() + j,
                    y->get_stride(), result->get_values() + j);
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);


template <typename ValueType>
void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* x,
Expand All @@ -145,6 +167,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);


/**
 * Computes the Euclidean norm of each column of x using cuBLAS, issuing one
 * cublas::norm2 call per column with the matrix stride as the element
 * increment; result(0, j) receives the (real-valued) norm of column j.
 *
 * Raises a not-implemented error for value types cuBLAS does not support.
 */
template <typename ValueType>
void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec,
                             const matrix::Dense<ValueType>* x,
                             matrix::Dense<remove_complex<ValueType>>* result)
{
    if (!cublas::is_supported<ValueType>::value) {
        GKO_NOT_IMPLEMENTED;
    }
    auto handle = exec->get_cublas_handle();
    const auto num_rows = x->get_size()[0];
    const auto num_cols = x->get_size()[1];
    for (size_type j = 0; j < num_cols; ++j) {
        // column j starts at offset j; consecutive entries are stride apart
        cublas::norm2(handle, num_rows, x->get_const_values() + j,
                      x->get_stride(), result->get_values() + j);
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);


template <typename ValueType>
void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* a,
Expand Down
21 changes: 21 additions & 0 deletions dpcpp/base/kernel_launch_reduction.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dpcpp/components/uninitialized_array.hpp"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING


namespace gko {
namespace kernels {
namespace dpcpp {
Expand Down Expand Up @@ -170,7 +175,11 @@ void run_kernel_reduction_impl(std::shared_ptr<const DpcppExecutor> exec,
ValueType* result, size_type size,
Array<char>& tmp, MappedKernelArgs... args)
{
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 4;
#else
constexpr int oversubscription = 4;
#endif
constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
const auto num_workgroups =
Expand Down Expand Up @@ -214,7 +223,11 @@ void run_kernel_reduction_impl(std::shared_ptr<const DpcppExecutor> exec,
ValueType* result, dim<2> size, Array<char>& tmp,
MappedKernelArgs... args)
{
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 4;
#else
constexpr int oversubscription = 4;
#endif
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
const auto flat_size = rows * cols;
Expand Down Expand Up @@ -566,7 +579,11 @@ void run_kernel_row_reduction_stage1(std::shared_ptr<const DpcppExecutor> exec,
using subsubgroup_sizes =
syn::value_list<int, 1, 2, 4, 8, std::min<int>(16, sg_size),
std::min<int>(32, sg_size), sg_size>;
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
const auto resources =
Expand Down Expand Up @@ -619,7 +636,11 @@ void run_kernel_col_reduction_stage1(std::shared_ptr<const DpcppExecutor> exec,
using subsubgroup_sizes =
syn::value_list<int, 1, 2, 4, 8, std::min<int>(16, sg_size),
std::min<int>(32, sg_size), sg_size>;
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
const auto max_blocks =
Expand Down
20 changes: 20 additions & 0 deletions dpcpp/matrix/dense_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);


// Vendor-library column-wise dot product is not available on the DPC++
// backend; the body is the GKO_NOT_IMPLEMENTED placeholder, so calling this
// kernel reports a not-implemented error.
template <typename ValueType>
void compute_sparselib_dot(
    std::shared_ptr<const DefaultExecutor> exec,
    const matrix::Dense<ValueType>* x, const matrix::Dense<ValueType>* y,
    matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);


// Vendor-library column-wise norm2 is not available on the DPC++ backend;
// the body is the GKO_NOT_IMPLEMENTED placeholder, so calling this kernel
// reports a not-implemented error.
template <typename ValueType>
void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec,
                             const matrix::Dense<ValueType>* x,
                             matrix::Dense<remove_complex<ValueType>>* result)
    GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);


template <typename ValueType>
void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* a,
Expand Down
5 changes: 5 additions & 0 deletions hip/base/kernel_launch_reduction.hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "hip/components/thread_ids.hip.hpp"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING


namespace gko {
namespace kernels {
namespace hip {
Expand Down
Loading

0 comments on commit 22c3184

Please sign in to comment.