From 22c318448b8fe41a83ed4d468e8029d9df2f7994 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 21 Mar 2022 15:47:01 +0100 Subject: [PATCH] add tuning to reduction kernels --- benchmark/blas/blas.cpp | 105 ++++++++++++++++++++-- core/device_hooks/common_kernels.inc.cpp | 5 +- core/matrix/dense_kernels.hpp | 15 ++++ cuda/base/kernel_launch_reduction.cuh | 5 ++ cuda/matrix/dense_kernels.cu | 42 +++++++++ dpcpp/base/kernel_launch_reduction.dp.hpp | 21 +++++ dpcpp/matrix/dense_kernels.dp.cpp | 20 +++++ hip/base/kernel_launch_reduction.hip.hpp | 5 ++ hip/matrix/dense_kernels.hip.cpp | 43 +++++++++ omp/base/kernel_launch_reduction.hpp | 27 ++++-- omp/matrix/dense_kernels.cpp | 20 +++++ reference/matrix/dense_kernels.cpp | 20 +++++ 12 files changed, 310 insertions(+), 18 deletions(-) diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp index 6dfe2414a9f..39d47dc9339 100644 --- a/benchmark/blas/blas.cpp +++ b/benchmark/blas/blas.cpp @@ -46,6 +46,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/loggers.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" +#include "core/matrix/dense_kernels.hpp" + + +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING +#include "benchmark/utils/tuning_variables.hpp" +#endif // GINKGO_BENCHMARK_ENABLE_TUNING + + +GKO_REGISTER_OPERATION(compute_sparselib_dot, dense::compute_sparselib_dot); +GKO_REGISTER_OPERATION(compute_sparselib_norm, dense::compute_sparselib_norm2); // Command-line arguments @@ -57,8 +67,10 @@ DEFINE_string( " multiaxpy (like axpy, but a has one entry per column),\n" " scal (y = a * y),\n" " multiscal (like scal, but a has one entry per column),\n" - " dot (a = x' * y)," + " dot (a = x' * y),\n" + " sparselib_dot (like dot, but using vendor libraries),\n" " norm (a = sqrt(x' * x)),\n" + " sparselib_norm (like norm, but using vendor libraries),\n" " mm (C = A * B),\n" " gemm (C = a * A * B + b * C)\n" "where A has dimensions n x k, B has dimensions k x m,\n" @@ -73,6 +85,7 @@ class BenchmarkOperation { virtual gko::size_type get_memory() const = 0; virtual void prepare(){}; virtual void run() = 0; + virtual bool is_tunable() { return false; } }; @@ -205,20 +218,36 @@ class DotOperation : public BenchmarkOperation { void run() override { x_->compute_dot(lend(y_), lend(alpha_)); } -private: + bool is_tunable() override { return true; } + +protected: std::unique_ptr> alpha_; std::unique_ptr> x_; std::unique_ptr> y_; }; +class SparselibDotOperation : public DotOperation { + using DotOperation::DotOperation; + + void run() override + { + auto exec = alpha_->get_executor(); + exec->run(make_compute_sparselib_dot(x_.get(), y_.get(), alpha_.get())); + } + + bool is_tunable() override { return false; } +}; + + class NormOperation : public BenchmarkOperation { public: NormOperation(std::shared_ptr exec, gko::size_type rows, gko::size_type cols, gko::size_type stride) { - alpha_ = gko::matrix::Dense::create(exec, gko::dim<2>{1, cols}); + alpha_ = gko::matrix::Dense>::create( + exec, gko::dim<2>{1, cols}); y_ = gko::matrix::Dense::create(exec, gko::dim<2>{rows, cols}, stride); y_->fill(1); @@ -236,12 +265,27 @@ class NormOperation : public BenchmarkOperation { void run() override { y_->compute_norm2(lend(alpha_)); } -private: - std::unique_ptr> alpha_; + bool is_tunable() override { return true; } + +protected: + std::unique_ptr>> alpha_; std::unique_ptr> y_; }; +class SparselibNormOperation : public NormOperation { + using NormOperation::NormOperation; + + void run() override + { + auto exec = alpha_->get_executor(); + exec->run(make_compute_sparselib_norm(y_.get(), alpha_.get())); + } + + bool is_tunable() override { return false; } +}; + + class ApplyOperation : public BenchmarkOperation { public: ApplyOperation(std::shared_ptr exec, gko::size_type n, @@ -407,11 +451,21 @@ std::map( return std::make_unique( exec, dims.n, dims.r, dims.stride_x, dims.stride_y); }}, + {"sparselib_dot", + [](std::shared_ptr exec, dimensions dims) { + return std::make_unique( + exec, dims.n, dims.r, dims.stride_x, dims.stride_y); + }}, {"norm", [](std::shared_ptr exec, dimensions dims) { return std::make_unique(exec, dims.n, dims.r, dims.stride_y); }}, + {"sparselib_norm", + [](std::shared_ptr exec, dimensions dims) { + return std::make_unique( + exec, dims.n, dims.r, dims.stride_y); + }}, {"mm", [](std::shared_ptr exec, dimensions dims) { return std::make_unique( @@ -466,6 +520,44 @@ void apply_blas(const char* operation_name, std::shared_ptr exec, add_or_set_member(blas_case[operation_name], "repetitions", repetitions, allocator); + // tuning run +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING + if (op->is_tunable()) { + if (!blas_case[operation_name].HasMember("tuning")) { + blas_case[operation_name].AddMember( + "tuning", rapidjson::Value(rapidjson::kObjectType), + allocator); + } + auto& tuning_case = blas_case[operation_name]["tuning"]; + add_or_set_member(tuning_case, "time", + rapidjson::Value(rapidjson::kArrayType), + allocator); + add_or_set_member(tuning_case, "values", + rapidjson::Value(rapidjson::kArrayType), + allocator); + + // Enable tuning for this portion of code + gko::_tuning_flag = true; + // Select some values we want to tune. + std::vector tuning_values(32); + std::iota(tuning_values.begin(), tuning_values.end(), 0); + for (auto val : tuning_values) { + gko::_tuned_value = val; + IterationControl ic_tuning{timer}; + op->prepare(); + for (auto _ : ic_tuning.run()) { + op->run(); + } + tuning_case["time"].PushBack(ic_tuning.compute_average_time(), + allocator); + tuning_case["values"].PushBack(val, allocator); + } + // We put back the flag to false to use the default (non-tuned) + // values for the following + gko::_tuning_flag = false; + } +#endif + // compute and write benchmark data add_or_set_member(blas_case[operation_name], "completed", true, allocator); @@ -504,7 +596,8 @@ int main(int argc, char* argv[]) " ]\n\n"; initialize_argument_parsing(&argc, &argv, header, format); - std::string extra_information = "The operations are " + FLAGS_operations; + std::string extra_information = + "The operations are " + FLAGS_operations + "\n"; print_general_information(extra_information); auto exec = executor_factory.at(FLAGS_executor)(); diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index a89d1f3c140..d63e75684f8 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -256,12 +256,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); -GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index 3cf15052826..89bb9f7042d 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -146,6 +146,17 @@ namespace kernels { matrix::Dense>* result, \ Array& tmp) +#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(_type) \ + void compute_sparselib_dot(std::shared_ptr exec, \ + const matrix::Dense<_type>* x, \ + const matrix::Dense<_type>* y, \ + matrix::Dense<_type>* result) + +#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(_type) \ + void compute_sparselib_norm2(std::shared_ptr exec, \ + const matrix::Dense<_type>* x, \ + matrix::Dense>* result) + #define GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(_type, _prec) \ void fill_in_matrix_data(std::shared_ptr exec, \ const device_matrix_data<_type, _prec>& data, \ @@ -339,6 +350,10 @@ namespace kernels { GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \ template \ GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(ValueType); \ template \ GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ template \ diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index e6da25a1706..32783ed0951 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -43,6 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/thread_ids.cuh" +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING +#include "benchmark/utils/tuning_variables.hpp" +#endif // GINKGO_BENCHMARK_ENABLE_TUNING + + namespace gko { namespace kernels { namespace cuda { diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 7eeebd82357..2866c40a792 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -122,6 +122,28 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); +template +void compute_sparselib_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + if (cublas::is_supported::value) { + auto handle = exec->get_cublas_handle(); + for (size_type col = 0; col < x->get_size()[1]; col++) { + cublas::dot(handle, x->get_size()[0], x->get_const_values() + col, + x->get_stride(), y->get_const_values() + col, + y->get_stride(), result->get_values() + col); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL); + + template void compute_norm2_dispatch(std::shared_ptr exec, const matrix::Dense* x, @@ -145,6 +167,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); +template +void compute_sparselib_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) +{ + if (cublas::is_supported::value) { + auto handle = exec->get_cublas_handle(); + for (size_type col = 0; col < x->get_size()[1]; col++) { + cublas::norm2(handle, x->get_size()[0], x->get_const_values() + col, + x->get_stride(), result->get_values() + col); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL); + + template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 601527c6cae..30f0548ff02 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -48,6 +48,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/components/uninitialized_array.hpp" +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING +#include "benchmark/utils/tuning_variables.hpp" +#endif // GINKGO_BENCHMARK_ENABLE_TUNING + + namespace gko { namespace kernels { namespace dpcpp { @@ -170,7 +175,11 @@ void run_kernel_reduction_impl(std::shared_ptr exec, ValueType* result, size_type size, Array& tmp, MappedKernelArgs... args) { +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING + const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 4; +#else constexpr int oversubscription = 4; +#endif constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); const auto num_workgroups = @@ -214,7 +223,11 @@ void run_kernel_reduction_impl(std::shared_ptr exec, ValueType* result, dim<2> size, Array& tmp, MappedKernelArgs... args) { +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING + const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 4; +#else constexpr int oversubscription = 4; +#endif const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); const auto flat_size = rows * cols; @@ -566,7 +579,11 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, using subsubgroup_sizes = syn::value_list(16, sg_size), std::min(32, sg_size), sg_size>; +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING + const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16; +#else constexpr int oversubscription = 16; +#endif const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); const auto resources = @@ -619,7 +636,11 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, using subsubgroup_sizes = syn::value_list(16, sg_size), std::min(32, sg_size), sg_size>; +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING + const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16; +#else constexpr int oversubscription = 16; +#endif const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); const auto max_blocks = diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 0503b490e47..c88a992d54d 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -232,6 +232,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); +template +void compute_sparselib_dot( + std::shared_ptr exec, + const matrix::Dense* x, const matrix::Dense* y, + matrix::Dense* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL); + + +template +void compute_sparselib_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL); + + template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 575d772855b..cc15836cfc3 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -43,6 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/components/thread_ids.hip.hpp" +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING +#include "benchmark/utils/tuning_variables.hpp" +#endif // GINKGO_BENCHMARK_ENABLE_TUNING + + namespace gko { namespace kernels { namespace hip { diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 0f1d2c455da..a0ad620eedf 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -125,6 +125,28 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); +template +void compute_sparselib_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + if (hipblas::is_supported::value) { + auto handle = exec->get_hipblas_handle(); + for (size_type col = 0; col < x->get_size()[1]; col++) { + hipblas::dot(handle, x->get_size()[0], x->get_const_values() + col, + x->get_stride(), y->get_const_values() + col, + y->get_stride(), result->get_values() + col); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL); + + template void compute_norm2_dispatch(std::shared_ptr exec, const matrix::Dense* x, @@ -148,6 +170,27 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); +template +void compute_sparselib_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) +{ + if (hipblas::is_supported::value) { + auto handle = exec->get_hipblas_handle(); + for (size_type col = 0; col < x->get_size()[1]; col++) { + hipblas::norm2(handle, x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + result->get_values() + col); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL); + + template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index c95a9caa5de..8cb37efd314 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -42,15 +42,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING +#include "benchmark/utils/tuning_variables.hpp" +#endif // GINKGO_BENCHMARK_ENABLE_TUNING + + namespace gko { namespace kernels { namespace omp { -// how many more reduction tasks we launch relative to the number of threads -constexpr int reduction_kernel_oversubscription = 4; - - namespace { @@ -207,6 +208,11 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, dim<2> size, Array& tmp, MappedKernelArgs... args) { +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING + const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 4; +#else + constexpr int oversubscription = 4; +#endif constexpr int block_size = 8; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -215,8 +221,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, return; } // enough work to keep all threads busy or only very small reduction sizes - if (rows >= reduction_kernel_oversubscription * num_threads || - cols < rows) { + if (rows >= oversubscription * num_threads || cols < rows) { #pragma omp parallel for for (int64 row = 0; row < rows; row++) { [&]() { @@ -300,6 +305,11 @@ void run_kernel_col_reduction_sized_impl( FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, Array& tmp, MappedKernelArgs... args) { +#ifdef GINKGO_BENCHMARK_ENABLE_TUNING + const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 4; +#else + constexpr int oversubscription = 4; +#endif const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); const auto num_threads = static_cast(omp_get_max_threads()); @@ -307,8 +317,7 @@ void run_kernel_col_reduction_sized_impl( GKO_ASSERT(cols % block_size == remainder_cols); const auto num_col_blocks = ceildiv(cols, block_size); // enough work to keep all threads busy or only very small reduction sizes - if (cols >= reduction_kernel_oversubscription * num_threads || - rows < cols) { + if (cols >= oversubscription * num_threads || rows < cols) { #pragma omp parallel for for (int64 col_block = 0; col_block < num_col_blocks; col_block++) { const auto base_col = col_block * block_size; @@ -325,7 +334,7 @@ void run_kernel_col_reduction_sized_impl( } else { // number of blocks that need to be reduced afterwards const auto reduction_size = - ceildiv(reduction_kernel_oversubscription * num_threads, cols); + ceildiv(oversubscription * num_threads, cols); const auto rows_per_thread = ceildiv(rows, reduction_size); const auto required_storage = sizeof(ValueType) * rows * reduction_size; if (tmp.get_num_elems() < required_storage) { diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index e09c54b2522..f4ba9651ffd 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -109,6 +109,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); +template +void compute_sparselib_dot( + std::shared_ptr exec, + const matrix::Dense* x, const matrix::Dense* y, + matrix::Dense* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL); + + +template +void compute_sparselib_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL); + + template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index 317b3599ae3..20374d75cb8 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -397,6 +397,26 @@ void compute_norm1(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); +template +void compute_sparselib_dot( + std::shared_ptr exec, + const matrix::Dense* x, const matrix::Dense* y, + matrix::Dense* result) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL); + + +template +void compute_sparselib_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL); + + template void fill_in_matrix_data(std::shared_ptr exec, const device_matrix_data& data,