Skip to content

Commit

Permalink
add tuning to reduction kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
upsj committed Apr 11, 2022
1 parent 029e0f8 commit 22c3184
Show file tree
Hide file tree
Showing 12 changed files with 310 additions and 18 deletions.
105 changes: 99 additions & 6 deletions benchmark/blas/blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "benchmark/utils/loggers.hpp"
#include "benchmark/utils/timer.hpp"
#include "benchmark/utils/types.hpp"
#include "core/matrix/dense_kernels.hpp"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING


GKO_REGISTER_OPERATION(compute_sparselib_dot, dense::compute_sparselib_dot);
GKO_REGISTER_OPERATION(compute_sparselib_norm, dense::compute_sparselib_norm2);


// Command-line arguments
Expand All @@ -57,8 +67,10 @@ DEFINE_string(
" multiaxpy (like axpy, but a has one entry per column),\n"
" scal (y = a * y),\n"
" multiscal (like scal, but a has one entry per column),\n"
" dot (a = x' * y),"
" dot (a = x' * y),\n"
" sparselib_dot (like dot, but using vendor libraries),\n"
" norm (a = sqrt(x' * x)),\n"
" sparselib_norm (like norm, but using vendor libraries),\n"
" mm (C = A * B),\n"
" gemm (C = a * A * B + b * C)\n"
"where A has dimensions n x k, B has dimensions k x m,\n"
Expand All @@ -73,6 +85,7 @@ class BenchmarkOperation {
virtual gko::size_type get_memory() const = 0;
virtual void prepare(){};
virtual void run() = 0;
virtual bool is_tunable() { return false; }
};


Expand Down Expand Up @@ -205,20 +218,36 @@ class DotOperation : public BenchmarkOperation {

void run() override { x_->compute_dot(lend(y_), lend(alpha_)); }

private:
bool is_tunable() override { return true; }

protected:
std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
std::unique_ptr<gko::matrix::Dense<etype>> x_;
std::unique_ptr<gko::matrix::Dense<etype>> y_;
};


// Variant of DotOperation that computes the dot product through the
// vendor BLAS library kernel instead of Ginkgo's own reduction kernels.
class SparselibDotOperation : public DotOperation {
    // reuse the base class constructors unchanged
    using DotOperation::DotOperation;

    void run() override
    {
        // dispatch the vendor-library kernel on the executor owning alpha_
        alpha_->get_executor()->run(
            make_compute_sparselib_dot(x_.get(), y_.get(), alpha_.get()));
    }

    // vendor-library kernels take no Ginkgo tuning parameter, so this
    // operation is excluded from the tuning loop
    bool is_tunable() override { return false; }
};


class NormOperation : public BenchmarkOperation {
public:
NormOperation(std::shared_ptr<const gko::Executor> exec,
gko::size_type rows, gko::size_type cols,
gko::size_type stride)
{
alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, cols});
alpha_ = gko::matrix::Dense<gko::remove_complex<etype>>::create(
exec, gko::dim<2>{1, cols});
y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
stride);
y_->fill(1);
Expand All @@ -236,12 +265,27 @@ class NormOperation : public BenchmarkOperation {

void run() override { y_->compute_norm2(lend(alpha_)); }

private:
std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
bool is_tunable() override { return true; }

protected:
std::unique_ptr<gko::matrix::Dense<gko::remove_complex<etype>>> alpha_;
std::unique_ptr<gko::matrix::Dense<etype>> y_;
};


// Variant of NormOperation backed by the vendor BLAS library norm2 kernel.
class SparselibNormOperation : public NormOperation {
    // reuse the base class constructors unchanged
    using NormOperation::NormOperation;

    void run() override
    {
        // dispatch the vendor-library kernel on the executor owning alpha_
        alpha_->get_executor()->run(
            make_compute_sparselib_norm(y_.get(), alpha_.get()));
    }

    // vendor-library kernels take no Ginkgo tuning parameter, so this
    // operation is excluded from the tuning loop
    bool is_tunable() override { return false; }
};


class ApplyOperation : public BenchmarkOperation {
public:
ApplyOperation(std::shared_ptr<const gko::Executor> exec, gko::size_type n,
Expand Down Expand Up @@ -407,11 +451,21 @@ std::map<std::string, std::function<std::unique_ptr<BenchmarkOperation>(
return std::make_unique<DotOperation>(
exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
}},
{"sparselib_dot",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<SparselibDotOperation>(
exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
}},
{"norm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<NormOperation>(exec, dims.n, dims.r,
dims.stride_y);
}},
{"sparselib_norm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<SparselibNormOperation>(
exec, dims.n, dims.r, dims.stride_y);
}},
{"mm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<ApplyOperation>(
Expand Down Expand Up @@ -466,6 +520,44 @@ void apply_blas(const char* operation_name, std::shared_ptr<gko::Executor> exec,
add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
allocator);

// tuning run
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
if (op->is_tunable()) {
if (!blas_case[operation_name].HasMember("tuning")) {
blas_case[operation_name].AddMember(
"tuning", rapidjson::Value(rapidjson::kObjectType),
allocator);
}
auto& tuning_case = blas_case[operation_name]["tuning"];
add_or_set_member(tuning_case, "time",
rapidjson::Value(rapidjson::kArrayType),
allocator);
add_or_set_member(tuning_case, "values",
rapidjson::Value(rapidjson::kArrayType),
allocator);

// Enable tuning for this portion of code
gko::_tuning_flag = true;
// Select some values we want to tune.
std::vector<gko::size_type> tuning_values(32);
std::iota(tuning_values.begin(), tuning_values.end(), 0);
for (auto val : tuning_values) {
gko::_tuned_value = val;
IterationControl ic_tuning{timer};
op->prepare();
for (auto _ : ic_tuning.run()) {
op->run();
}
tuning_case["time"].PushBack(ic_tuning.compute_average_time(),
allocator);
tuning_case["values"].PushBack(val, allocator);
}
// We put back the flag to false to use the default (non-tuned)
// values for the following
gko::_tuning_flag = false;
}
#endif

// compute and write benchmark data
add_or_set_member(blas_case[operation_name], "completed", true,
allocator);
Expand Down Expand Up @@ -504,7 +596,8 @@ int main(int argc, char* argv[])
" ]\n\n";
initialize_argument_parsing(&argc, &argv, header, format);

std::string extra_information = "The operations are " + FLAGS_operations;
std::string extra_information =
"The operations are " + FLAGS_operations + "\n";
print_general_information(extra_information);

auto exec = executor_factory.at(FLAGS_executor)();
Expand Down
5 changes: 2 additions & 3 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,12 +256,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
Expand Down
15 changes: 15 additions & 0 deletions core/matrix/dense_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,17 @@ namespace kernels {
matrix::Dense<remove_complex<_type>>* result, \
Array<char>& tmp)

// Declares the vendor-library (e.g. cuBLAS) column-wise dot product kernel:
// result(0, j) = x(:, j)' * y(:, j) for every column j.
#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(_type) \
void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
const matrix::Dense<_type>* y, \
matrix::Dense<_type>* result)

// Declares the vendor-library (e.g. cuBLAS) column-wise Euclidean norm
// kernel; the result value type is real even for complex input.
#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(_type) \
void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
matrix::Dense<remove_complex<_type>>* result)

#define GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(_type, _prec) \
void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec, \
const device_matrix_data<_type, _prec>& data, \
Expand Down Expand Up @@ -339,6 +350,10 @@ namespace kernels {
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
Expand Down
5 changes: 5 additions & 0 deletions cuda/base/kernel_launch_reduction.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda/components/thread_ids.cuh"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING


namespace gko {
namespace kernels {
namespace cuda {
Expand Down
42 changes: 42 additions & 0 deletions cuda/matrix/dense_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,28 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);


/**
 * Computes the column-wise dot products result(0, j) = x(:, j)' * y(:, j)
 * using cuBLAS, issuing one cublas::dot call per column with the matrix
 * stride as the element increment.
 *
 * Raises a not-implemented error for value types cuBLAS does not support.
 */
template <typename ValueType>
void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec,
                           const matrix::Dense<ValueType>* x,
                           const matrix::Dense<ValueType>* y,
                           matrix::Dense<ValueType>* result)
{
    if (!cublas::is_supported<ValueType>::value) {
        GKO_NOT_IMPLEMENTED;
    }
    auto handle = exec->get_cublas_handle();
    const auto num_rows = x->get_size()[0];
    const auto num_cols = x->get_size()[1];
    for (size_type j = 0; j < num_cols; ++j) {
        // column j starts at offset j; consecutive entries are stride apart
        cublas::dot(handle, num_rows, x->get_const_values() + j,
                    x->get_stride(), y->get_const_values() + j,
                    y->get_stride(), result->get_values() + j);
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);


template <typename ValueType>
void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* x,
Expand All @@ -145,6 +167,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);


/**
 * Computes the Euclidean norm of each column of x using cuBLAS, issuing one
 * cublas::norm2 call per column with the matrix stride as the element
 * increment; result(0, j) receives the (real-valued) norm of column j.
 *
 * Raises a not-implemented error for value types cuBLAS does not support.
 */
template <typename ValueType>
void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec,
                             const matrix::Dense<ValueType>* x,
                             matrix::Dense<remove_complex<ValueType>>* result)
{
    if (!cublas::is_supported<ValueType>::value) {
        GKO_NOT_IMPLEMENTED;
    }
    auto handle = exec->get_cublas_handle();
    const auto num_rows = x->get_size()[0];
    const auto num_cols = x->get_size()[1];
    for (size_type j = 0; j < num_cols; ++j) {
        // column j starts at offset j; consecutive entries are stride apart
        cublas::norm2(handle, num_rows, x->get_const_values() + j,
                      x->get_stride(), result->get_values() + j);
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);


template <typename ValueType>
void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* a,
Expand Down
21 changes: 21 additions & 0 deletions dpcpp/base/kernel_launch_reduction.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dpcpp/components/uninitialized_array.hpp"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING


namespace gko {
namespace kernels {
namespace dpcpp {
Expand Down Expand Up @@ -170,7 +175,11 @@ void run_kernel_reduction_impl(std::shared_ptr<const DpcppExecutor> exec,
ValueType* result, size_type size,
Array<char>& tmp, MappedKernelArgs... args)
{
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 4;
#else
constexpr int oversubscription = 4;
#endif
constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
const auto num_workgroups =
Expand Down Expand Up @@ -214,7 +223,11 @@ void run_kernel_reduction_impl(std::shared_ptr<const DpcppExecutor> exec,
ValueType* result, dim<2> size, Array<char>& tmp,
MappedKernelArgs... args)
{
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 4;
#else
constexpr int oversubscription = 4;
#endif
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
const auto flat_size = rows * cols;
Expand Down Expand Up @@ -566,7 +579,11 @@ void run_kernel_row_reduction_stage1(std::shared_ptr<const DpcppExecutor> exec,
using subsubgroup_sizes =
syn::value_list<int, 1, 2, 4, 8, std::min<int>(16, sg_size),
std::min<int>(32, sg_size), sg_size>;
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
const auto resources =
Expand Down Expand Up @@ -619,7 +636,11 @@ void run_kernel_col_reduction_stage1(std::shared_ptr<const DpcppExecutor> exec,
using subsubgroup_sizes =
syn::value_list<int, 1, 2, 4, 8, std::min<int>(16, sg_size),
std::min<int>(32, sg_size), sg_size>;
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
const auto max_blocks =
Expand Down
20 changes: 20 additions & 0 deletions dpcpp/matrix/dense_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,26 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);


// Vendor-library column-wise dot product is not available on the DPC++
// backend; the body is the GKO_NOT_IMPLEMENTED placeholder, so calling this
// kernel reports a not-implemented error.
template <typename ValueType>
void compute_sparselib_dot(
    std::shared_ptr<const DefaultExecutor> exec,
    const matrix::Dense<ValueType>* x, const matrix::Dense<ValueType>* y,
    matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);


// Vendor-library column-wise norm2 is not available on the DPC++ backend;
// the body is the GKO_NOT_IMPLEMENTED placeholder, so calling this kernel
// reports a not-implemented error.
template <typename ValueType>
void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec,
                             const matrix::Dense<ValueType>* x,
                             matrix::Dense<remove_complex<ValueType>>* result)
    GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
    GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);


template <typename ValueType>
void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* a,
Expand Down
5 changes: 5 additions & 0 deletions hip/base/kernel_launch_reduction.hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "hip/components/thread_ids.hip.hpp"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING


namespace gko {
namespace kernels {
namespace hip {
Expand Down
Loading

0 comments on commit 22c3184

Please sign in to comment.