add tuning to reduction kernels
upsj committed Apr 11, 2022
1 parent 029e0f8 commit 314ab2d
Showing 13 changed files with 342 additions and 34 deletions.
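At a high level, the commit exposes the oversubscription factor of the reduction kernels (previously hard-coded to 16) through two globals declared in benchmark/utils/tuning_variables.hpp, gko::_tuning_flag and gko::_tuned_value, so that the BLAS benchmark can sweep candidate values at runtime. A minimal sketch of that pattern, using simplified stand-in names rather than the actual Ginkgo headers:

```cpp
#include <cstddef>
#include <vector>

// Stand-ins for gko::_tuning_flag and gko::_tuned_value (sketch only).
bool tuning_flag = false;
std::size_t tuned_value = 0;

// Kernel side: read the override only while the benchmark is sweeping,
// otherwise keep the built-in default, mirroring the #ifdef blocks below.
int pick_oversubscription()
{
    constexpr int default_oversubscription = 16;
    return tuning_flag ? static_cast<int>(tuned_value)
                       : default_oversubscription;
}

// Benchmark side: enable the flag, try each candidate, then restore defaults.
void tuning_sweep()
{
    tuning_flag = true;
    for (std::size_t val : std::vector<std::size_t>{1, 2, 4, 8, 16, 32}) {
        tuned_value = val;
        // ... time the tunable operation with pick_oversubscription() == val ...
    }
    tuning_flag = false;
}
```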
105 changes: 99 additions & 6 deletions benchmark/blas/blas.cpp
@@ -46,6 +46,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "benchmark/utils/loggers.hpp"
#include "benchmark/utils/timer.hpp"
#include "benchmark/utils/types.hpp"
#include "core/matrix/dense_kernels.hpp"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING


GKO_REGISTER_OPERATION(compute_sparselib_dot, dense::compute_sparselib_dot);
GKO_REGISTER_OPERATION(compute_sparselib_norm, dense::compute_sparselib_norm2);


// Command-line arguments
@@ -57,8 +67,10 @@ DEFINE_string(
" multiaxpy (like axpy, but a has one entry per column),\n"
" scal (y = a * y),\n"
" multiscal (like scal, but a has one entry per column),\n"
" dot (a = x' * y),"
" dot (a = x' * y),\n"
" sparselib_dot (like dot, but using vendor libraries),\n"
" norm (a = sqrt(x' * x)),\n"
" sparselib_norm (like norm, but using vendor libraries),\n"
" mm (C = A * B),\n"
" gemm (C = a * A * B + b * C)\n"
"where A has dimensions n x k, B has dimensions k x m,\n"
@@ -73,6 +85,7 @@ class BenchmarkOperation {
virtual gko::size_type get_memory() const = 0;
virtual void prepare(){};
virtual void run() = 0;
virtual bool is_tunable() { return false; }
};


@@ -205,20 +218,36 @@ class DotOperation : public BenchmarkOperation {

void run() override { x_->compute_dot(lend(y_), lend(alpha_)); }

private:
bool is_tunable() override { return true; }

protected:
std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
std::unique_ptr<gko::matrix::Dense<etype>> x_;
std::unique_ptr<gko::matrix::Dense<etype>> y_;
};


class SparselibDotOperation : public DotOperation {
using DotOperation::DotOperation;

void run() override
{
auto exec = alpha_->get_executor();
exec->run(make_compute_sparselib_dot(x_.get(), y_.get(), alpha_.get()));
}

bool is_tunable() override { return false; }
};


class NormOperation : public BenchmarkOperation {
public:
NormOperation(std::shared_ptr<const gko::Executor> exec,
gko::size_type rows, gko::size_type cols,
gko::size_type stride)
{
alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, cols});
alpha_ = gko::matrix::Dense<gko::remove_complex<etype>>::create(
exec, gko::dim<2>{1, cols});
y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
stride);
y_->fill(1);
@@ -236,12 +265,27 @@ class NormOperation : public BenchmarkOperation {

void run() override { y_->compute_norm2(lend(alpha_)); }

private:
std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
bool is_tunable() override { return true; }

protected:
std::unique_ptr<gko::matrix::Dense<gko::remove_complex<etype>>> alpha_;
std::unique_ptr<gko::matrix::Dense<etype>> y_;
};


class SparselibNormOperation : public NormOperation {
using NormOperation::NormOperation;

void run() override
{
auto exec = alpha_->get_executor();
exec->run(make_compute_sparselib_norm(y_.get(), alpha_.get()));
}

bool is_tunable() override { return false; }
};


class ApplyOperation : public BenchmarkOperation {
public:
ApplyOperation(std::shared_ptr<const gko::Executor> exec, gko::size_type n,
@@ -407,11 +451,21 @@ std::map<std::string, std::function<std::unique_ptr<BenchmarkOperation>(
return std::make_unique<DotOperation>(
exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
}},
{"sparselib_dot",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<SparselibDotOperation>(
exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
}},
{"norm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<NormOperation>(exec, dims.n, dims.r,
dims.stride_y);
}},
{"sparselib_norm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<SparselibNormOperation>(
exec, dims.n, dims.r, dims.stride_y);
}},
{"mm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<ApplyOperation>(
@@ -466,6 +520,44 @@ void apply_blas(const char* operation_name, std::shared_ptr<gko::Executor> exec,
add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
allocator);

// tuning run
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
if (op->is_tunable()) {
if (!blas_case[operation_name].HasMember("tuning")) {
blas_case[operation_name].AddMember(
"tuning", rapidjson::Value(rapidjson::kObjectType),
allocator);
}
auto& tuning_case = blas_case[operation_name]["tuning"];
add_or_set_member(tuning_case, "time",
rapidjson::Value(rapidjson::kArrayType),
allocator);
add_or_set_member(tuning_case, "values",
rapidjson::Value(rapidjson::kArrayType),
allocator);

// Enable tuning for this portion of code
gko::_tuning_flag = true;
// Select some values we want to tune.
std::vector<gko::size_type> tuning_values(32);
std::iota(tuning_values.begin(), tuning_values.end(), 0);
for (auto val : tuning_values) {
gko::_tuned_value = val;
IterationControl ic_tuning{timer};
op->prepare();
for (auto _ : ic_tuning.run()) {
op->run();
}
tuning_case["time"].PushBack(ic_tuning.compute_average_time(),
allocator);
tuning_case["values"].PushBack(val, allocator);
}
// Reset the flag so that the subsequent (non-tuning) runs use the
// default values
gko::_tuning_flag = false;
}
#endif

// compute and write benchmark data
add_or_set_member(blas_case[operation_name], "completed", true,
allocator);
Expand Down Expand Up @@ -504,7 +596,8 @@ int main(int argc, char* argv[])
" ]\n\n";
initialize_argument_parsing(&argc, &argv, header, format);

std::string extra_information = "The operations are " + FLAGS_operations;
std::string extra_information =
"The operations are " + FLAGS_operations + "\n";
print_general_information(extra_information);

auto exec = executor_factory.at(FLAGS_executor)();
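The tuning run above appends one (value, average time) pair per candidate to the "values" and "time" arrays of the benchmark's JSON output; choosing the fastest configuration afterwards is a simple argmin over the two parallel arrays. A small offline post-processing sketch, with plain vectors and made-up numbers standing in for the JSON arrays:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <vector>

int main()
{
    // Made-up sample data standing in for the recorded "values"/"time" arrays.
    std::vector<std::size_t> values{1, 2, 4, 8, 16, 32};
    std::vector<double> time{1.9e-4, 1.2e-4, 8.5e-5, 7.9e-5, 8.1e-5, 9.0e-5};

    // Index of the fastest run; values[best] is the value worth keeping.
    const auto best = std::distance(
        time.begin(), std::min_element(time.begin(), time.end()));
    std::cout << "best value: " << values[best] << " (avg time " << time[best]
              << " s)\n";
}
```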
48 changes: 32 additions & 16 deletions common/cuda_hip/base/kernel_launch_reduction.hpp.inc
@@ -124,7 +124,11 @@ void run_kernel_reduction_cached(std::shared_ptr<const DefaultExecutor> exec,
ValueType* result, size_type size,
Array<char>& tmp, KernelArgs&&... args)
{
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
constexpr auto block_size = default_block_size;
const auto num_blocks = std::min<int64>(
ceildiv(size, block_size), exec->get_num_warps() * oversubscription);
@@ -161,7 +165,11 @@ void run_kernel_reduction_cached(std::shared_ptr<const DefaultExecutor> exec,
ValueType* result, dim<2> size,
Array<char>& tmp, KernelArgs&&... args)
{
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
constexpr auto block_size = default_block_size;
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
@@ -195,10 +203,10 @@ void run_kernel_reduction_cached(std::shared_ptr<const DefaultExecutor> exec,
template <int subwarp_size, typename ValueType, typename KernelFunction,
typename ReductionOp, typename FinalizeOp, typename... KernelArgs>
__global__
__launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d(
int64 rows, int64 cols, int64 col_blocks, KernelFunction fn,
ReductionOp op, FinalizeOp finalize, ValueType identity,
ValueType* result, int64 result_stride, KernelArgs... args)
__launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d(
int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
int64 result_stride, KernelArgs... args)
{
const auto idx = thread::get_subwarp_id_flat<subwarp_size, int64>();
const auto row = idx % rows;
@@ -227,10 +235,10 @@ __global__
template <int subwarp_size, typename ValueType, typename KernelFunction,
typename ReductionOp, typename FinalizeOp, typename... KernelArgs>
__global__
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small(
int64 rows, int64 cols, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
KernelArgs... args)
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small(
int64 rows, int64 cols, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
KernelArgs... args)
{
constexpr auto warp_size = config::warp_size;
constexpr auto warps_per_block = default_block_size / warp_size;
@@ -290,10 +298,10 @@ __global__
template <typename ValueType, typename KernelFunction, typename ReductionOp,
typename FinalizeOp, typename... KernelArgs>
__global__
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked(
int64 rows, int64 cols, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
KernelArgs... args)
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked(
int64 rows, int64 cols, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
KernelArgs... args)
{
constexpr auto warp_size = config::warp_size;
__shared__ UninitializedArray<ValueType, default_block_size> block_partial;
@@ -329,10 +337,10 @@ __global__

template <typename ValueType, typename ReductionOp, typename FinalizeOp>
__global__
__launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d(
int64 num_results, int64 num_blocks, ReductionOp op,
FinalizeOp finalize, ValueType identity, const ValueType* input,
int64 result_stride, ValueType* result)
__launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d(
int64 num_results, int64 num_blocks, ReductionOp op, FinalizeOp finalize,
ValueType identity, const ValueType* input, int64 result_stride,
ValueType* result)
{
const auto idx = thread::get_thread_id_flat<int64>();
if (idx >= num_results) {
@@ -430,7 +438,11 @@ void run_kernel_row_reduction_cached(
{
using subwarp_sizes =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
const auto resources =
@@ -480,7 +492,11 @@ void run_kernel_col_reduction_cached(
{
using subwarp_sizes =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
const auto max_blocks = exec->get_num_warps() * config::warp_size *
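For context on what is being tuned: in run_kernel_reduction_cached the grid size is capped at num_warps * oversubscription blocks, so the swept value trades the amount of sequential work per block against the number of blocks in flight. A host-side sketch of that sizing rule (simplified; the block size below is a placeholder, not necessarily Ginkgo's default_block_size):

```cpp
#include <algorithm>
#include <cstdint>

// Mirrors the launch-size computation in run_kernel_reduction_cached (sketch):
// the tuned oversubscription factor bounds how many blocks get launched.
std::int64_t reduction_num_blocks(std::int64_t size, std::int64_t num_warps,
                                  int oversubscription)
{
    constexpr std::int64_t block_size = 512;  // placeholder block size
    const auto full_blocks = (size + block_size - 1) / block_size;  // ceildiv
    return std::min<std::int64_t>(full_blocks, num_warps * oversubscription);
}
```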
5 changes: 2 additions & 3 deletions core/device_hooks/common_kernels.inc.cpp
@@ -256,12 +256,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
15 changes: 15 additions & 0 deletions core/matrix/dense_kernels.hpp
@@ -146,6 +146,17 @@ namespace kernels {
matrix::Dense<remove_complex<_type>>* result, \
Array<char>& tmp)

#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(_type) \
void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
const matrix::Dense<_type>* y, \
matrix::Dense<_type>* result)

#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(_type) \
void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
matrix::Dense<remove_complex<_type>>* result)

#define GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(_type, _prec) \
void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec, \
const device_matrix_data<_type, _prec>& data, \
@@ -339,6 +350,10 @@ namespace kernels {
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
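The device implementations behind these declarations are among the files not shown in this excerpt; presumably the CUDA/HIP backends forward them to the vendor BLAS. Purely as an illustration of that idea (an assumption about the mapping, not Ginkgo's actual kernels), a single-column dot and norm via cuBLAS could look like:

```cpp
#include <cublas_v2.h>

// Hypothetical helper: dot and 2-norm of single-column device vectors of
// length n, written to host scalars (default cuBLAS pointer mode).
void sparselib_dot_and_norm(cublasHandle_t handle, const double* d_x,
                            const double* d_y, int n, double* dot,
                            double* norm)
{
    cublasDdot(handle, n, d_x, 1, d_y, 1, dot);  // a = x' * y
    cublasDnrm2(handle, n, d_x, 1, norm);        // a = sqrt(x' * x)
}
```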
5 changes: 5 additions & 0 deletions cuda/base/kernel_launch_reduction.cuh
@@ -43,6 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda/components/thread_ids.cuh"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING


namespace gko {
namespace kernels {
namespace cuda {