Add tuning to reduction kernels and improve tuning #991

Closed · wants to merge 1 commit
105 changes: 99 additions & 6 deletions benchmark/blas/blas.cpp
@@ -46,6 +46,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "benchmark/utils/loggers.hpp"
#include "benchmark/utils/timer.hpp"
#include "benchmark/utils/types.hpp"
#include "core/matrix/dense_kernels.hpp"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING
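
Note: benchmark/utils/tuning_variables.hpp itself is not part of this diff; judging from the symbols used throughout (gko::_tuning_flag, gko::_tuned_value), it presumably declares something like the following hedged sketch:

namespace gko {

// Global tuning knobs: set by the benchmark driver, read by tunable kernels.
extern bool _tuning_flag;
extern size_type _tuned_value;

}  // namespace gko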


GKO_REGISTER_OPERATION(compute_sparselib_dot, dense::compute_sparselib_dot);
GKO_REGISTER_OPERATION(compute_sparselib_norm, dense::compute_sparselib_norm2);
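
Note: GKO_REGISTER_OPERATION wraps a kernel entry point in an Operation functor and generates a make_<name> helper, so a backend-agnostic call site can dispatch the kernel through an executor. A minimal call-side sketch using the names registered above:

// The executor selects the matching backend implementation at run time.
exec->run(make_compute_sparselib_dot(x.get(), y.get(), alpha.get()));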


// Command-line arguments
@@ -57,8 +67,10 @@ DEFINE_string(
" multiaxpy (like axpy, but a has one entry per column),\n"
" scal (y = a * y),\n"
" multiscal (like scal, but a has one entry per column),\n"
" dot (a = x' * y),"
" dot (a = x' * y),\n"
" sparselib_dot (like dot, but using vendor libraries),\n"
" norm (a = sqrt(x' * x)),\n"
" sparselib_norm (like norm, but using vendor libraries),\n"
" mm (C = A * B),\n"
" gemm (C = a * A * B + b * C)\n"
"where A has dimensions n x k, B has dimensions k x m,\n"
@@ -73,6 +85,7 @@ class BenchmarkOperation {
virtual gko::size_type get_memory() const = 0;
virtual void prepare(){};
virtual void run() = 0;
virtual bool is_tunable() { return false; }
};


@@ -205,20 +218,36 @@ class DotOperation : public BenchmarkOperation {

void run() override { x_->compute_dot(lend(y_), lend(alpha_)); }

private:
bool is_tunable() override { return true; }

protected:
std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
std::unique_ptr<gko::matrix::Dense<etype>> x_;
std::unique_ptr<gko::matrix::Dense<etype>> y_;
};


class SparselibDotOperation : public DotOperation {
using DotOperation::DotOperation;

void run() override
{
auto exec = alpha_->get_executor();
exec->run(make_compute_sparselib_dot(x_.get(), y_.get(), alpha_.get()));
}

bool is_tunable() override { return false; }
};
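
Note: the backend kernels behind make_compute_sparselib_dot are outside this excerpt. A hedged sketch of what a CUDA implementation could look like for double precision, assuming a cuBLAS handle is available and that Dense uses row-major storage (so one column's entries are spaced by the row stride); handle acquisition and error handling are elided, and this is illustrative, not the actual Ginkgo kernel:

#include <cublas_v2.h>

// Hedged sketch: column-wise dot products x(:,j)' * y(:,j) into result[j].
void sparselib_dot_sketch(cublasHandle_t handle, int rows, int cols,
                          const double* x, int x_stride, const double* y,
                          int y_stride, double* result)
{
    for (int col = 0; col < cols; ++col) {
        // Column col starts at offset col; consecutive entries of the
        // column are one row stride apart in row-major storage.
        cublasDdot(handle, rows, x + col, x_stride, y + col, y_stride,
                   result + col);
    }
}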


class NormOperation : public BenchmarkOperation {
public:
NormOperation(std::shared_ptr<const gko::Executor> exec,
gko::size_type rows, gko::size_type cols,
gko::size_type stride)
{
alpha_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{1, cols});
alpha_ = gko::matrix::Dense<gko::remove_complex<etype>>::create(
exec, gko::dim<2>{1, cols});
y_ = gko::matrix::Dense<etype>::create(exec, gko::dim<2>{rows, cols},
stride);
y_->fill(1);
@@ -236,12 +265,27 @@ class NormOperation : public BenchmarkOperation {

void run() override { y_->compute_norm2(lend(alpha_)); }

private:
std::unique_ptr<gko::matrix::Dense<etype>> alpha_;
bool is_tunable() override { return true; }

protected:
std::unique_ptr<gko::matrix::Dense<gko::remove_complex<etype>>> alpha_;
std::unique_ptr<gko::matrix::Dense<etype>> y_;
};
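
Note: the change of alpha_ to Dense<gko::remove_complex<etype>> reflects that a 2-norm is real-valued even for complex element types, so compute_norm2 writes into a vector of the underlying real type. A hedged illustration:

#include <complex>
#include <type_traits>

// gko::remove_complex maps a complex element type to its real counterpart.
static_assert(
    std::is_same<gko::remove_complex<std::complex<double>>, double>::value,
    "norm2 results are stored using the real type");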


class SparselibNormOperation : public NormOperation {
using NormOperation::NormOperation;

void run() override
{
auto exec = alpha_->get_executor();
exec->run(make_compute_sparselib_norm(y_.get(), alpha_.get()));
}

bool is_tunable() override { return false; }
};


class ApplyOperation : public BenchmarkOperation {
public:
ApplyOperation(std::shared_ptr<const gko::Executor> exec, gko::size_type n,
@@ -407,11 +451,21 @@ std::map<std::string, std::function<std::unique_ptr<BenchmarkOperation>(
return std::make_unique<DotOperation>(
exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
}},
{"sparselib_dot",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<SparselibDotOperation>(
exec, dims.n, dims.r, dims.stride_x, dims.stride_y);
}},
{"norm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<NormOperation>(exec, dims.n, dims.r,
dims.stride_y);
}},
{"sparselib_norm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<SparselibNormOperation>(
exec, dims.n, dims.r, dims.stride_y);
}},
{"mm",
[](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
return std::make_unique<ApplyOperation>(
@@ -466,6 +520,44 @@ void apply_blas(const char* operation_name, std::shared_ptr<gko::Executor> exec,
add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
allocator);

// tuning run
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
if (op->is_tunable()) {
if (!blas_case[operation_name].HasMember("tuning")) {
blas_case[operation_name].AddMember(
"tuning", rapidjson::Value(rapidjson::kObjectType),
allocator);
}
auto& tuning_case = blas_case[operation_name]["tuning"];
add_or_set_member(tuning_case, "time",
rapidjson::Value(rapidjson::kArrayType),
allocator);
add_or_set_member(tuning_case, "values",
rapidjson::Value(rapidjson::kArrayType),
allocator);

// Enable tuning for this portion of code
gko::_tuning_flag = true;
// Select the values to sweep for the tuned parameter (here 0 through 31).
std::vector<gko::size_type> tuning_values(32);
std::iota(tuning_values.begin(), tuning_values.end(), 0);
for (auto val : tuning_values) {
gko::_tuned_value = val;
IterationControl ic_tuning{timer};
op->prepare();
for (auto _ : ic_tuning.run()) {
op->run();
}
tuning_case["time"].PushBack(ic_tuning.compute_average_time(),
allocator);
tuning_case["values"].PushBack(val, allocator);
}
// Reset the flag so the remaining (non-tuning) benchmark runs use
// the default values
gko::_tuning_flag = false;
}
#endif
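
Note: with tuning enabled, each tunable case in the JSON output gains a "tuning" member holding one average time per tried value. An illustrative shape only; the timings below are made up:

"dot": {
    "tuning": {
        "time": [2.1e-05, 1.8e-05, ...],
        "values": [0, 1, ...]
    },
    "repetitions": ...,
    "completed": true
}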

// compute and write benchmark data
add_or_set_member(blas_case[operation_name], "completed", true,
allocator);
Expand Down Expand Up @@ -504,7 +596,8 @@ int main(int argc, char* argv[])
" ]\n\n";
initialize_argument_parsing(&argc, &argv, header, format);

std::string extra_information = "The operations are " + FLAGS_operations;
std::string extra_information =
"The operations are " + FLAGS_operations + "\n";
print_general_information(extra_information);

auto exec = executor_factory.at(FLAGS_executor)();
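
Note: a usage sketch, assuming the binary is built as blas, that the flag names are the DEFINE_* flags in this file, and that stdin carries the JSON array of dimension objects described by the header printed above:

./blas --executor=cuda --operations="dot,sparselib_dot,norm,sparselib_norm" \
    < dims.json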
48 changes: 32 additions & 16 deletions common/cuda_hip/base/kernel_launch_reduction.hpp.inc
@@ -124,7 +128,11 @@ void run_kernel_reduction_cached(std::shared_ptr<const DefaultExecutor> exec,
ValueType* result, size_type size,
Array<char>& tmp, KernelArgs&&... args)
{
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
constexpr auto block_size = default_block_size;
const auto num_blocks = std::min<int64>(
ceildiv(size, block_size), exec->get_num_warps() * oversubscription);
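
Note: this cap is exactly what the sweep in apply_blas exercises, since num_blocks = min(ceildiv(size, block_size), num_warps * oversubscription). A worked example with assumed numbers:

// Assumed hardware/problem numbers, for illustration only:
//   exec->get_num_warps() = 5120, block_size = 512, size = 1e7
//   ceildiv(1e7, 512)       = 19532
//   5120 * 16 (default)     = 81920  -> num_blocks = 19532 (uncapped)
//   5120 * 2  (tuned value) = 10240  -> num_blocks = 10240 (capped),
// i.e. smaller oversubscription makes each block reduce more elements
// sequentially before the cross-block finalize step.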
@@ -161,7 +165,11 @@ void run_kernel_reduction_cached(std::shared_ptr<const DefaultExecutor> exec,
ValueType* result, dim<2> size,
Array<char>& tmp, KernelArgs&&... args)
{
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
constexpr auto block_size = default_block_size;
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
@@ -195,10 +203,10 @@ void run_kernel_reduction_cached(std::shared_ptr<const DefaultExecutor> exec,
template <int subwarp_size, typename ValueType, typename KernelFunction,
typename ReductionOp, typename FinalizeOp, typename... KernelArgs>
__global__
__launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d(
int64 rows, int64 cols, int64 col_blocks, KernelFunction fn,
ReductionOp op, FinalizeOp finalize, ValueType identity,
ValueType* result, int64 result_stride, KernelArgs... args)
__launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d(
int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
int64 result_stride, KernelArgs... args)
{
const auto idx = thread::get_subwarp_id_flat<subwarp_size, int64>();
const auto row = idx % rows;
@@ -227,10 +235,10 @@ __global__
template <int subwarp_size, typename ValueType, typename KernelFunction,
typename ReductionOp, typename FinalizeOp, typename... KernelArgs>
__global__
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small(
int64 rows, int64 cols, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
KernelArgs... args)
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small(
int64 rows, int64 cols, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
KernelArgs... args)
{
constexpr auto warp_size = config::warp_size;
constexpr auto warps_per_block = default_block_size / warp_size;
@@ -290,10 +298,10 @@ __global__
template <typename ValueType, typename KernelFunction, typename ReductionOp,
typename FinalizeOp, typename... KernelArgs>
__global__
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked(
int64 rows, int64 cols, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
KernelArgs... args)
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked(
int64 rows, int64 cols, KernelFunction fn, ReductionOp op,
FinalizeOp finalize, ValueType identity, ValueType* result,
KernelArgs... args)
{
constexpr auto warp_size = config::warp_size;
__shared__ UninitializedArray<ValueType, default_block_size> block_partial;
@@ -329,10 +337,10 @@ __global__

template <typename ValueType, typename ReductionOp, typename FinalizeOp>
__global__
__launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d(
int64 num_results, int64 num_blocks, ReductionOp op,
FinalizeOp finalize, ValueType identity, const ValueType* input,
int64 result_stride, ValueType* result)
__launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d(
int64 num_results, int64 num_blocks, ReductionOp op, FinalizeOp finalize,
ValueType identity, const ValueType* input, int64 result_stride,
ValueType* result)
{
const auto idx = thread::get_thread_id_flat<int64>();
if (idx >= num_results) {
@@ -430,7 +438,11 @@ void run_kernel_row_reduction_cached(
{
using subwarp_sizes =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
const auto resources =
@@ -480,7 +492,11 @@ void run_kernel_col_reduction_cached(
{
using subwarp_sizes =
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
const int oversubscription = gko::_tuning_flag ? gko::_tuned_value : 16;
#else
constexpr int oversubscription = 16;
#endif
const auto rows = static_cast<int64>(size[0]);
const auto cols = static_cast<int64>(size[1]);
const auto max_blocks = exec->get_num_warps() * config::warp_size *
5 changes: 2 additions & 3 deletions core/device_hooks/common_kernels.inc.cpp
@@ -256,12 +256,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL);
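
Note: the GKO_STUB_* lines above give each device-hook module a placeholder definition that throws when that backend is not compiled in. Roughly, and as a hedged sketch rather than the exact macro output, the generated shape for the new dot stub is:

template <typename ValueType>
void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec,
                           const matrix::Dense<ValueType>* x,
                           const matrix::Dense<ValueType>* y,
                           matrix::Dense<ValueType>* result)
    GKO_NOT_COMPILED(GKO_HOOK_MODULE);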
15 changes: 15 additions & 0 deletions core/matrix/dense_kernels.hpp
@@ -146,6 +146,17 @@ namespace kernels {
matrix::Dense<remove_complex<_type>>* result, \
Array<char>& tmp)

#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(_type) \
void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
const matrix::Dense<_type>* y, \
matrix::Dense<_type>* result)

#define GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(_type) \
void compute_sparselib_norm2(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
matrix::Dense<remove_complex<_type>>* result)
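
Note: for a concrete element type the first macro expands to a plain declaration; GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(double), modulo whitespace, is:

void compute_sparselib_dot(std::shared_ptr<const DefaultExecutor> exec,
                           const matrix::Dense<double>* x,
                           const matrix::Dense<double>* y,
                           matrix::Dense<double>* result)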

#define GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(_type, _prec) \
void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec, \
const device_matrix_data<_type, _prec>& data, \
@@ -339,6 +350,10 @@ namespace kernels {
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_DOT_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_SPARSELIB_NORM2_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
5 changes: 5 additions & 0 deletions cuda/base/kernel_launch_reduction.cuh
@@ -43,6 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda/components/thread_ids.cuh"


#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
#include "benchmark/utils/tuning_variables.hpp"
#endif // GINKGO_BENCHMARK_ENABLE_TUNING


namespace gko {
namespace kernels {
namespace cuda {