diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp index cfa56ef81fe..d906e9f9e12 100644 --- a/benchmark/sparse_blas/sparse_blas.cpp +++ b/benchmark/sparse_blas/sparse_blas.cpp @@ -127,9 +127,12 @@ void apply_sparse_blas(const char* operation_name, allocator); auto gen_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, - test_case[operation_name]["components"], allocator, 1); + test_case[operation_name]["components"], allocator, + repetitions); exec->add_logger(gen_logger); - op->run(); + for (unsigned i = 0; i < repetitions; i++) { + op->run(); + } exec->remove_logger(gen_logger); } op->write_stats(test_case[operation_name], allocator); diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 5c6d849fe36..335ed687002 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -58,6 +59,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + #include "benchmark/utils/json.hpp" #include "benchmark/utils/timer.hpp" #include "benchmark/utils/types.hpp" @@ -69,6 +73,10 @@ DEFINE_string(executor, "reference", "The executor used to run the benchmarks, one of: reference, " "omp, cuda, hip"); +DEFINE_string(allocator, "default", + "The allocator used in the executor. Only relevant for CUDA and " + "HIP executors, one of: default, async, host, unified"); + DEFINE_uint32(device_id, 0, "ID of the device where to run the code"); DEFINE_bool(overwrite, false, @@ -329,6 +337,40 @@ void backup_results(rapidjson::Document& results) } +inline std::shared_ptr create_cuda_allocator() +{ + std::string flag{FLAGS_allocator}; + if (flag == "default") { + return std::make_shared(); + } else if (flag == "async") { + return std::make_shared(nullptr); + } else if (flag == "unified") { + return std::make_shared(FLAGS_device_id); + } else if (flag == "host") { + return std::make_shared(FLAGS_device_id); + } else { + throw std::runtime_error{"Unknown allocator type " + flag}; + } +} + + +inline std::shared_ptr create_hip_allocator() +{ + std::string flag{FLAGS_allocator}; + if (flag == "default") { + return std::make_shared(); + } else if (flag == "async") { + return std::make_shared(nullptr); + } else if (flag == "unified") { + return std::make_shared(FLAGS_device_id); + } else if (flag == "host") { + return std::make_shared(FLAGS_device_id); + } else { + throw std::runtime_error{"Unknown allocator type " + flag}; + } +} + + // executor mapping const std::map(bool)>> executor_factory{ @@ -337,12 +379,14 @@ const std::map(bool)>> {"cuda", [](bool) { return gko::CudaExecutor::create(FLAGS_device_id, - gko::OmpExecutor::create()); + gko::OmpExecutor::create(), + create_cuda_allocator()); }}, {"hip", [](bool) { return gko::HipExecutor::create(FLAGS_device_id, - gko::OmpExecutor::create()); + gko::OmpExecutor::create(), + create_hip_allocator()); }}, {"dpcpp", [](bool use_gpu_timer) { auto property = dpcpp_queue_property::in_order; @@ -369,14 +413,16 @@ const std::map +#include #include