Merge -allocator flag for benchmarks
This adds an -allocator flag that controls which allocator the GPU executors use.
It also fixes minor bugs in the handling of asynchronous allocations.

Related PR: #1385
upsj authored Aug 11, 2023
2 parents 0dfcbb6 + 21b4cd6 commit ee648e2
Showing 5 changed files with 63 additions and 6 deletions.
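
For orientation, the following is a minimal sketch of what the new flag enables, condensed from the diff below. Only the allocator classes and the three-argument CudaExecutor::create call are taken from this commit; the main() scaffolding, the gflags parsing call, and the final Dense allocation are illustrative assumptions.

// Hedged sketch (not part of this commit): build a CUDA executor using the
// allocator selected by --allocator, mirroring create_cuda_allocator() in
// benchmark/utils/general.hpp below.
#include <memory>

#include <gflags/gflags.h>

#include <ginkgo/core/base/memory.hpp>
#include <ginkgo/ginkgo.hpp>

DEFINE_string(allocator, "default", "one of: default, async, host, unified");
DEFINE_uint32(device_id, 0, "ID of the device where to run the code");

int main(int argc, char* argv[])
{
    gflags::ParseCommandLineFlags(&argc, &argv, true);
    // "async" allocates asynchronously on the given stream (nullptr = default
    // stream); "default" falls back to the regular synchronous allocator.
    std::shared_ptr<gko::CudaAllocatorBase> alloc;
    if (FLAGS_allocator == "async") {
        alloc = std::make_shared<gko::CudaAsyncAllocator>(nullptr);
    } else {
        alloc = std::make_shared<gko::CudaAllocator>();
    }
    // The executor routes its device (de)allocations through `alloc`.
    auto exec = gko::CudaExecutor::create(FLAGS_device_id,
                                          gko::OmpExecutor::create(), alloc);
    // Any allocation on this executor now goes through the chosen allocator.
    auto vec = gko::matrix::Dense<double>::create(exec, gko::dim<2>{1000, 1});
}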
7 changes: 5 additions & 2 deletions benchmark/sparse_blas/sparse_blas.cpp
@@ -127,9 +127,12 @@ void apply_sparse_blas(const char* operation_name,
                               allocator);
             auto gen_logger = create_operations_logger(
                 FLAGS_gpu_timer, FLAGS_nested_names, exec,
-                test_case[operation_name]["components"], allocator, 1);
+                test_case[operation_name]["components"], allocator,
+                repetitions);
             exec->add_logger(gen_logger);
-            op->run();
+            for (unsigned i = 0; i < repetitions; i++) {
+                op->run();
+            }
             exec->remove_logger(gen_logger);
         }
         op->write_stats(test_case[operation_name], allocator);
54 changes: 50 additions & 4 deletions benchmark/utils/general.hpp
@@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ostream>
 #include <random>
 #include <sstream>
+#include <stdexcept>
 #include <string>
 #include <type_traits>
 #include <utility>
@@ -58,6 +59,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <rapidjson/prettywriter.h>
 
 
+#include <ginkgo/core/base/memory.hpp>
+
+
 #include "benchmark/utils/json.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
@@ -69,6 +73,10 @@ DEFINE_string(executor, "reference",
               "The executor used to run the benchmarks, one of: reference, "
               "omp, cuda, hip");
 
+DEFINE_string(allocator, "default",
+              "The allocator used in the executor. Only relevant for CUDA and "
+              "HIP executors, one of: default, async, host, unified");
+
 DEFINE_uint32(device_id, 0, "ID of the device where to run the code");
 
 DEFINE_bool(overwrite, false,
@@ -329,6 +337,40 @@ void backup_results(rapidjson::Document& results)
 }
 
 
+inline std::shared_ptr<gko::CudaAllocatorBase> create_cuda_allocator()
+{
+    std::string flag{FLAGS_allocator};
+    if (flag == "default") {
+        return std::make_shared<gko::CudaAllocator>();
+    } else if (flag == "async") {
+        return std::make_shared<gko::CudaAsyncAllocator>(nullptr);
+    } else if (flag == "unified") {
+        return std::make_shared<gko::CudaUnifiedAllocator>(FLAGS_device_id);
+    } else if (flag == "host") {
+        return std::make_shared<gko::CudaHostAllocator>(FLAGS_device_id);
+    } else {
+        throw std::runtime_error{"Unknown allocator type " + flag};
+    }
+}
+
+
+inline std::shared_ptr<gko::HipAllocatorBase> create_hip_allocator()
+{
+    std::string flag{FLAGS_allocator};
+    if (flag == "default") {
+        return std::make_shared<gko::HipAllocator>();
+    } else if (flag == "async") {
+        return std::make_shared<gko::HipAsyncAllocator>(nullptr);
+    } else if (flag == "unified") {
+        return std::make_shared<gko::HipUnifiedAllocator>(FLAGS_device_id);
+    } else if (flag == "host") {
+        return std::make_shared<gko::HipHostAllocator>(FLAGS_device_id);
+    } else {
+        throw std::runtime_error{"Unknown allocator type " + flag};
+    }
+}
+
+
 // executor mapping
 const std::map<std::string, std::function<std::shared_ptr<gko::Executor>(bool)>>
     executor_factory{
@@ -337,12 +379,14 @@ const std::map<std::string, std::function<std::shared_ptr<gko::Executor>(bool)>>
         {"cuda",
          [](bool) {
              return gko::CudaExecutor::create(FLAGS_device_id,
-                                              gko::OmpExecutor::create());
+                                              gko::OmpExecutor::create(),
+                                              create_cuda_allocator());
          }},
         {"hip",
          [](bool) {
              return gko::HipExecutor::create(FLAGS_device_id,
-                                             gko::OmpExecutor::create());
+                                             gko::OmpExecutor::create(),
+                                             create_hip_allocator());
          }},
         {"dpcpp", [](bool use_gpu_timer) {
             auto property = dpcpp_queue_property::in_order;
@@ -369,14 +413,16 @@ const std::map<std::string,
              FLAGS_device_id = gko::experimental::mpi::map_rank_to_device_id(
                  comm, gko::CudaExecutor::get_num_devices());
              return gko::CudaExecutor::create(FLAGS_device_id,
-                                              gko::ReferenceExecutor::create());
+                                              gko::ReferenceExecutor::create(),
+                                              create_cuda_allocator());
          }},
         {"hip",
          [](MPI_Comm comm) {
              FLAGS_device_id = gko::experimental::mpi::map_rank_to_device_id(
                  comm, gko::HipExecutor::get_num_devices());
              return gko::HipExecutor::create(FLAGS_device_id,
-                                             gko::ReferenceExecutor::create());
+                                             gko::ReferenceExecutor::create(),
+                                             create_hip_allocator());
          }},
         {"dpcpp", [](MPI_Comm comm) {
             if (gko::DpcppExecutor::get_num_devices("gpu")) {
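
A brief usage sketch for the factory map above; the helper name and the find()-based lookup are assumptions about how a benchmark driver consumes the map, not code from this diff.

// Hedged sketch: resolving the --executor flag through executor_factory.
// For "cuda"/"hip" the lambdas above call create_cuda_allocator() /
// create_hip_allocator(), so --allocator takes effect at this point.
inline std::shared_ptr<gko::Executor> get_executor_from_flags(bool use_gpu_timer)
{
    auto it = executor_factory.find(FLAGS_executor);
    if (it == executor_factory.end()) {
        throw std::runtime_error{"Unknown executor " + FLAGS_executor};
    }
    return it->second(use_gpu_timer);
}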
4 changes: 4 additions & 0 deletions core/device_hooks/cuda_hooks.cpp
@@ -75,6 +75,10 @@ bool CudaAsyncAllocator::check_environment(int device_id,
     GKO_NOT_COMPILED(cuda);
 
 
+CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id)
+    GKO_NOT_COMPILED(cuda);
+
+
 CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags)
     GKO_NOT_COMPILED(cuda);
 
3 changes: 3 additions & 0 deletions core/device_hooks/hip_hooks.cpp
@@ -76,6 +76,9 @@ bool HipAsyncAllocator::check_environment(int device_id,
     GKO_NOT_COMPILED(hip);
 
 
+HipUnifiedAllocator::HipUnifiedAllocator(int device_id) GKO_NOT_COMPILED(hip);
+
+
 HipUnifiedAllocator::HipUnifiedAllocator(int device_id, unsigned int flags)
     GKO_NOT_COMPILED(hip);
 
1 change: 1 addition & 0 deletions cuda/base/memory.cpp
@@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/memory.hpp>
 
 
+#include <cuda.h>
 #include <cuda_runtime.h>
 
 
