Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dpcpp device timing #1110

Merged
merged 6 commits into from
Oct 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,19 @@ endfunction()
function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def type)
add_executable("${name}" ${ARGN})
target_link_libraries("${name}" ginkgo gflags rapidjson)
# always include the device timer
if (GINKGO_BUILD_CUDA)
target_compile_definitions("${name}" PRIVATE HAS_CUDA_TIMER=1)
target_link_libraries("${name}" cuda_timer)
endif()
if (GINKGO_BUILD_HIP)
target_compile_definitions("${name}" PRIVATE HAS_HIP_TIMER=1)
target_link_libraries("${name}" hip_timer)
endif()
if (GINKGO_BUILD_DPCPP)
target_compile_definitions("${name}" PRIVATE HAS_DPCPP_TIMER=1)
target_link_libraries("${name}" dpcpp_timer)
endif()
target_compile_definitions("${name}" PRIVATE "${macro_def}")
target_compile_options("${name}" PRIVATE ${GINKGO_COMPILER_FLAGS})
ginkgo_benchmark_add_tuning_maybe("${name}")
Expand Down Expand Up @@ -131,6 +138,8 @@ if (GINKGO_BUILD_DPCPP)
ginkgo_benchmark_onemkl_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION)
ginkgo_benchmark_onemkl_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION)
ginkgo_benchmark_onemkl_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
add_library(dpcpp_timer utils/dpcpp_timer.dp.cpp)
target_link_libraries(dpcpp_timer ginkgo)
endif()

add_subdirectory(blas)
Expand Down
3 changes: 1 addition & 2 deletions benchmark/blas/blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -506,8 +506,7 @@ int main(int argc, char* argv[])

std::string extra_information = "The operations are " + FLAGS_operations;
print_general_information(extra_information);

auto exec = executor_factory.at(FLAGS_executor)();
auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
auto engine = get_engine();
auto operations = split(FLAGS_operations, ',');

Expand Down
2 changes: 1 addition & 1 deletion benchmark/conversions/conversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ int main(int argc, char* argv[])
std::string() + "The formats are " + FLAGS_formats + "\n";
print_general_information(extra_information);

auto exec = executor_factory.at(FLAGS_executor)();
auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
auto formats = split(FLAGS_formats, ',');

rapidjson::IStreamWrapper jcin(std::cin);
Expand Down
2 changes: 1 addition & 1 deletion benchmark/preconditioner/preconditioner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ int main(int argc, char* argv[])
"Running with preconditioners: " + FLAGS_preconditioners + "\n";
print_general_information(extra_information);

auto exec = get_executor();
auto exec = get_executor(FLAGS_gpu_timer);
auto& engine = get_engine();

auto preconditioners = split(FLAGS_preconditioners, ',');
Expand Down
9 changes: 5 additions & 4 deletions benchmark/solver/solver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,9 +443,10 @@ void solve_system(const std::string& solver_name,
add_or_set_member(solver_json, "preconditioner",
rapidjson::Value(rapidjson::kObjectType),
allocator);
write_precond_info(lend(clone(get_executor()->get_master(),
prec->get_preconditioner())),
solver_json["preconditioner"], allocator);
write_precond_info(
lend(clone(get_executor(FLAGS_gpu_timer)->get_master(),
prec->get_preconditioner())),
solver_json["preconditioner"], allocator);
}

auto apply_logger =
Expand Down Expand Up @@ -571,7 +572,7 @@ int main(int argc, char* argv[])
std::to_string(FLAGS_nrhs) + "\n";
print_general_information(extra_information);

auto exec = get_executor();
auto exec = get_executor(FLAGS_gpu_timer);
auto solvers = split(FLAGS_solvers, ',');
auto preconds = split(FLAGS_preconditioners, ',');
std::vector<std::string> precond_solvers;
Expand Down
2 changes: 1 addition & 1 deletion benchmark/sparse_blas/sparse_blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ int main(int argc, char* argv[])
" { \"filename\": \"my_file2.mtx\"}\n" + " ]\n\n";
initialize_argument_parsing(&argc, &argv, header, format);

auto exec = executor_factory.at(FLAGS_executor)();
auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);

rapidjson::IStreamWrapper jcin(std::cin);
rapidjson::Document test_cases;
Expand Down
2 changes: 1 addition & 1 deletion benchmark/spmv/spmv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ int main(int argc, char* argv[])
std::to_string(FLAGS_nrhs) + "\n";
print_general_information(extra_information);

auto exec = executor_factory.at(FLAGS_executor)();
auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
auto engine = get_engine();
auto formats = split(FLAGS_formats, ',');

Expand Down
109 changes: 109 additions & 0 deletions benchmark/utils/dpcpp_timer.dp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2022, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

#include <iostream>


#include <CL/sycl.hpp>


#include "benchmark/utils/timer_impl.hpp"


/**
* DpcppTimer uses dpcpp executor and event to measure the timing.
*/
class DpcppTimer : public Timer {
public:
/**
* Create a DpcppTimer.
*
* @param exec Executor which should be a DpcppExecutor
*/
DpcppTimer(std::shared_ptr<const gko::Executor> exec)
: DpcppTimer(std::dynamic_pointer_cast<const gko::DpcppExecutor>(exec))
{}

/**
* Create a DpcppTimer.
*
* @param exec DpcppExecutor associated to the timer
*/
DpcppTimer(std::shared_ptr<const gko::DpcppExecutor> exec) : Timer()
{
assert(exec != nullptr);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
assert(exec != nullptr);

if (!exec->get_queue()
->template has_property<
sycl::property::queue::enable_profiling>()) {
GKO_NOT_SUPPORTED(exec);
}
exec_ = exec;
}

protected:
void tic_impl() override
{
exec_->synchronize();
// Currently, gko::DpcppExecutor always use default stream.
start_ = exec_->get_queue()->submit([&](sycl::handler& cgh) {
cgh.parallel_for(1, [=](sycl::id<1> id) {});
});
}

double toc_impl() override
{
auto stop = exec_->get_queue()->submit([&](sycl::handler& cgh) {
cgh.parallel_for(1, [=](sycl::id<1> id) {});
});
stop.wait_and_throw();
// get the start time of stop
auto stop_time = stop.get_profiling_info<
sycl::info::event_profiling::command_start>();
// get the end time of start
auto start_time =
start_
.get_profiling_info<sycl::info::event_profiling::command_end>();
return (stop_time - start_time) / double{1.0e9};
}

private:
std::shared_ptr<const gko::DpcppExecutor> exec_;
sycl::event start_;
int id_;
};


std::shared_ptr<Timer> get_dpcpp_timer(
std::shared_ptr<const gko::DpcppExecutor> exec)
{
return std::make_shared<DpcppTimer>(exec);
}
25 changes: 15 additions & 10 deletions benchmark/utils/general.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,30 +290,35 @@ void backup_results(rapidjson::Document& results)


// executor mapping
const std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
const std::map<std::string, std::function<std::shared_ptr<gko::Executor>(bool)>>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This parameter should already be available as a global variable, maybe use the global instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it will require the timer.hpp include the general.hpp or include general.hpp earlier than timer.hpp.
it only uses one input so I prefer not introducing another headers hierarchy.

executor_factory{
{"reference", [] { return gko::ReferenceExecutor::create(); }},
{"omp", [] { return gko::OmpExecutor::create(); }},
{"reference", [](bool) { return gko::ReferenceExecutor::create(); }},
{"omp", [](bool) { return gko::OmpExecutor::create(); }},
{"cuda",
[] {
[](bool) {
return gko::CudaExecutor::create(FLAGS_device_id,
gko::OmpExecutor::create(), true);
}},
{"hip",
[] {
[](bool) {
return gko::HipExecutor::create(FLAGS_device_id,
gko::OmpExecutor::create(), true);
}},
{"dpcpp", [] {
return gko::DpcppExecutor::create(FLAGS_device_id,
gko::OmpExecutor::create());
{"dpcpp", [](bool use_gpu_timer) {
auto property = dpcpp_queue_property::in_order;
if (use_gpu_timer) {
property = dpcpp_queue_property::in_order |
dpcpp_queue_property::enable_profiling;
}
return gko::DpcppExecutor::create(
FLAGS_device_id, gko::OmpExecutor::create(), "all", property);
}}};


// returns the appropriate executor, as set by the executor flag
std::shared_ptr<gko::Executor> get_executor()
std::shared_ptr<gko::Executor> get_executor(bool use_gpu_timer)
{
static auto exec = executor_factory.at(FLAGS_executor)();
static auto exec = executor_factory.at(FLAGS_executor)(use_gpu_timer);
return exec;
}

Expand Down
31 changes: 22 additions & 9 deletions benchmark/utils/timer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,22 @@ DEFINE_bool(gpu_timer, false,
"executor is cuda or hip");


#ifdef HAS_CUDA
#ifdef HAS_CUDA_TIMER
std::shared_ptr<Timer> get_cuda_timer(
std::shared_ptr<const gko::CudaExecutor> exec);
#endif // HAS_CUDA
#endif // HAS_CUDA_TIMER


#ifdef HAS_HIP
#ifdef HAS_HIP_TIMER
std::shared_ptr<Timer> get_hip_timer(
std::shared_ptr<const gko::HipExecutor> exec);
#endif // HAS_HIP
#endif // HAS_HIP_TIMER


#ifdef HAS_DPCPP_TIMER
std::shared_ptr<Timer> get_dpcpp_timer(
std::shared_ptr<const gko::DpcppExecutor> exec);
#endif // HAS_DPCPP_TIMER


/**
Expand All @@ -75,21 +81,28 @@ std::shared_ptr<Timer> get_timer(std::shared_ptr<const gko::Executor> exec,
bool use_gpu_timer)
{
if (use_gpu_timer) {
#ifdef HAS_CUDA
#ifdef HAS_CUDA_TIMER
if (auto cuda =
std::dynamic_pointer_cast<const gko::CudaExecutor>(exec)) {
return get_cuda_timer(cuda);
}
#endif // HAS_CUDA
#endif // HAS_CUDA_TIMER

#ifdef HAS_HIP
#ifdef HAS_HIP_TIMER
if (auto hip =
std::dynamic_pointer_cast<const gko::HipExecutor>(exec)) {
return get_hip_timer(hip);
}
#endif // HAS_HIP
#endif // HAS_HIP_TIMER

#ifdef HAS_DPCPP_TIMER
if (auto dpcpp =
std::dynamic_pointer_cast<const gko::DpcppExecutor>(exec)) {
return get_dpcpp_timer(dpcpp);
}
#endif // HAS_DPCPP_TIMER
}
// No cuda/hip executor available or no gpu_timer used
// No cuda/hip/dpcpp executor available or no gpu_timer used
return std::make_shared<CpuTimer>(exec);
}

Expand Down
7 changes: 4 additions & 3 deletions core/device_hooks/dpcpp_hooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,11 @@ version version_info::get_dpcpp_version() noexcept


std::shared_ptr<DpcppExecutor> DpcppExecutor::create(
int device_id, std::shared_ptr<Executor> master, std::string device_type)
int device_id, std::shared_ptr<Executor> master, std::string device_type,
dpcpp_queue_property property)
{
return std::shared_ptr<DpcppExecutor>(
new DpcppExecutor(device_id, std::move(master), device_type));
new DpcppExecutor(device_id, std::move(master), device_type, property));
}


Expand Down Expand Up @@ -123,7 +124,7 @@ void DpcppExecutor::run(const Operation& op) const
int DpcppExecutor::get_num_devices(std::string) { return 0; }


void DpcppExecutor::set_device_property() {}
void DpcppExecutor::set_device_property(dpcpp_queue_property property) {}


bool DpcppExecutor::verify_memory_to(const OmpExecutor* dest_exec) const
Expand Down
23 changes: 19 additions & 4 deletions dpcpp/base/executor.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,11 @@ bool OmpExecutor::verify_memory_to(const DpcppExecutor* dest_exec) const


std::shared_ptr<DpcppExecutor> DpcppExecutor::create(
int device_id, std::shared_ptr<Executor> master, std::string device_type)
int device_id, std::shared_ptr<Executor> master, std::string device_type,
dpcpp_queue_property property)
{
return std::shared_ptr<DpcppExecutor>(
new DpcppExecutor(device_id, std::move(master), device_type));
new DpcppExecutor(device_id, std::move(master), device_type, property));
}


Expand Down Expand Up @@ -234,10 +235,24 @@ void delete_queue(sycl::queue* queue)
}


::cl::sycl::property_list get_property_list(dpcpp_queue_property property)
{
if (property == dpcpp_queue_property::in_order) {
return {sycl::property::queue::in_order{}};
} else if (property == (dpcpp_queue_property::in_order |
dpcpp_queue_property::enable_profiling)) {
return {sycl::property::queue::in_order{},
sycl::property::queue::enable_profiling{}};
} else {
GKO_NOT_SUPPORTED(property);
}
}


} // namespace detail


void DpcppExecutor::set_device_property()
void DpcppExecutor::set_device_property(dpcpp_queue_property property)
{
assert(this->get_exec_info().device_id <
DpcppExecutor::get_num_devices(this->get_exec_info().device_type));
Expand Down Expand Up @@ -277,7 +292,7 @@ void DpcppExecutor::set_device_property()
// `wait()` would be needed after every call to a DPC++ function or kernel.
// For example, without `in_order`, doing a copy, a kernel, and a copy, will
// not necessarily happen in that order by default, which we need to avoid.
auto* queue = new sycl::queue{device, sycl::property::queue::in_order{}};
auto* queue = new sycl::queue{device, detail::get_property_list(property)};
queue_ = std::move(queue_manager<sycl::queue>{queue, detail::delete_queue});
}

Expand Down
Loading