From d40f82ad96a27343532ff01463efb6ad1ce02958 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 16 Feb 2025 20:35:13 +0100 Subject: [PATCH 1/2] add work estimates to profiler_hook logger --- core/log/profiler_hook.cpp | 89 ++++++--- core/log/profiler_hook_summary.cpp | 55 +++++- core/log/profiler_hook_summary_writer.cpp | 226 ++++++++++++++++++---- core/test/log/profiler_hook.cpp | 153 +++++++++++++-- include/ginkgo/core/log/profiler_hook.hpp | 27 ++- 5 files changed, 456 insertions(+), 94 deletions(-) diff --git a/core/log/profiler_hook.cpp b/core/log/profiler_hook.cpp index e3ed0ad8299..41cc350c641 100644 --- a/core/log/profiler_hook.cpp +++ b/core/log/profiler_hook.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -33,7 +34,7 @@ void ProfilerHook::on_allocation_completed(const gko::Executor* exec, const gko::uintptr&) const { this->maybe_synchronize(exec); - this->end_hook_("allocate", profile_event_category::memory); + this->end_hook_("allocate", profile_event_category::memory, {}); } @@ -49,7 +50,7 @@ void ProfilerHook::on_free_completed(const gko::Executor* exec, const gko::uintptr&) const { this->maybe_synchronize(exec); - this->end_hook_("free", profile_event_category::memory); + this->end_hook_("free", profile_event_category::memory, {}); } @@ -67,11 +68,12 @@ void ProfilerHook::on_copy_started(const gko::Executor* from, void ProfilerHook::on_copy_completed(const gko::Executor* from, const gko::Executor* to, const gko::uintptr&, const gko::uintptr&, - const gko::size_type&) const + const gko::size_type& size) const { this->maybe_synchronize(from); this->maybe_synchronize(to); - this->end_hook_("copy", profile_event_category::operation); + this->end_hook_("copy", profile_event_category::operation, + memory_bound_work_estimate{size, size}); } @@ -87,7 +89,8 @@ void ProfilerHook::on_operation_completed(const Executor* exec, const Operation* operation) const { this->maybe_synchronize(exec); - this->end_hook_(operation->get_name(), profile_event_category::operation); + this->end_hook_(operation->get_name(), profile_event_category::operation, + operation->get_work_estimate()); } @@ -109,7 +112,7 @@ void ProfilerHook::on_polymorphic_object_copy_completed( std::stringstream ss; ss << "copy(" << stringify_object(from) << "," << stringify_object(to) << ")"; - this->end_hook_(ss.str().c_str(), profile_event_category::object); + this->end_hook_(ss.str().c_str(), profile_event_category::object, {}); } @@ -131,7 +134,7 @@ void ProfilerHook::on_polymorphic_object_move_completed( std::stringstream ss; ss << "move(" << stringify_object(from) << "," << stringify_object(to) << ")"; - this->end_hook_(ss.str().c_str(), profile_event_category::object); + this->end_hook_(ss.str().c_str(), profile_event_category::object, {}); } @@ -153,9 +156,9 @@ void ProfilerHook::on_linop_apply_completed(const LinOp* A, const LinOp* b, std::stringstream ss; ss << "apply(" << stringify_object(A) << ")"; if (dynamic_cast(A)) { - this->end_hook_("iteration", profile_event_category::solver); + this->end_hook_("iteration", profile_event_category::solver, {}); } - this->end_hook_(ss.str().c_str(), profile_event_category::linop); + this->end_hook_(ss.str().c_str(), profile_event_category::linop, {}); } @@ -183,9 +186,9 @@ void ProfilerHook::on_linop_advanced_apply_completed(const LinOp* A, std::stringstream ss; ss << "advanced_apply(" << stringify_object(A) << ")"; if (dynamic_cast(A)) { - this->end_hook_("iteration", profile_event_category::solver); + this->end_hook_("iteration", profile_event_category::solver, {}); } - this->end_hook_(ss.str().c_str(), profile_event_category::linop); + this->end_hook_(ss.str().c_str(), profile_event_category::linop, {}); } @@ -203,7 +206,7 @@ void ProfilerHook::on_linop_factory_generate_completed( { std::stringstream ss; ss << "generate(" << stringify_object(factory) << ")"; - this->end_hook_(ss.str().c_str(), profile_event_category::factory); + this->end_hook_(ss.str().c_str(), profile_event_category::factory, {}); } @@ -240,7 +243,7 @@ void ProfilerHook::on_criterion_check_completed( { std::stringstream ss; ss << "check(" << stringify_object(criterion) << ")"; - this->end_hook_(ss.str().c_str(), profile_event_category::criterion); + this->end_hook_(ss.str().c_str(), profile_event_category::criterion, {}); } @@ -252,7 +255,7 @@ void ProfilerHook::on_iteration_complete( { if (num_iterations > 0 && dynamic_cast(solver) && !stopped) { - this->end_hook_("iteration", profile_event_category::solver); + this->end_hook_("iteration", profile_event_category::solver, {}); this->begin_hook_("iteration", profile_event_category::solver); } } @@ -292,8 +295,12 @@ void ProfilerHook::set_object_name(ptr_param obj, profiling_scope_guard ProfilerHook::user_range(const char* name) const { - return profiling_scope_guard{name, profile_event_category::user, - begin_hook_, end_hook_}; + return profiling_scope_guard{ + name, profile_event_category::user, begin_hook_, + [end_hook = end_hook_](const char* name, + profile_event_category category) { + end_hook(name, category, {}); + }}; } @@ -303,12 +310,31 @@ void ProfilerHook::set_synchronization(bool synchronize) } +static ProfilerHook::hook_function wrap_work_estimate_hook_function( + ProfilerHook::work_estimate_hook_function end_hook) +{ + return [end_hook](const char* name, profile_event_category category) { + end_hook(name, category, {}); + }; +} + + +static ProfilerHook::work_estimate_hook_function wrap_hook_function( + ProfilerHook::hook_function end_hook) +{ + return [end_hook](const char* name, profile_event_category category, + std::optional) { + end_hook(name, category); + }; +} + + void ProfilerHook::maybe_synchronize(const Executor* exec) const { if (synchronize_) { - profiling_scope_guard sync_guard{"synchronize", - profile_event_category::internal, - begin_hook_, end_hook_}; + profiling_scope_guard sync_guard{ + "synchronize", profile_event_category::internal, begin_hook_, + wrap_work_estimate_hook_function(end_hook_)}; exec->synchronize(); } } @@ -327,7 +353,7 @@ std::string ProfilerHook::stringify_object(const PolymorphicObject* obj) const } -ProfilerHook::ProfilerHook(hook_function begin, hook_function end) +ProfilerHook::ProfilerHook(hook_function begin, work_estimate_hook_function end) : synchronize_{false}, begin_hook_{begin}, end_hook_{end} {} @@ -355,30 +381,27 @@ std::shared_ptr ProfilerHook::create_tau(bool initialize) new int, tau_finalize_deleter{}}; } } - return std::shared_ptr{new ProfilerHook{begin_tau, end_tau}}; + return create_custom(begin_tau, end_tau); } std::shared_ptr ProfilerHook::create_vtune() { auto fns = create_vtune_fns(); - return std::shared_ptr{ - new ProfilerHook{std::move(fns.first), std::move(fns.second)}}; + return create_custom(std::move(fns.first), std::move(fns.second)); } std::shared_ptr ProfilerHook::create_nvtx(uint32 color_rgb) { init_nvtx(); - return std::shared_ptr{ - new ProfilerHook{begin_nvtx_fn(color_rgb), end_nvtx}}; + return create_custom(begin_nvtx_fn(color_rgb), end_nvtx); } std::shared_ptr ProfilerHook::create_roctx() { - return std::shared_ptr{ - new ProfilerHook{begin_roctx, end_roctx}}; + return create_custom(begin_roctx, end_roctx); } @@ -400,10 +423,18 @@ std::shared_ptr ProfilerHook::create_for_executor( } +std::shared_ptr ProfilerHook::create_custom( + hook_function begin, work_estimate_hook_function end) +{ + return std::shared_ptr{ + new ProfilerHook{begin, std::move(end)}}; +} + + std::shared_ptr ProfilerHook::create_custom(hook_function begin, hook_function end) { - return std::shared_ptr{new ProfilerHook{begin, end}}; + return create_custom(begin, wrap_hook_function(std::move(end))); } diff --git a/core/log/profiler_hook_summary.cpp b/core/log/profiler_hook_summary.cpp index 02641bdfd29..865ac9e6682 100644 --- a/core/log/profiler_hook_summary.cpp +++ b/core/log/profiler_hook_summary.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -8,6 +8,8 @@ #include #include +#include + #include "core/log/profiler_hook.hpp" @@ -95,8 +97,36 @@ void pop_all(Summary& s) << "\"\n"; #endif } - s.pop(s.get_top_name().c_str(), true); + s.pop(s.get_top_name().c_str(), {}, true); + } +} + + +std::optional accumulate_work_estimate( + const char* name, std::optional existing, + std::optional work, bool first) +{ + if (first) { + GKO_ASSERT(!existing.has_value()); + return work; + } + if (work.has_value() != existing.has_value()) { +#if GKO_VERBOSE_LEVEL >= 1 + // this can happen if some parts use GKO_REGISTER_OPERATION + // and some use GKO_REGISTER_OPERATION_WITH_WORK_ESTIMATE + std::cerr << "WARNING: Operation " << name + << " has inconsistent work estimate - some kernel calls " + "provide an estimate, but others don't.\n"; +#endif + } + if (work.has_value() && existing.has_value()) { + // if they have different types, this throws an exception + return *existing + *work; + } + if (existing.has_value()) { + return existing; } + return {}; } @@ -167,7 +197,8 @@ struct summary : summary_base { overhead += cpu_clock::now() - cpu_now; } - void pop(const char* name, bool allow_pop_root = false) + void pop(const char* name, std::optional work, + bool allow_pop_root = false) { const auto cpu_now = cpu_clock::now(); std::lock_guard guard{mutex}; @@ -187,6 +218,8 @@ struct summary : summary_base { const auto elapsed = timer->difference_async(partial_entry.second, now); release_time_point(std::move(partial_entry.second)); release_time_point(std::move(now)); + entry.work_estimate = accumulate_work_estimate( + name, entry.work_estimate, work, entry.count == 0); entry.count++; entry.inclusive += elapsed; entry.exclusive += elapsed; @@ -211,6 +244,7 @@ struct nested_summary : summary_base { int64 parent_id; std::chrono::nanoseconds elapsed{}; int64 count{}; + std::optional work_estimate{}; entry(int64 name_id, int64 node_id, int64 parent_id) : name_id{name_id}, node_id{node_id}, parent_id{parent_id} @@ -291,7 +325,8 @@ struct nested_summary : summary_base { overhead += cpu_clock::now() - cpu_now; } - void pop(const char* name, bool allow_pop_root = false) + void pop(const char* name, std::optional work, + bool allow_pop_root = false) { const auto cpu_now = cpu_clock::now(); std::lock_guard guard{mutex}; @@ -311,6 +346,8 @@ struct nested_summary : summary_base { const auto elapsed = timer->difference_async(partial_entry.start, now); release_time_point(std::move(partial_entry.start)); release_time_point(std::move(now)); + node.work_estimate = accumulate_work_estimate(name, node.work_estimate, + work, node.count == 0); node.count++; node.elapsed += elapsed; const auto cpu_now4 = cpu_clock::now(); @@ -387,7 +424,10 @@ std::shared_ptr ProfilerHook::create_summary( data->check_nesting = debug_check_nesting; return std::shared_ptr{new ProfilerHook{ [data](const char* name, profile_event_category) { data->push(name); }, - [data](const char* name, profile_event_category) { data->pop(name); }}}; + [data](const char* name, profile_event_category, + std::optional work) { + data->pop(name, work); + }}}; } @@ -409,7 +449,10 @@ std::shared_ptr ProfilerHook::create_nested_summary( data->check_nesting = debug_check_nesting; return std::shared_ptr{new ProfilerHook{ [data](const char* name, profile_event_category) { data->push(name); }, - [data](const char* name, profile_event_category) { data->pop(name); }}}; + [data](const char* name, profile_event_category, + std::optional work) { + data->pop(name, work); + }}}; } diff --git a/core/log/profiler_hook_summary_writer.cpp b/core/log/profiler_hook_summary_writer.cpp index dd39687ffe4..eead9a36546 100644 --- a/core/log/profiler_hook_summary_writer.cpp +++ b/core/log/profiler_hook_summary_writer.cpp @@ -1,10 +1,13 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause +#include #include +#include #include +#include #include @@ -52,6 +55,74 @@ std::string format_fraction(std::chrono::nanoseconds part, } +std::pair unit_scale(double value) +{ + static std::array ranges{ + {9.999e2, 9.999e5, 9.999e8, 9.999e11, + std::numeric_limits::infinity()}}; + static std::array scales{{1, 1e3, 1e6, 1e9, 1e12}}; + static std::array units{ + {" ", "k", "M", "G", "T", "error!"}}; + auto unit = std::distance( + ranges.begin(), std::lower_bound(ranges.begin(), ranges.end(), value)); + return std::make_pair(units[unit], value / scales[unit]); +} + + +std::string format_work_estimate(memory_bound_work_estimate work, + std::chrono::nanoseconds time) +{ + const auto memory = work.bytes_read + work.bytes_written; + const auto rate = + double(memory) / std::max(double(time.count()), 1.0) * 1e9; + std::stringstream ss; + ss << std::setprecision(1) << std::fixed; + const auto [unit, value] = unit_scale(rate); + ss << value << ' ' << unit << "B/s"; + return ss.str(); +} + + +std::string format_work_estimate(compute_bound_work_estimate work, + std::chrono::nanoseconds time) +{ + const auto rate = + double(work.flops) / std::max(double(time.count()), 1.0) * 1e9; + std::stringstream ss; + ss << std::setprecision(1) << std::fixed; + const auto [unit, value] = unit_scale(rate); + ss << value << ' ' << unit << "FLOPS/s"; + return ss.str(); +} + + +std::string format_work_estimate(custom_work_estimate work, + std::chrono::nanoseconds time) +{ + const auto rate = + double(work.operations) / std::max(double(time.count()), 1.0) * 1e9; + std::stringstream ss; + ss << std::setprecision(1) << std::fixed; + const auto [unit, value] = unit_scale(rate); + ss << value << ' ' << unit << work.operation_count_name << "/s"; + return ss.str(); +} + + +std::string format_work_estimate(std::optional work, + std::chrono::nanoseconds time) +{ + if (!work.has_value()) { + return ""; + } + return std::visit( + [time](auto work) -> std::string { + return format_work_estimate(work, time); + }, + *work); +} + + template void print_table(const std::array& headers, const std::vector>& table, @@ -95,8 +166,11 @@ void print_table(const std::array& headers, ProfilerHook::TableSummaryWriter::TableSummaryWriter(std::ostream& output, - std::string header) - : output_{&output}, header_{std::move(header)} + std::string header, + bool use_work_estimates) + : output_{&output}, + header_{std::move(header)}, + use_work_estimates_{use_work_estimates} {} @@ -104,41 +178,75 @@ void ProfilerHook::TableSummaryWriter::write( const std::vector& entries, std::chrono::nanoseconds overhead) { - (*output_) << header_ << '\n' - << "Overhead estimate " << format_duration(overhead) << '\n'; auto sorted_entries = entries; std::sort(sorted_entries.begin(), sorted_entries.end(), [](const summary_entry& lhs, const summary_entry& rhs) { // reverse-sort by inclusive total time return lhs.inclusive > rhs.inclusive; }); - std::vector> table; - std::array headers({" name ", " total ", " total (self) ", - " count ", " avg ", " avg (self) "}); - for (const auto& entry : sorted_entries) { - table.emplace_back(std::array{ - " " + entry.name + " ", - " " + format_duration(entry.inclusive) + " ", - " " + format_duration(entry.exclusive) + " ", - " " + std::to_string(entry.count) + " ", - " " + format_avg_duration(entry.inclusive, entry.count) + " ", - " " + format_avg_duration(entry.exclusive, entry.count) + " "}); + (*output_) << header_ << '\n'; + if (use_work_estimates_) { + // compute how much of the total runtime is accounted for by work + // estimates + const auto total_it = std::find_if( + sorted_entries.begin(), sorted_entries.end(), + [](const summary_entry& entry) { return entry.name == "total"; }); + const auto total_time = total_it == sorted_entries.end() + ? std::chrono::nanoseconds{} + : total_it->inclusive; + std::chrono::nanoseconds time_with_work_estimates{}; + // then output the table + std::vector> table; + std::array headers( + {" name ", " total ", " total (self) ", " count ", " avg ", + " avg (self) ", " performance "}); + for (const auto& entry : sorted_entries) { + table.emplace_back(std::array{ + " " + entry.name + " ", + " " + format_duration(entry.inclusive) + " ", + " " + format_duration(entry.exclusive) + " ", + " " + std::to_string(entry.count) + " ", + " " + format_avg_duration(entry.inclusive, entry.count) + " ", + " " + format_avg_duration(entry.exclusive, entry.count) + " ", + " " + + format_work_estimate(entry.work_estimate, entry.exclusive) + + " "}); + if (entry.work_estimate.has_value()) { + time_with_work_estimates += entry.exclusive; + } + } + print_table(headers, table, *output_); + (*output_) << "Overhead estimate " << format_duration(overhead) << '\n' + << "Work estimates available for " + << format_fraction(time_with_work_estimates, total_time) + << " of runtime\n"; + } else { + std::vector> table; + std::array headers({" name ", " total ", + " total (self) ", " count ", + " avg ", " avg (self) "}); + for (const auto& entry : sorted_entries) { + table.emplace_back(std::array{ + " " + entry.name + " ", + " " + format_duration(entry.inclusive) + " ", + " " + format_duration(entry.exclusive) + " ", + " " + std::to_string(entry.count) + " ", + " " + format_avg_duration(entry.inclusive, entry.count) + " ", + " " + format_avg_duration(entry.exclusive, entry.count) + " "}); + } + print_table(headers, table, *output_); + (*output_) << "Overhead estimate " << format_duration(overhead) << '\n'; } - print_table(headers, table, *output_); } void ProfilerHook::TableSummaryWriter::write_nested( const nested_summary_entry& root, std::chrono::nanoseconds overhead) { - (*output_) << header_ << '\n' - << "Overhead estimate " << format_duration(overhead) << '\n'; - std::vector> table; - std::array headers( - {" name ", " total ", " fraction ", " count ", " avg "}); - auto visitor = [&table](auto visitor, const nested_summary_entry& node, - std::chrono::nanoseconds parent_elapsed, - std::size_t depth) -> void { + (*output_) << header_ << '\n'; + auto visitor = [](auto visitor, const nested_summary_entry& node, + std::chrono::nanoseconds parent_elapsed, + std::size_t depth, const auto& emplace_op) -> void { std::vector child_permutation(node.children.size()); const auto total_child_duration = std::accumulate(node.children.begin(), node.children.end(), @@ -155,12 +263,7 @@ void ProfilerHook::TableSummaryWriter::write_nested( return node.children[lhs].elapsed > node.children[rhs].elapsed; }); - table.emplace_back(std::array{ - std::string(2 * depth + 1, ' ') + node.name + " ", - " " + format_duration(node.elapsed) + " ", - format_fraction(node.elapsed, parent_elapsed) + " ", - " " + std::to_string(node.count) + " ", - " " + format_avg_duration(node.elapsed, node.count) + " "}); + emplace_op(depth, node, parent_elapsed, self_duration); nested_summary_entry self_entry{ "(self)", self_duration, node.count, {}}; bool printed_self = false; @@ -168,18 +271,69 @@ void ProfilerHook::TableSummaryWriter::write_nested( // print (self) entry before smaller entry ... if (!printed_self && node.children[child_id].elapsed < self_duration) { - visitor(visitor, self_entry, node.elapsed, depth + 1); + visitor(visitor, self_entry, node.elapsed, depth + 1, + emplace_op); printed_self = true; } - visitor(visitor, node.children[child_id], node.elapsed, depth + 1); + visitor(visitor, node.children[child_id], node.elapsed, depth + 1, + emplace_op); } // ... or at the end if (!printed_self && !node.children.empty()) { - visitor(visitor, self_entry, node.elapsed, depth + 1); + visitor(visitor, self_entry, node.elapsed, depth + 1, emplace_op); } }; - visitor(visitor, root, root.elapsed, 0); - print_table(headers, table, *output_); + if (use_work_estimates_) { + std::vector> table; + std::array headers({" name ", " total ", " fraction ", + " count ", " avg ", + " performance "}); + std::chrono::nanoseconds time_with_work_estimates{}; + const auto total_time = root.elapsed; + visitor( + visitor, root, root.elapsed, 0, + [&table, &time_with_work_estimates, &total_time]( + std::size_t depth, const nested_summary_entry& node, + std::chrono::nanoseconds parent_elapsed, + std::chrono::nanoseconds self_duration) { + table.emplace_back(std::array{ + std::string(2 * depth + 1, ' ') + node.name + " ", + " " + format_duration(node.elapsed) + " ", + format_fraction(node.elapsed, parent_elapsed) + " ", + " " + std::to_string(node.count) + " ", + " " + format_avg_duration(node.elapsed, node.count) + " ", + " " + + format_work_estimate(node.work_estimate, + self_duration) + + " "}); + if (node.work_estimate.has_value()) { + time_with_work_estimates += self_duration; + } + }); + print_table(headers, table, *output_); + (*output_) << "Overhead estimate " << format_duration(overhead) << '\n' + << "Work estimates available for " + << format_fraction(time_with_work_estimates, total_time) + << " of runtime\n"; + } else { + std::vector> table; + std::array headers( + {" name ", " total ", " fraction ", " count ", " avg "}); + visitor( + visitor, root, root.elapsed, 0, + [&table](std::size_t depth, const nested_summary_entry& node, + std::chrono::nanoseconds parent_elapsed, + std::chrono::nanoseconds self_duration) { + table.emplace_back(std::array{ + std::string(2 * depth + 1, ' ') + node.name + " ", + " " + format_duration(node.elapsed) + " ", + format_fraction(node.elapsed, parent_elapsed) + " ", + " " + std::to_string(node.count) + " ", + " " + format_avg_duration(node.elapsed, node.count) + " "}); + }); + print_table(headers, table, *output_); + (*output_) << "Overhead estimate " << format_duration(overhead) << '\n'; + } } diff --git a/core/test/log/profiler_hook.cpp b/core/test/log/profiler_hook.cpp index ce74e526879..f58ece896c3 100644 --- a/core/test/log/profiler_hook.cpp +++ b/core/test/log/profiler_hook.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -17,16 +18,44 @@ #include "core/test/utils.hpp" +std::string to_string(gko::memory_bound_work_estimate work) +{ + return "MB(" + std::to_string(work.bytes_read) + "," + + std::to_string(work.bytes_written) + ")"; +} + +std::string to_string(gko::compute_bound_work_estimate work) +{ + return "MB(" + std::to_string(work.flops) + ")"; +} + +std::string to_string(gko::custom_work_estimate work) +{ + return work.operation_count_name + "(" + std::to_string(work.operations) + + ")"; +} + + std::pair + gko::log::ProfilerHook::work_estimate_hook_function> make_hooks(std::vector& output) { return std::make_pair( [&output](const char* msg, gko::log::profile_event_category) { output.push_back(std::string{"begin:"} + msg); }, - [&output](const char* msg, gko::log::profile_event_category) { - output.push_back(std::string{"end:"} + msg); + [&output](const char* msg, gko::log::profile_event_category, + std::optional work) { + if (work.has_value()) { + output.push_back( + std::string{"end:"} + msg + "(" + + std::visit([](auto work) { return to_string(work); }, + *work) + + ")"); + + } else { + output.push_back(std::string{"end:"} + msg); + } }); } @@ -43,15 +72,21 @@ class DummyOperation : public gko::Operation { void run(std::shared_ptr) const override {} + std::optional get_work_estimate() const override + { + return gko::memory_bound_work_estimate{100, 1}; + } + const char* get_name() const noexcept override { return "op"; } }; TEST(ProfilerHook, LogsAllocateCopyOperation) { - std::vector expected{ - "begin:allocate", "end:allocate", "begin:copy", "end:copy", - "begin:op", "end:op", "begin:free", "end:free"}; + std::vector expected{"begin:allocate", "end:allocate", + "begin:copy", "end:copy(MB(4,4))", + "begin:op", "end:op(MB(100,1))", + "begin:free", "end:free"}; std::vector output; auto hooks = make_hooks(output); auto exec = gko::ReferenceExecutor::create(); @@ -110,15 +145,15 @@ TEST(ProfilerHook, LogsPolymorphicObjectLinOp) "end:move(obj_copy,obj)", "begin:apply(obj)", "begin:op", - "end:op", + "end:op(MB(100,1))", "end:apply(obj)", "begin:advanced_apply(obj)", "begin:op", - "end:op", + "end:op(MB(100,1))", "end:advanced_apply(obj)", "begin:generate(obj_factory)", "begin:op", - "end:op", + "end:op(MB(100,1))", "end:generate(obj_factory)", "begin:check(nullptr)", "end:check(nullptr)"}; @@ -387,9 +422,7 @@ TEST(ProfilerHookTableSummaryWriter, SummaryWorks) entries.push_back({"medium", 1ms, 500us, 4}); // check division by count entries.push_back({"long", 120s, 60s, 1}); entries.push_back({"eternal", 24h, 24h, 1}); - // clang-format off const auto expected = R"(Test header -Overhead estimate 1.0 ns | name | total | total (self) | count | avg | avg (self) | |----------|-------:|-------------:|------:|---------:|-----------:| | eternal | 1.0 d | 1.0 d | 1 | 1.0 d | 1.0 d | @@ -398,8 +431,49 @@ Overhead estimate 1.0 ns | shortish | 1.2 us | 1.0 us | 1 | 1.2 us | 1.0 us | | short | 1.0 ns | 0.0 ns | 1 | 1.0 ns | 0.0 ns | | empty | 0.0 ns | 0.0 ns | 0 | 0.0 ns | 0.0 ns | +Overhead estimate 1.0 ns +)"; + + writer.write(entries, 1ns); + + ASSERT_EQ(ss.str(), expected); +} + + +TEST(ProfilerHookTableSummaryWriter, SummaryWorksWithWorkEstimate) +{ + using gko::log::ProfilerHook; + using mb_estimate = gko::memory_bound_work_estimate; + using cb_estimate = gko::compute_bound_work_estimate; + using cust_estimate = gko::custom_work_estimate; + using namespace std::chrono_literals; + std::stringstream ss; + ProfilerHook::TableSummaryWriter writer(ss, "Test header", true); + std::vector entries; + entries.push_back({"total", 2s, 0ns, 1}); + entries.push_back({"a", 1s, 1s, 1, mb_estimate{1, 0}}); + entries.push_back({"b", 1ms, 1ms, 1, mb_estimate{1, 0}}); + entries.push_back({"c", 1us, 1us, 1, mb_estimate{1, 0}}); + entries.push_back({"d", 1ns, 1ns, 1, mb_estimate{1, 2}}); + entries.push_back({"h", 1ns, 1ns, 1, mb_estimate{1000, 0}}); + entries.push_back({"e", 1ns, 1ns, 1, mb_estimate{999'900, 0}}); + entries.push_back({"f", 1ns, 1ns, 1, mb_estimate{999'990, 0}}); + entries.push_back({"g", 1ns, 1ns, 1, cust_estimate{"NOPs", 1000'000}}); + const auto expected = R"(Test header +| name | total | total (self) | count | avg | avg (self) | performance | +|-------|-------:|-------------:|------:|-------:|-----------:|---------------:| +| total | 2.0 s | 0.0 ns | 1 | 2.0 s | 0.0 ns | | +| a | 1.0 s | 1.0 s | 1 | 1.0 s | 1.0 s | 1.0 B/s | +| b | 1.0 ms | 1.0 ms | 1 | 1.0 ms | 1.0 ms | 1.0 kB/s | +| c | 1.0 us | 1.0 us | 1 | 1.0 us | 1.0 us | 1.0 MB/s | +| d | 1.0 ns | 1.0 ns | 1 | 1.0 ns | 1.0 ns | 3.0 GB/s | +| h | 1.0 ns | 1.0 ns | 1 | 1.0 ns | 1.0 ns | 1.0 TB/s | +| e | 1.0 ns | 1.0 ns | 1 | 1.0 ns | 1.0 ns | 999.9 TB/s | +| f | 1.0 ns | 1.0 ns | 1 | 1.0 ns | 1.0 ns | 1000.0 TB/s | +| g | 1.0 ns | 1.0 ns | 1 | 1.0 ns | 1.0 ns | 1000.0 TNOPs/s | +Overhead estimate 1.0 ns +Work estimates available for 50.1 % of runtime )"; - // clang-format on writer.write(entries, 1ns); @@ -417,16 +491,16 @@ TEST(ProfilerHookTableSummaryWriter, NestedSummaryWorks) "root", 2us, 1, - {ProfilerHook::nested_summary_entry{"foo", 100ns, 5, {}}, + {}, + {ProfilerHook::nested_summary_entry{"foo", 100ns, 5}, ProfilerHook::nested_summary_entry{ "bar", 1000ns, 2, - {ProfilerHook::nested_summary_entry{"child", 100ns, 2, {}}}}, - ProfilerHook::nested_summary_entry{"baz", 1ns, 2, {}}}}; - // clang-format off + {}, + {ProfilerHook::nested_summary_entry{"child", 100ns, 2}}}, + ProfilerHook::nested_summary_entry{"baz", 1ns, 2}}}; const auto expected = R"(Test header -Overhead estimate 1.0 ns | name | total | fraction | count | avg | |------------|---------:|---------:|------:|---------:| | root | 2.0 us | 100.0 % | 1 | 2.0 us | @@ -436,8 +510,49 @@ Overhead estimate 1.0 ns | (self) | 899.0 ns | 45.0 % | 1 | 899.0 ns | | foo | 100.0 ns | 5.0 % | 5 | 20.0 ns | | baz | 1.0 ns | 0.1 % | 2 | 0.0 ns | +Overhead estimate 1.0 ns +)"; + + writer.write_nested(entry, 1ns); + + ASSERT_EQ(ss.str(), expected); +} + + +TEST(ProfilerHookTableSummaryWriter, NestedSummaryWorksWithWorkEstimate) +{ + using gko::log::ProfilerHook; + using namespace std::chrono_literals; + std::stringstream ss; + ProfilerHook::TableSummaryWriter writer(ss, "Test header", true); + ProfilerHook::nested_summary_entry entry{ + "root", + 2us, + 1, + {}, + {ProfilerHook::nested_summary_entry{ + "foo", 100ns, 5, gko::custom_work_estimate{"Foos", 100}}, + ProfilerHook::nested_summary_entry{ + "bar", + 1000ns, + 2, + gko::compute_bound_work_estimate{900}, + {ProfilerHook::nested_summary_entry{ + "child", 100ns, 2, gko::memory_bound_work_estimate{100, 0}}}}, + ProfilerHook::nested_summary_entry{"baz", 1ns, 2}}}; + const auto expected = R"(Test header +| name | total | fraction | count | avg | performance | +|------------|---------:|---------:|------:|---------:|-------------:| +| root | 2.0 us | 100.0 % | 1 | 2.0 us | | +| bar | 1.0 us | 50.0 % | 2 | 500.0 ns | 1.0 GFLOPS/s | +| (self) | 900.0 ns | 90.0 % | 2 | 450.0 ns | | +| child | 100.0 ns | 10.0 % | 2 | 50.0 ns | 1.0 GB/s | +| (self) | 899.0 ns | 45.0 % | 1 | 899.0 ns | | +| foo | 100.0 ns | 5.0 % | 5 | 20.0 ns | 1.0 GFoos/s | +| baz | 1.0 ns | 0.1 % | 2 | 0.0 ns | | +Overhead estimate 1.0 ns +Work estimates available for 55.0 % of runtime )"; - // clang-format on writer.write_nested(entry, 1ns); diff --git a/include/ginkgo/core/log/profiler_hook.hpp b/include/ginkgo/core/log/profiler_hook.hpp index c5dc9dcbab6..44277dd858f 100644 --- a/include/ginkgo/core/log/profiler_hook.hpp +++ b/include/ginkgo/core/log/profiler_hook.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -11,6 +11,7 @@ #include #include +#include #include @@ -58,6 +59,9 @@ class ProfilerHook : public Logger { public: using hook_function = std::function; + using work_estimate_hook_function = + std::function)>; void on_allocation_started(const gko::Executor* exec, const gko::size_type&) const override; @@ -257,6 +261,8 @@ class ProfilerHook : public Logger { std::chrono::nanoseconds exclusive{0}; /** The total number of invocations of the range. */ int64 count{}; + /** Analytical estimate for how much work the operation did. */ + std::optional work_estimate{}; }; struct nested_summary_entry { @@ -266,6 +272,8 @@ class ProfilerHook : public Logger { std::chrono::nanoseconds elapsed{0}; /** The total number of invocations of the range. */ int64 count{}; + /** Analytical estimate for how much work the operation did. */ + std::optional work_estimate{}; /** The nested ranges inside this range. */ std::vector children{}; }; @@ -313,9 +321,12 @@ class ProfilerHook : public Logger { * * @param output the output stream to write the table to. * @param header the header to write above the table. + * @param use_work_estimates set to true to compute achieved + * bandwidth/FLOP estimates. */ TableSummaryWriter(std::ostream& output = std::cerr, - std::string header = "Runtime summary"); + std::string header = "Runtime summary", + bool use_work_estimates = false); void write(const std::vector& entries, std::chrono::nanoseconds overhead) override; @@ -324,6 +335,7 @@ class ProfilerHook : public Logger { std::chrono::nanoseconds overhead) override; private: + bool use_work_estimates_; std::ostream* output_; std::string header_; }; @@ -380,8 +392,15 @@ class ProfilerHook : public Logger { static std::shared_ptr create_custom(hook_function begin, hook_function end); + /** + * Creates a logger annotating Ginkgo events with a custom set of functions + * for range begin and end with support for kernel work estimates. + */ + static std::shared_ptr create_custom( + hook_function begin, work_estimate_hook_function end); + private: - ProfilerHook(hook_function begin, hook_function end); + ProfilerHook(hook_function begin, work_estimate_hook_function end); void maybe_synchronize(const Executor* exec) const; @@ -390,7 +409,7 @@ class ProfilerHook : public Logger { std::unordered_map name_map_; bool synchronize_; hook_function begin_hook_; - hook_function end_hook_; + work_estimate_hook_function end_hook_; }; From 9ac9b8f08daece467447dc5498943eda28e7999b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 16 Feb 2025 20:27:31 +0100 Subject: [PATCH 2/2] test result accumulation --- core/log/profiler_hook_summary.cpp | 1 + core/test/log/profiler_hook.cpp | 40 ++++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/core/log/profiler_hook_summary.cpp b/core/log/profiler_hook_summary.cpp index 865ac9e6682..ce528929e76 100644 --- a/core/log/profiler_hook_summary.cpp +++ b/core/log/profiler_hook_summary.cpp @@ -393,6 +393,7 @@ ProfilerHook::nested_summary_entry build_tree(const nested_summary& summary) entry.name = summary.names[summary_node.name_id]; entry.elapsed = summary_node.elapsed; entry.count = summary_node.count; + entry.work_estimate = summary_node.work_estimate; const auto child_range = child_ranges[summary_node.node_id]; for (auto i = child_range.first; i < child_range.second; i++) { entry.children.emplace_back(); diff --git a/core/test/log/profiler_hook.cpp b/core/test/log/profiler_hook.cpp index f58ece896c3..441e7938e06 100644 --- a/core/test/log/profiler_hook.cpp +++ b/core/test/log/profiler_hook.cpp @@ -276,6 +276,11 @@ void call_ranges_unique(std::shared_ptr logger) } } auto range6 = logger->user_range("bazzzz"); + // an operation that requires accumulation of work estimates + logger->on_copy_started(nullptr, nullptr, 0, 0, 10); + logger->on_copy_completed(nullptr, nullptr, 0, 0, 10); + logger->on_copy_started(nullptr, nullptr, 0, 0, 35); + logger->on_copy_completed(nullptr, nullptr, 0, 0, 35); } struct TestSummaryWriter : gko::log::ProfilerHook::SummaryWriter { @@ -291,11 +296,14 @@ struct TestSummaryWriter : gko::log::ProfilerHook::SummaryWriter { * bazz() * bazzz() * ) - * bazzzz() + * bazzzz( + * copy() + * copy() + * ) * ) * ) */ - ASSERT_EQ(e.size(), 7); + ASSERT_EQ(e.size(), 8); ASSERT_EQ(e[0].name, "total"); ASSERT_EQ(e[0].count, 1); ASSERT_EQ(e[1].name, "foo"); @@ -310,6 +318,8 @@ struct TestSummaryWriter : gko::log::ProfilerHook::SummaryWriter { ASSERT_EQ(e[5].count, 1); ASSERT_EQ(e[6].name, "bazzzz"); ASSERT_EQ(e[6].count, 1); + ASSERT_EQ(e[7].name, "copy"); + ASSERT_EQ(e[7].count, 2); ASSERT_EQ(e[0].inclusive, e[0].exclusive + e[1].inclusive); ASSERT_EQ(e[1].inclusive, e[1].exclusive + e[2].inclusive + e[3].inclusive + e[6].inclusive); @@ -318,7 +328,12 @@ struct TestSummaryWriter : gko::log::ProfilerHook::SummaryWriter { e[3].exclusive + e[4].inclusive + e[5].inclusive); ASSERT_EQ(e[4].inclusive, e[4].exclusive); ASSERT_EQ(e[5].inclusive, e[5].exclusive); - ASSERT_EQ(e[6].inclusive, e[6].exclusive); + ASSERT_EQ(e[6].inclusive, e[6].exclusive + e[7].inclusive); + ASSERT_EQ(e[7].inclusive, e[7].exclusive); + const auto work_estimate = std::get( + e[7].work_estimate.value()); + ASSERT_EQ(work_estimate.bytes_read, 45); + ASSERT_EQ(work_estimate.bytes_written, 45); } }; @@ -353,6 +368,11 @@ void call_ranges(std::shared_ptr logger) } } auto range6 = logger->user_range("baz"); + // an operation that requires accumulation of work estimates + logger->on_copy_started(nullptr, nullptr, 0, 0, 10); + logger->on_copy_completed(nullptr, nullptr, 0, 0, 10); + logger->on_copy_started(nullptr, nullptr, 0, 0, 35); + logger->on_copy_completed(nullptr, nullptr, 0, 0, 35); } @@ -369,7 +389,10 @@ struct TestNestedSummaryWriter : gko::log::ProfilerHook::NestedSummaryWriter { * baz() * bazz() * ) - * baz() + * baz( + * copy() + * copy() + * ) * ) * ) */ @@ -388,12 +411,19 @@ struct TestNestedSummaryWriter : gko::log::ProfilerHook::NestedSummaryWriter { ASSERT_EQ(f.children[1].children.size(), 2); ASSERT_EQ(f.children[2].name, "baz"); ASSERT_EQ(f.children[2].count, 1); - ASSERT_EQ(f.children[2].children.size(), 0); + ASSERT_EQ(f.children[2].children.size(), 1); auto& b = f.children[1]; ASSERT_EQ(b.children[0].name, "baz"); ASSERT_EQ(b.children[0].count, 1); ASSERT_EQ(b.children[1].name, "bazz"); ASSERT_EQ(b.children[1].count, 1); + auto& bb = f.children[2]; + ASSERT_EQ(bb.children[0].name, "copy"); + ASSERT_EQ(bb.children[0].count, 2); + const auto work_estimate = std::get( + bb.children[0].work_estimate.value()); + ASSERT_EQ(work_estimate.bytes_read, 45); + ASSERT_EQ(work_estimate.bytes_written, 45); } };