From 5e6cc71b96e8d24b031e75935577975e904da422 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Sun, 9 Feb 2025 02:56:22 -0500 Subject: [PATCH 1/9] Initial implementation --- cpp/CMakeLists.txt | 1 + cpp/include/kvikio/nvtx.hpp | 188 ++++++++++++++++++++++ cpp/include/kvikio/parallel_operation.hpp | 45 ++++-- cpp/include/kvikio/posix_io.hpp | 1 + cpp/include/kvikio/utils.hpp | 112 ------------- cpp/src/file_handle.cpp | 47 +++++- cpp/src/nvtx.cpp | 84 ++++++++++ cpp/src/posix_io.cpp | 1 + cpp/src/remote_handle.cpp | 1 + cpp/src/utils.cpp | 5 - cpp/tests/test_basic_io.cpp | 88 +++++----- 11 files changed, 392 insertions(+), 181 deletions(-) create mode 100644 cpp/include/kvikio/nvtx.hpp create mode 100644 cpp/src/nvtx.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4c1daa58c6..fc59f35591 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -144,6 +144,7 @@ set(SOURCES "src/error.cpp" "src/file_handle.cpp" "src/file_utils.cpp" + "src/nvtx.cpp" "src/posix_io.cpp" "src/shim/cuda.cpp" "src/shim/cufile.cpp" diff --git a/cpp/include/kvikio/nvtx.hpp b/cpp/include/kvikio/nvtx.hpp new file mode 100644 index 0000000000..8b1e7b0c8d --- /dev/null +++ b/cpp/include/kvikio/nvtx.hpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#ifdef KVIKIO_CUDA_FOUND +#include +#endif + +#include +#include + +namespace kvikio { + +#ifdef KVIKIO_CUDA_FOUND +/** + * @brief Tag type for libkvikio's NVTX domain. + */ +struct libkvikio_domain { + static constexpr char const* name{"libkvikio"}; +}; + +struct libkvikio_category_default { + static constexpr char const* name{"default"}; + static constexpr uint32_t id{1}; +}; + +struct libkvikio_category_thread_pool_task { + static constexpr char const* name{"thread pool tasks"}; + static constexpr uint32_t id{2}; +}; + +using kvikio_nvtx_scoped_range = nvtx3::scoped_range_in; +using kvikio_nvtx_registered_string = nvtx3::registered_string_in; +using kvikio_nvtx_named_category = nvtx3::named_category_in; + +// Macro to concatenate two tokens x and y. +#define KVIKIO_CONCAT_HELPER(x, y) x##y +#define KVIKIO_CONCAT(x, y) KVIKIO_CONCAT_HELPER(x, y) + +// Macro to create a static, registered string that will not have a name conflict with any +// registered string defined in the same scope. +#define KVIKIO_REGISTER_STRING(msg) \ + [](const char* a_msg) -> auto& { \ + static kvikio_nvtx_registered_string a_reg_str{a_msg}; \ + return a_reg_str; \ + }(msg) + +// Macro overloads of KVIKIO_NVTX_FUNC_RANGE +#define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(libkvikio_domain) + +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_4(msg, nvtx_payload, nvtx_color, nvtx_category) \ + kvikio::kvikio_nvtx_scoped_range KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ + { \ + nvtx3::event_attributes \ + { \ + KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(nvtx_payload)}, nvtx_color, \ + nvtx_category \ + } \ + } +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(msg, nvtx_payload, nvtx_color) \ + KVIKIO_NVTX_SCOPED_RANGE_IMPL_4( \ + msg, \ + nvtx_payload, \ + nvtx_color, \ + kvikio::kvikio_nvtx_named_category::get()) +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_2(msg, nvtx_payload) \ + KVIKIO_NVTX_SCOPED_RANGE_IMPL_3( \ + msg, nvtx_payload, kvikio::nvtx_manager::instance().get_default_color()) +#define KVIKIO_NVTX_SCOPED_RANGE_SELECTOR(_1, _2, _3, _4, NAME, ...) NAME +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL(...) \ + KVIKIO_NVTX_SCOPED_RANGE_SELECTOR(__VA_ARGS__, \ + KVIKIO_NVTX_SCOPED_RANGE_IMPL_4, \ + KVIKIO_NVTX_SCOPED_RANGE_IMPL_3, \ + KVIKIO_NVTX_SCOPED_RANGE_IMPL_2) \ + (__VA_ARGS__) + +#define KVIKIO_NVTX_MARKER_IMPL(msg, nvtx_payload) \ + nvtx3::mark_in(nvtx3::event_attributes{ \ + KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(nvtx_payload)}}) + +#endif + +#ifdef KVIKIO_CUDA_FOUND +using kvikio_nvtx_color = nvtx3::color; +using kvikio_nvtx_category = nvtx3::category; +#else +using kvikio_nvtx_color = int; +#endif + +class nvtx_manager { + public: + static nvtx_manager& instance(); + std::size_t get_correlation_id(); + kvikio_nvtx_color& get_next_color(); + kvikio_nvtx_color& get_default_color(); + nvtx_manager(nvtx_manager const&) = delete; + nvtx_manager& operator=(nvtx_manager const&) = delete; + nvtx_manager(nvtx_manager&&) = delete; + nvtx_manager& operator=(nvtx_manager&&) = delete; + + private: + nvtx_manager() = default; +}; + +/** + * @brief Convenience macro for generating an NVTX range in the `libkvikio` domain + * from the lifetime of a function. + * + * Takes no argument. The name of the immediately enclosing function returned by `__func__` is used + * as the message. + * + * Example: + * ``` + * void some_function(){ + * KVIKIO_NVTX_FUNC_RANGE(); // The name `some_function` is used as the message + * ... + * } + * ``` + */ +#ifdef KVIKIO_CUDA_FOUND +#define KVIKIO_NVTX_FUNC_RANGE() KVIKIO_NVTX_FUNC_RANGE_IMPL() +#else +#define KVIKIO_NVTX_FUNC_RANGE(...) \ + do { \ + } while (0) +#endif + +/** + * @brief Convenience macro for generating an NVTX scoped range in the `libkvikio` domain to + * annotate a time duration. + * + * Takes two arguments (message, nvtx_payload). + * + * Example: + * ``` + * void some_function(){ + * KVIKIO_NVTX_SCOPED_RANGE("my function", 42); + * ... + * } + * ``` + */ +#ifdef KVIKIO_CUDA_FOUND +#define KVIKIO_NVTX_SCOPED_RANGE(...) KVIKIO_NVTX_SCOPED_RANGE_IMPL(__VA_ARGS__) +#else +#define KVIKIO_NVTX_SCOPED_RANGE(msg, nvtx_payload, ...) \ + do { \ + } while (0) +#endif + +/** + * @brief Convenience macro for generating an NVTX marker in the `libkvikio` domain to annotate a + * certain time point. + * + * Takes two arguments (message, nvtx_payload). Use this macro to annotate asynchronous I/O + * operations, where the nvtx_payload refers to the I/O size. + * + * Example: + * ``` + * std::future some_function(){ + * size_t io_size{2077}; + * KVIKIO_NVTX_MARKER("I/O operation", io_size); + * perform_async_io_operation(io_size); + * ... + * } + * ``` + */ +#ifdef KVIKIO_CUDA_FOUND +#define KVIKIO_NVTX_MARKER(message, nvtx_payload) KVIKIO_NVTX_MARKER_IMPL(message, nvtx_payload) +#else +#define KVIKIO_NVTX_MARKER(message, nvtx_payload) \ + do { \ + } while (0) +#endif + +} // namespace kvikio diff --git a/cpp/include/kvikio/parallel_operation.hpp b/cpp/include/kvikio/parallel_operation.hpp index f345333c4f..debc51c1cd 100644 --- a/cpp/include/kvikio/parallel_operation.hpp +++ b/cpp/include/kvikio/parallel_operation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #include #include +#include #include namespace kvikio { @@ -32,10 +33,19 @@ namespace detail { template std::future submit_task( - F op, T buf, std::size_t size, std::size_t file_offset, std::size_t devPtr_offset) + F op, + T buf, + std::size_t size, + std::size_t file_offset, + std::size_t devPtr_offset, + kvikio_nvtx_color nvtx_color = nvtx_manager::instance().get_default_color(), + kvikio_nvtx_named_category nvtx_category = + kvikio_nvtx_named_category::get()) { - return defaults::thread_pool().submit_task( - [=] { return op(buf, size, file_offset, devPtr_offset); }); + return defaults::thread_pool().submit_task([=] { + KVIKIO_NVTX_SCOPED_RANGE("Task", size, nvtx_color, nvtx_category); + return op(buf, size, file_offset, devPtr_offset); + }); } } // namespace detail @@ -53,18 +63,23 @@ std::future submit_task( * @return A future to be used later to check if the operation has finished its execution. */ template -std::future parallel_io(F op, - T buf, - std::size_t size, - std::size_t file_offset, - std::size_t task_size, - std::size_t devPtr_offset) +std::future parallel_io( + F op, + T buf, + std::size_t size, + std::size_t file_offset, + std::size_t task_size, + std::size_t devPtr_offset, + kvikio_nvtx_color nvtx_color = nvtx_manager::instance().get_default_color(), + kvikio_nvtx_named_category nvtx_category = + kvikio_nvtx_named_category::get()) { if (task_size == 0) { throw std::invalid_argument("`task_size` cannot be zero"); } // Single-task guard if (task_size >= size || page_size >= size) { - return detail::submit_task(op, buf, size, file_offset, devPtr_offset); + return detail::submit_task( + op, buf, size, file_offset, devPtr_offset, nvtx_color, nvtx_category); } // We know an upper bound of the total number of tasks @@ -73,14 +88,18 @@ std::future parallel_io(F op, // 1) Submit `task_size` sized tasks while (size >= task_size) { - tasks.push_back(detail::submit_task(op, buf, task_size, file_offset, devPtr_offset)); + tasks.push_back(detail::submit_task( + op, buf, task_size, file_offset, devPtr_offset, nvtx_color, nvtx_category)); file_offset += task_size; devPtr_offset += task_size; size -= task_size; } // 2) Submit a task for the remainder - if (size > 0) { tasks.push_back(detail::submit_task(op, buf, size, file_offset, devPtr_offset)); } + if (size > 0) { + tasks.push_back( + detail::submit_task(op, buf, size, file_offset, devPtr_offset, nvtx_color, nvtx_category)); + } // Finally, we sum the result of all tasks. auto gather_tasks = [](std::vector>&& tasks) -> std::size_t { diff --git a/cpp/include/kvikio/posix_io.hpp b/cpp/include/kvikio/posix_io.hpp index 99964315b3..955262066e 100644 --- a/cpp/include/kvikio/posix_io.hpp +++ b/cpp/include/kvikio/posix_io.hpp @@ -23,6 +23,7 @@ #include #include +#include #include #include diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index b10e54c482..931dbfaef0 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -23,10 +23,6 @@ #include #include -#ifdef KVIKIO_CUDA_FOUND -#include -#endif - #include namespace kvikio { @@ -155,112 +151,4 @@ bool is_future_done(T const& future) return future.wait_for(std::chrono::seconds(0)) != std::future_status::timeout; } -#ifdef KVIKIO_CUDA_FOUND -/** - * @brief Tag type for libkvikio's NVTX domain. - */ -struct libkvikio_domain { - static constexpr char const* name{"libkvikio"}; -}; - -// Macro to concatenate two tokens x and y. -#define KVIKIO_CONCAT_HELPER(x, y) x##y -#define KVIKIO_CONCAT(x, y) KVIKIO_CONCAT_HELPER(x, y) - -// Macro to create a static, registered string that will not have a name conflict with any -// registered string defined in the same scope. -#define KVIKIO_REGISTER_STRING(msg) \ - [](const char* a_msg) -> auto& { \ - static nvtx3::registered_string_in a_reg_str{a_msg}; \ - return a_reg_str; \ - }(msg) - -// Macro overloads of KVIKIO_NVTX_FUNC_RANGE -#define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(libkvikio_domain) - -#define KVIKIO_NVTX_SCOPED_RANGE_IMPL(msg, val) \ - nvtx3::scoped_range_in KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ - { \ - nvtx3::event_attributes \ - { \ - KVIKIO_REGISTER_STRING(msg), nvtx3::payload { convert_to_64bit(val) } \ - } \ - } - -#define KVIKIO_NVTX_MARKER_IMPL(msg, val) \ - nvtx3::mark_in( \ - nvtx3::event_attributes{KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(val)}}) - -#endif - -/** - * @brief Convenience macro for generating an NVTX range in the `libkvikio` domain - * from the lifetime of a function. - * - * Takes no argument. The name of the immediately enclosing function returned by `__func__` is used - * as the message. - * - * Example: - * ``` - * void some_function(){ - * KVIKIO_NVTX_FUNC_RANGE(); // The name `some_function` is used as the message - * ... - * } - * ``` - */ -#ifdef KVIKIO_CUDA_FOUND -#define KVIKIO_NVTX_FUNC_RANGE() KVIKIO_NVTX_FUNC_RANGE_IMPL() -#else -#define KVIKIO_NVTX_FUNC_RANGE(...) \ - do { \ - } while (0) -#endif - -/** - * @brief Convenience macro for generating an NVTX scoped range in the `libkvikio` domain to - * annotate a time duration. - * - * Takes two arguments (message, payload). - * - * Example: - * ``` - * void some_function(){ - * KVIKIO_NVTX_SCOPED_RANGE("my function", 42); - * ... - * } - * ``` - */ -#ifdef KVIKIO_CUDA_FOUND -#define KVIKIO_NVTX_SCOPED_RANGE(msg, val) KVIKIO_NVTX_SCOPED_RANGE_IMPL(msg, val) -#else -#define KVIKIO_NVTX_SCOPED_RANGE(msg, val) \ - do { \ - } while (0) -#endif - -/** - * @brief Convenience macro for generating an NVTX marker in the `libkvikio` domain to annotate a - * certain time point. - * - * Takes two arguments (message, payload). Use this macro to annotate asynchronous I/O operations, - * where the payload refers to the I/O size. - * - * Example: - * ``` - * std::future some_function(){ - * size_t io_size{2077}; - * KVIKIO_NVTX_MARKER("I/O operation", io_size); - * perform_async_io_operation(io_size); - * ... - * } - * ``` - */ -#ifdef KVIKIO_CUDA_FOUND -#define KVIKIO_NVTX_MARKER(message, payload) KVIKIO_NVTX_MARKER_IMPL(message, payload) -#else -#define KVIKIO_NVTX_MARKER(message, payload) \ - do { \ - } while (0) -#endif - } // namespace kvikio diff --git a/cpp/src/file_handle.cpp b/cpp/src/file_handle.cpp index 0e65afb7fd..ef227c31ea 100644 --- a/cpp/src/file_handle.cpp +++ b/cpp/src/file_handle.cpp @@ -26,6 +26,7 @@ #include #include #include +#include namespace kvikio { @@ -139,13 +140,13 @@ std::size_t FileHandle::read(void* devPtr_base, std::size_t devPtr_offset, bool sync_default_stream) { + KVIKIO_NVTX_SCOPED_RANGE("FileHandle::read()", size); if (is_compat_mode_preferred()) { return detail::posix_device_read( _fd_direct_off.fd(), devPtr_base, size, file_offset, devPtr_offset); } if (sync_default_stream) { CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } - KVIKIO_NVTX_SCOPED_RANGE("cufileRead()", size); ssize_t ret = cuFileAPI::instance().Read(_cufile_handle.handle(), devPtr_base, size, @@ -161,6 +162,7 @@ std::size_t FileHandle::write(void const* devPtr_base, std::size_t devPtr_offset, bool sync_default_stream) { + KVIKIO_NVTX_SCOPED_RANGE("FileHandle::write()", size); _nbytes = 0; // Invalidate the computed file size if (is_compat_mode_preferred()) { @@ -169,7 +171,6 @@ std::size_t FileHandle::write(void const* devPtr_base, } if (sync_default_stream) { CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } - KVIKIO_NVTX_SCOPED_RANGE("cufileWrite()", size); ssize_t ret = cuFileAPI::instance().Write(_cufile_handle.handle(), devPtr_base, size, @@ -192,7 +193,8 @@ std::future FileHandle::pread(void* buf, std::size_t gds_threshold, bool sync_default_stream) { - KVIKIO_NVTX_MARKER("FileHandle::pread()", size); + auto& nvtx_color = nvtx_manager::instance().get_next_color(); + KVIKIO_NVTX_SCOPED_RANGE("FileHandle::pread()", size, nvtx_color); if (is_host_memory(buf)) { auto op = [this](void* hostPtr_base, std::size_t size, @@ -203,7 +205,14 @@ std::future FileHandle::pread(void* buf, _fd_direct_off.fd(), buf, size, file_offset); }; - return parallel_io(op, buf, size, file_offset, task_size, 0); + return parallel_io(op, + buf, + size, + file_offset, + task_size, + 0, + nvtx_color, + kvikio_nvtx_named_category::get()); } CUcontext ctx = get_context_from_pointer(buf); @@ -232,7 +241,14 @@ std::future FileHandle::pread(void* buf, return read(devPtr_base, size, file_offset, devPtr_offset, /* sync_default_stream = */ false); }; auto [devPtr_base, base_size, devPtr_offset] = get_alloc_info(buf, &ctx); - return parallel_io(task, devPtr_base, size, file_offset, task_size, devPtr_offset); + return parallel_io(task, + devPtr_base, + size, + file_offset, + task_size, + devPtr_offset, + nvtx_color, + kvikio_nvtx_named_category::get()); } std::future FileHandle::pwrite(void const* buf, @@ -242,7 +258,8 @@ std::future FileHandle::pwrite(void const* buf, std::size_t gds_threshold, bool sync_default_stream) { - KVIKIO_NVTX_MARKER("FileHandle::pwrite()", size); + auto& nvtx_color = nvtx_manager::instance().get_next_color(); + KVIKIO_NVTX_SCOPED_RANGE("FileHandle::pwrite()", size, nvtx_color); if (is_host_memory(buf)) { auto op = [this](void const* hostPtr_base, std::size_t size, @@ -253,7 +270,14 @@ std::future FileHandle::pwrite(void const* buf, _fd_direct_off.fd(), buf, size, file_offset); }; - return parallel_io(op, buf, size, file_offset, task_size, 0); + return parallel_io(op, + buf, + size, + file_offset, + task_size, + 0, + nvtx_color, + kvikio_nvtx_named_category::get()); } CUcontext ctx = get_context_from_pointer(buf); @@ -282,7 +306,14 @@ std::future FileHandle::pwrite(void const* buf, return write(devPtr_base, size, file_offset, devPtr_offset, /* sync_default_stream = */ false); }; auto [devPtr_base, base_size, devPtr_offset] = get_alloc_info(buf, &ctx); - return parallel_io(op, devPtr_base, size, file_offset, task_size, devPtr_offset); + return parallel_io(op, + devPtr_base, + size, + file_offset, + task_size, + devPtr_offset, + nvtx_color, + kvikio_nvtx_named_category::get()); } void FileHandle::read_async(void* devPtr_base, diff --git a/cpp/src/nvtx.cpp b/cpp/src/nvtx.cpp new file mode 100644 index 0000000000..c4f27f000e --- /dev/null +++ b/cpp/src/nvtx.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#ifdef KVIKIO_CUDA_FOUND +#include +#endif + +#include + +namespace kvikio { + +nvtx_manager& nvtx_manager::instance() +{ + static nvtx_manager _instance; + return _instance; +} + +#ifdef KVIKIO_CUDA_FOUND +kvikio_nvtx_color& nvtx_manager::get_next_color() +{ + static std::array color_palette = {nvtx3::rgb{106, 192, 67}, + nvtx3::rgb{191, 73, 203}, + nvtx3::rgb{93, 151, 76}, + nvtx3::rgb{96, 72, 194}, + nvtx3::rgb{179, 170, 71}, + nvtx3::rgb{92, 58, 113}, + nvtx3::rgb{212, 136, 57}, + nvtx3::rgb{96, 144, 194}, + nvtx3::rgb{211, 69, 56}, + nvtx3::rgb{97, 179, 155}, + nvtx3::rgb{203, 69, 131}, + nvtx3::rgb{57, 89, 48}, + nvtx3::rgb{184, 133, 199}, + nvtx3::rgb{128, 102, 51}, + nvtx3::rgb{211, 138, 130}, + nvtx3::rgb{122, 50, 49}}; + + static std::mutex mut; + std::lock_guard lock{mut}; + + static std::size_t idx = 0; + auto old_idx = idx; + ++idx; + if (idx >= color_palette.size()) { idx -= color_palette.size(); } + + return color_palette[old_idx]; +} + +kvikio_nvtx_color& nvtx_manager::get_default_color() +{ + static kvikio_nvtx_color default_color{nvtx3::argb{0, 255, 255, 255}}; + return default_color; +} +#else +kvikio_nvtx_color& nvtx_manager::get_next_color() +{ + static kvikio_nvtx_color dummy{}; + return dummy; +} + +kvikio_nvtx_color& nvtx_manager::get_default_color() +{ + static kvikio_nvtx_color dummy{}; + return dummy; +} +#endif + +} // namespace kvikio diff --git a/cpp/src/posix_io.cpp b/cpp/src/posix_io.cpp index 9576f284dc..ed149f5d43 100644 --- a/cpp/src/posix_io.cpp +++ b/cpp/src/posix_io.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 8ca04f94ed..016cb0bf9a 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp index bed2cbafbc..fdc174f791 100644 --- a/cpp/src/utils.cpp +++ b/cpp/src/utils.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -23,10 +22,6 @@ #include #include -#ifdef KVIKIO_CUDA_FOUND -#include -#endif - #include #include #include diff --git a/cpp/tests/test_basic_io.cpp b/cpp/tests/test_basic_io.cpp index e16bd99d83..f564072e90 100644 --- a/cpp/tests/test_basic_io.cpp +++ b/cpp/tests/test_basic_io.cpp @@ -27,7 +27,7 @@ class BasicIOTest : public testing::Test { TempDir tmp_dir{false}; _filepath = tmp_dir.path() / "test"; - _dev_a = std::move(DevBuffer::arange(100)); + _dev_a = std::move(DevBuffer::arange(1024 * 1024 * 100)); _dev_b = std::move(DevBuffer::zero_like(_dev_a)); } @@ -42,57 +42,59 @@ TEST_F(BasicIOTest, write_read) { { kvikio::FileHandle f(_filepath, "w"); - auto nbytes = f.write(_dev_a.ptr, _dev_a.nbytes, 0, 0); - EXPECT_EQ(nbytes, _dev_a.nbytes); + auto fut = f.pwrite(_dev_a.ptr, _dev_a.nbytes); + fut.wait(); + EXPECT_EQ(fut.get(), _dev_a.nbytes); } { kvikio::FileHandle f(_filepath, "r"); - auto nbytes = f.read(_dev_b.ptr, _dev_b.nbytes, 0, 0); - EXPECT_EQ(nbytes, _dev_b.nbytes); - expect_equal(_dev_a, _dev_b); + auto fut = f.pread(_dev_b.ptr, _dev_b.nbytes); + fut.wait(); + EXPECT_EQ(fut.get(), _dev_b.nbytes); + // expect_equal(_dev_a, _dev_b); } } -TEST_F(BasicIOTest, write_read_async) -{ - CUstream stream{}; - CUDA_DRIVER_TRY(kvikio::cudaAPI::instance().StreamCreate(&stream, CU_STREAM_NON_BLOCKING)); +// TEST_F(BasicIOTest, write_read_async) +// { +// CUstream stream{}; +// CUDA_DRIVER_TRY(kvikio::cudaAPI::instance().StreamCreate(&stream, CU_STREAM_NON_BLOCKING)); - // Default compatibility mode (AUTO) - { - kvikio::FileHandle f(_filepath, "w"); - auto stream_future = f.write_async(_dev_a.ptr, _dev_a.nbytes, 0, 0, stream); - auto nbytes = stream_future.check_bytes_done(); - EXPECT_EQ(nbytes, _dev_a.nbytes); - } +// // Default compatibility mode (AUTO) +// { +// kvikio::FileHandle f(_filepath, "w"); +// auto stream_future = f.write_async(_dev_a.ptr, _dev_a.nbytes, 0, 0, stream); +// auto nbytes = stream_future.check_bytes_done(); +// EXPECT_EQ(nbytes, _dev_a.nbytes); +// } - { - kvikio::FileHandle f(_filepath, "r"); - auto stream_future = f.read_async(_dev_b.ptr, _dev_b.nbytes, 0, 0, stream); - auto nbytes = stream_future.check_bytes_done(); - EXPECT_EQ(nbytes, _dev_b.nbytes); - expect_equal(_dev_a, _dev_b); - } +// { +// kvikio::FileHandle f(_filepath, "r"); +// auto stream_future = f.read_async(_dev_b.ptr, _dev_b.nbytes, 0, 0, stream); +// auto nbytes = stream_future.check_bytes_done(); +// EXPECT_EQ(nbytes, _dev_b.nbytes); +// expect_equal(_dev_a, _dev_b); +// } - // Explicitly set compatibility mode - std::array compat_modes{kvikio::CompatMode::AUTO, kvikio::CompatMode::ON}; - for (auto const& compat_mode : compat_modes) { - { - kvikio::FileHandle f(_filepath, "w", kvikio::FileHandle::m644, compat_mode); - auto stream_future = f.write_async(_dev_a.ptr, _dev_a.nbytes, 0, 0, stream); - auto nbytes = stream_future.check_bytes_done(); - EXPECT_EQ(nbytes, _dev_a.nbytes); - } +// // Explicitly set compatibility mode +// std::array compat_modes{kvikio::CompatMode::AUTO, +// kvikio::CompatMode::ON}; for (auto const& compat_mode : compat_modes) { +// { +// kvikio::FileHandle f(_filepath, "w", kvikio::FileHandle::m644, compat_mode); +// auto stream_future = f.write_async(_dev_a.ptr, _dev_a.nbytes, 0, 0, stream); +// auto nbytes = stream_future.check_bytes_done(); +// EXPECT_EQ(nbytes, _dev_a.nbytes); +// } - { - kvikio::FileHandle f(_filepath, "r", kvikio::FileHandle::m644, compat_mode); - auto stream_future = f.read_async(_dev_b.ptr, _dev_b.nbytes, 0, 0, stream); - auto nbytes = stream_future.check_bytes_done(); - EXPECT_EQ(nbytes, _dev_b.nbytes); - expect_equal(_dev_a, _dev_b); - } - } +// { +// kvikio::FileHandle f(_filepath, "r", kvikio::FileHandle::m644, compat_mode); +// auto stream_future = f.read_async(_dev_b.ptr, _dev_b.nbytes, 0, 0, stream); +// auto nbytes = stream_future.check_bytes_done(); +// EXPECT_EQ(nbytes, _dev_b.nbytes); +// expect_equal(_dev_a, _dev_b); +// } +// } - CUDA_DRIVER_TRY(kvikio::cudaAPI::instance().StreamDestroy(stream)); -} +// CUDA_DRIVER_TRY(kvikio::cudaAPI::instance().StreamDestroy(stream)); +// } From ec551c6be107c921b8b55df159ca617c10a0edf7 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 10 Feb 2025 01:52:01 -0500 Subject: [PATCH 2/9] Simplify impl. Enable color coding --- cpp/include/kvikio/nvtx.hpp | 94 +++++++++-------------- cpp/include/kvikio/parallel_operation.hpp | 30 +++++--- cpp/include/kvikio/utils.hpp | 6 ++ cpp/src/file_handle.cpp | 47 +++--------- cpp/src/nvtx.cpp | 50 +++++------- cpp/src/remote_handle.cpp | 3 +- 6 files changed, 94 insertions(+), 136 deletions(-) diff --git a/cpp/include/kvikio/nvtx.hpp b/cpp/include/kvikio/nvtx.hpp index 8b1e7b0c8d..9797303a69 100644 --- a/cpp/include/kvikio/nvtx.hpp +++ b/cpp/include/kvikio/nvtx.hpp @@ -15,6 +15,8 @@ */ #pragma once +#include + #ifdef KVIKIO_CUDA_FOUND #include #endif @@ -32,19 +34,8 @@ struct libkvikio_domain { static constexpr char const* name{"libkvikio"}; }; -struct libkvikio_category_default { - static constexpr char const* name{"default"}; - static constexpr uint32_t id{1}; -}; - -struct libkvikio_category_thread_pool_task { - static constexpr char const* name{"thread pool tasks"}; - static constexpr uint32_t id{2}; -}; - -using kvikio_nvtx_scoped_range = nvtx3::scoped_range_in; -using kvikio_nvtx_registered_string = nvtx3::registered_string_in; -using kvikio_nvtx_named_category = nvtx3::named_category_in; +using nvtx_scoped_range = nvtx3::scoped_range_in; +using nvtx_registered_string = nvtx3::registered_string_in; // Macro to concatenate two tokens x and y. #define KVIKIO_CONCAT_HELPER(x, y) x##y @@ -52,60 +43,49 @@ using kvikio_nvtx_named_category = nvtx3::named_category_in // Macro to create a static, registered string that will not have a name conflict with any // registered string defined in the same scope. -#define KVIKIO_REGISTER_STRING(msg) \ - [](const char* a_msg) -> auto& { \ - static kvikio_nvtx_registered_string a_reg_str{a_msg}; \ - return a_reg_str; \ +#define KVIKIO_REGISTER_STRING(msg) \ + [](const char* a_msg) -> auto& { \ + static nvtx_registered_string a_reg_str{a_msg}; \ + return a_reg_str; \ }(msg) // Macro overloads of KVIKIO_NVTX_FUNC_RANGE #define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(libkvikio_domain) -#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_4(msg, nvtx_payload, nvtx_color, nvtx_category) \ - kvikio::kvikio_nvtx_scoped_range KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ - { \ - nvtx3::event_attributes \ - { \ - KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(nvtx_payload)}, nvtx_color, \ - nvtx_category \ - } \ +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(msg, nvtx_payload_v, nvtx_color_v) \ + kvikio::nvtx_scoped_range KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ + { \ + nvtx3::event_attributes \ + { \ + KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(nvtx_payload_v)}, nvtx_color_v \ + } \ } -#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(msg, nvtx_payload, nvtx_color) \ - KVIKIO_NVTX_SCOPED_RANGE_IMPL_4( \ - msg, \ - nvtx_payload, \ - nvtx_color, \ - kvikio::kvikio_nvtx_named_category::get()) -#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_2(msg, nvtx_payload) \ - KVIKIO_NVTX_SCOPED_RANGE_IMPL_3( \ - msg, nvtx_payload, kvikio::nvtx_manager::instance().get_default_color()) -#define KVIKIO_NVTX_SCOPED_RANGE_SELECTOR(_1, _2, _3, _4, NAME, ...) NAME -#define KVIKIO_NVTX_SCOPED_RANGE_IMPL(...) \ - KVIKIO_NVTX_SCOPED_RANGE_SELECTOR(__VA_ARGS__, \ - KVIKIO_NVTX_SCOPED_RANGE_IMPL_4, \ - KVIKIO_NVTX_SCOPED_RANGE_IMPL_3, \ - KVIKIO_NVTX_SCOPED_RANGE_IMPL_2) \ +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_2(msg, nvtx_payload_v) \ + KVIKIO_NVTX_SCOPED_RANGE_IMPL_3( \ + msg, nvtx_payload_v, kvikio::nvtx_manager::instance().default_color()) +#define KVIKIO_NVTX_SCOPED_RANGE_SELECTOR(_1, _2, _3, NAME, ...) NAME +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL(...) \ + KVIKIO_NVTX_SCOPED_RANGE_SELECTOR( \ + __VA_ARGS__, KVIKIO_NVTX_SCOPED_RANGE_IMPL_3, KVIKIO_NVTX_SCOPED_RANGE_IMPL_2) \ (__VA_ARGS__) -#define KVIKIO_NVTX_MARKER_IMPL(msg, nvtx_payload) \ +#define KVIKIO_NVTX_MARKER_IMPL(msg, nvtx_payload_v) \ nvtx3::mark_in(nvtx3::event_attributes{ \ - KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(nvtx_payload)}}) + KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(nvtx_payload_v)}}) #endif #ifdef KVIKIO_CUDA_FOUND -using kvikio_nvtx_color = nvtx3::color; -using kvikio_nvtx_category = nvtx3::category; +using nvtx_color = nvtx3::color; #else -using kvikio_nvtx_color = int; +using nvtx_color = int; #endif class nvtx_manager { public: - static nvtx_manager& instance(); - std::size_t get_correlation_id(); - kvikio_nvtx_color& get_next_color(); - kvikio_nvtx_color& get_default_color(); + static nvtx_manager& instance() noexcept; + const nvtx_color& default_color() const noexcept; + const nvtx_color& get_color_by_index(std::uint64_t idx) const noexcept; nvtx_manager(nvtx_manager const&) = delete; nvtx_manager& operator=(nvtx_manager const&) = delete; nvtx_manager(nvtx_manager&&) = delete; @@ -142,7 +122,7 @@ class nvtx_manager { * @brief Convenience macro for generating an NVTX scoped range in the `libkvikio` domain to * annotate a time duration. * - * Takes two arguments (message, nvtx_payload). + * Takes two arguments (message, nvtx_payload_v). * * Example: * ``` @@ -155,8 +135,8 @@ class nvtx_manager { #ifdef KVIKIO_CUDA_FOUND #define KVIKIO_NVTX_SCOPED_RANGE(...) KVIKIO_NVTX_SCOPED_RANGE_IMPL(__VA_ARGS__) #else -#define KVIKIO_NVTX_SCOPED_RANGE(msg, nvtx_payload, ...) \ - do { \ +#define KVIKIO_NVTX_SCOPED_RANGE(msg, nvtx_payload_v, ...) \ + do { \ } while (0) #endif @@ -164,8 +144,8 @@ class nvtx_manager { * @brief Convenience macro for generating an NVTX marker in the `libkvikio` domain to annotate a * certain time point. * - * Takes two arguments (message, nvtx_payload). Use this macro to annotate asynchronous I/O - * operations, where the nvtx_payload refers to the I/O size. + * Takes two arguments (message, nvtx_payload_v). Use this macro to annotate asynchronous I/O + * operations, where the nvtx_payload_v refers to the I/O size. * * Example: * ``` @@ -178,10 +158,10 @@ class nvtx_manager { * ``` */ #ifdef KVIKIO_CUDA_FOUND -#define KVIKIO_NVTX_MARKER(message, nvtx_payload) KVIKIO_NVTX_MARKER_IMPL(message, nvtx_payload) +#define KVIKIO_NVTX_MARKER(message, nvtx_payload_v) KVIKIO_NVTX_MARKER_IMPL(message, nvtx_payload_v) #else -#define KVIKIO_NVTX_MARKER(message, nvtx_payload) \ - do { \ +#define KVIKIO_NVTX_MARKER(message, nvtx_payload_v) \ + do { \ } while (0) #endif diff --git a/cpp/include/kvikio/parallel_operation.hpp b/cpp/include/kvikio/parallel_operation.hpp index debc51c1cd..1e883fcbb5 100644 --- a/cpp/include/kvikio/parallel_operation.hpp +++ b/cpp/include/kvikio/parallel_operation.hpp @@ -31,6 +31,15 @@ namespace kvikio { namespace detail { +inline const std::pair get_next_color_and_call_idx() noexcept +{ + static std::atomic_uint64_t call_counter{0ull}; + auto call_idx = + 1ull + std::atomic_fetch_add_explicit(&call_counter, 1ull, std::memory_order_relaxed); + auto& nvtx_color = nvtx_manager::instance().get_color_by_index(call_idx); + return {nvtx_color, call_idx}; +} + template std::future submit_task( F op, @@ -38,12 +47,11 @@ std::future submit_task( std::size_t size, std::size_t file_offset, std::size_t devPtr_offset, - kvikio_nvtx_color nvtx_color = nvtx_manager::instance().get_default_color(), - kvikio_nvtx_named_category nvtx_category = - kvikio_nvtx_named_category::get()) + std::uint64_t nvtx_payload = 0ull, + nvtx_color nvtx_color = nvtx_manager::instance().default_color()) { return defaults::thread_pool().submit_task([=] { - KVIKIO_NVTX_SCOPED_RANGE("Task", size, nvtx_color, nvtx_category); + KVIKIO_NVTX_SCOPED_RANGE("task", nvtx_payload, nvtx_color); return op(buf, size, file_offset, devPtr_offset); }); } @@ -70,16 +78,14 @@ std::future parallel_io( std::size_t file_offset, std::size_t task_size, std::size_t devPtr_offset, - kvikio_nvtx_color nvtx_color = nvtx_manager::instance().get_default_color(), - kvikio_nvtx_named_category nvtx_category = - kvikio_nvtx_named_category::get()) + std::uint64_t call_idx = 0, + nvtx_color nvtx_color_v = nvtx_manager::instance().default_color()) { if (task_size == 0) { throw std::invalid_argument("`task_size` cannot be zero"); } // Single-task guard if (task_size >= size || page_size >= size) { - return detail::submit_task( - op, buf, size, file_offset, devPtr_offset, nvtx_color, nvtx_category); + return detail::submit_task(op, buf, size, file_offset, devPtr_offset, call_idx, nvtx_color_v); } // We know an upper bound of the total number of tasks @@ -88,8 +94,8 @@ std::future parallel_io( // 1) Submit `task_size` sized tasks while (size >= task_size) { - tasks.push_back(detail::submit_task( - op, buf, task_size, file_offset, devPtr_offset, nvtx_color, nvtx_category)); + tasks.push_back( + detail::submit_task(op, buf, task_size, file_offset, devPtr_offset, call_idx, nvtx_color_v)); file_offset += task_size; devPtr_offset += task_size; size -= task_size; @@ -98,7 +104,7 @@ std::future parallel_io( // 2) Submit a task for the remainder if (size > 0) { tasks.push_back( - detail::submit_task(op, buf, size, file_offset, devPtr_offset, nvtx_color, nvtx_category)); + detail::submit_task(op, buf, size, file_offset, devPtr_offset, call_idx, nvtx_color_v)); } // Finally, we sum the result of all tasks. diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 931dbfaef0..4b0c8aa75a 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -50,6 +50,12 @@ template >* = nullptr> return std::int64_t(value); } +/** + * @brief Helper function to allow NVTX payload of type std::uint64_t to pass through without doing + * anything. + */ +[[nodiscard]] inline std::uint64_t convert_to_64bit(std::uint64_t value) { return value; } + /** * @brief Help function to convert value to 64 bit float */ diff --git a/cpp/src/file_handle.cpp b/cpp/src/file_handle.cpp index ef227c31ea..d310c93037 100644 --- a/cpp/src/file_handle.cpp +++ b/cpp/src/file_handle.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -193,8 +194,8 @@ std::future FileHandle::pread(void* buf, std::size_t gds_threshold, bool sync_default_stream) { - auto& nvtx_color = nvtx_manager::instance().get_next_color(); - KVIKIO_NVTX_SCOPED_RANGE("FileHandle::pread()", size, nvtx_color); + auto& [nvtx_color_v, call_idx] = detail::get_next_color_and_call_idx(); + KVIKIO_NVTX_SCOPED_RANGE("FileHandle::pread()", size, nvtx_color_v); if (is_host_memory(buf)) { auto op = [this](void* hostPtr_base, std::size_t size, @@ -205,14 +206,7 @@ std::future FileHandle::pread(void* buf, _fd_direct_off.fd(), buf, size, file_offset); }; - return parallel_io(op, - buf, - size, - file_offset, - task_size, - 0, - nvtx_color, - kvikio_nvtx_named_category::get()); + return parallel_io(op, buf, size, file_offset, task_size, 0, call_idx, nvtx_color_v); } CUcontext ctx = get_context_from_pointer(buf); @@ -241,14 +235,8 @@ std::future FileHandle::pread(void* buf, return read(devPtr_base, size, file_offset, devPtr_offset, /* sync_default_stream = */ false); }; auto [devPtr_base, base_size, devPtr_offset] = get_alloc_info(buf, &ctx); - return parallel_io(task, - devPtr_base, - size, - file_offset, - task_size, - devPtr_offset, - nvtx_color, - kvikio_nvtx_named_category::get()); + return parallel_io( + task, devPtr_base, size, file_offset, task_size, devPtr_offset, call_idx, nvtx_color_v); } std::future FileHandle::pwrite(void const* buf, @@ -258,8 +246,8 @@ std::future FileHandle::pwrite(void const* buf, std::size_t gds_threshold, bool sync_default_stream) { - auto& nvtx_color = nvtx_manager::instance().get_next_color(); - KVIKIO_NVTX_SCOPED_RANGE("FileHandle::pwrite()", size, nvtx_color); + auto& [nvtx_color_v, call_idx] = detail::get_next_color_and_call_idx(); + KVIKIO_NVTX_SCOPED_RANGE("FileHandle::pwrite()", size, nvtx_color_v); if (is_host_memory(buf)) { auto op = [this](void const* hostPtr_base, std::size_t size, @@ -270,14 +258,7 @@ std::future FileHandle::pwrite(void const* buf, _fd_direct_off.fd(), buf, size, file_offset); }; - return parallel_io(op, - buf, - size, - file_offset, - task_size, - 0, - nvtx_color, - kvikio_nvtx_named_category::get()); + return parallel_io(op, buf, size, file_offset, task_size, 0, call_idx, nvtx_color_v); } CUcontext ctx = get_context_from_pointer(buf); @@ -306,14 +287,8 @@ std::future FileHandle::pwrite(void const* buf, return write(devPtr_base, size, file_offset, devPtr_offset, /* sync_default_stream = */ false); }; auto [devPtr_base, base_size, devPtr_offset] = get_alloc_info(buf, &ctx); - return parallel_io(op, - devPtr_base, - size, - file_offset, - task_size, - devPtr_offset, - nvtx_color, - kvikio_nvtx_named_category::get()); + return parallel_io( + op, devPtr_base, size, file_offset, task_size, devPtr_offset, call_idx, nvtx_color_v); } void FileHandle::read_async(void* devPtr_base, diff --git a/cpp/src/nvtx.cpp b/cpp/src/nvtx.cpp index c4f27f000e..4bc6091884 100644 --- a/cpp/src/nvtx.cpp +++ b/cpp/src/nvtx.cpp @@ -25,16 +25,29 @@ namespace kvikio { -nvtx_manager& nvtx_manager::instance() +nvtx_manager& nvtx_manager::instance() noexcept { static nvtx_manager _instance; return _instance; } +const nvtx_color& nvtx_manager::default_color() const noexcept +{ #ifdef KVIKIO_CUDA_FOUND -kvikio_nvtx_color& nvtx_manager::get_next_color() + static nvtx_color default_color{nvtx3::argb{0, 255, 255, 255}}; + return default_color; +#else + static nvtx_color dummy{}; + return dummy; +#endif +} + +const nvtx_color& nvtx_manager::get_color_by_index(std::uint64_t idx) const noexcept { - static std::array color_palette = {nvtx3::rgb{106, 192, 67}, +#ifdef KVIKIO_CUDA_FOUND + constexpr std::size_t num_color{16}; + static_assert((num_color & (num_color - 1)) == 0); // Is power of 2 + static std::array color_palette = {nvtx3::rgb{106, 192, 67}, nvtx3::rgb{191, 73, 203}, nvtx3::rgb{93, 151, 76}, nvtx3::rgb{96, 72, 194}, @@ -50,35 +63,12 @@ kvikio_nvtx_color& nvtx_manager::get_next_color() nvtx3::rgb{128, 102, 51}, nvtx3::rgb{211, 138, 130}, nvtx3::rgb{122, 50, 49}}; - - static std::mutex mut; - std::lock_guard lock{mut}; - - static std::size_t idx = 0; - auto old_idx = idx; - ++idx; - if (idx >= color_palette.size()) { idx -= color_palette.size(); } - - return color_palette[old_idx]; -} - -kvikio_nvtx_color& nvtx_manager::get_default_color() -{ - static kvikio_nvtx_color default_color{nvtx3::argb{0, 255, 255, 255}}; - return default_color; -} + auto safe_idx = idx & (num_color - 1); // idx % num_color + return color_palette[safe_idx]; #else -kvikio_nvtx_color& nvtx_manager::get_next_color() -{ - static kvikio_nvtx_color dummy{}; + static nvtx_color dummy{}; return dummy; -} - -kvikio_nvtx_color& nvtx_manager::get_default_color() -{ - static kvikio_nvtx_color dummy{}; - return dummy; -} #endif +} } // namespace kvikio diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 016cb0bf9a..c6ad0f4d28 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -393,6 +393,7 @@ std::future RemoteHandle::pread(void* buf, std::size_t file_offset, std::size_t task_size) { + auto& [nvtx_color_v, call_idx] = detail::get_next_color_and_call_idx(); KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle::pread()", size); auto task = [this](void* devPtr_base, std::size_t size, @@ -400,7 +401,7 @@ std::future RemoteHandle::pread(void* buf, std::size_t devPtr_offset) -> std::size_t { return read(static_cast(devPtr_base) + devPtr_offset, size, file_offset); }; - return parallel_io(task, buf, size, file_offset, task_size, 0); + return parallel_io(task, buf, size, file_offset, task_size, 0, call_idx, nvtx_color_v); } } // namespace kvikio From cc7ca567d20fe7d57c93b71a40a8b89e4537712c Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 10 Feb 2025 01:56:30 -0500 Subject: [PATCH 3/9] Revert accidental changes to the test --- cpp/tests/test_basic_io.cpp | 88 ++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 45 deletions(-) diff --git a/cpp/tests/test_basic_io.cpp b/cpp/tests/test_basic_io.cpp index f564072e90..e16bd99d83 100644 --- a/cpp/tests/test_basic_io.cpp +++ b/cpp/tests/test_basic_io.cpp @@ -27,7 +27,7 @@ class BasicIOTest : public testing::Test { TempDir tmp_dir{false}; _filepath = tmp_dir.path() / "test"; - _dev_a = std::move(DevBuffer::arange(1024 * 1024 * 100)); + _dev_a = std::move(DevBuffer::arange(100)); _dev_b = std::move(DevBuffer::zero_like(_dev_a)); } @@ -42,59 +42,57 @@ TEST_F(BasicIOTest, write_read) { { kvikio::FileHandle f(_filepath, "w"); - auto fut = f.pwrite(_dev_a.ptr, _dev_a.nbytes); - fut.wait(); - EXPECT_EQ(fut.get(), _dev_a.nbytes); + auto nbytes = f.write(_dev_a.ptr, _dev_a.nbytes, 0, 0); + EXPECT_EQ(nbytes, _dev_a.nbytes); } { kvikio::FileHandle f(_filepath, "r"); - auto fut = f.pread(_dev_b.ptr, _dev_b.nbytes); - fut.wait(); - EXPECT_EQ(fut.get(), _dev_b.nbytes); - // expect_equal(_dev_a, _dev_b); + auto nbytes = f.read(_dev_b.ptr, _dev_b.nbytes, 0, 0); + EXPECT_EQ(nbytes, _dev_b.nbytes); + expect_equal(_dev_a, _dev_b); } } -// TEST_F(BasicIOTest, write_read_async) -// { -// CUstream stream{}; -// CUDA_DRIVER_TRY(kvikio::cudaAPI::instance().StreamCreate(&stream, CU_STREAM_NON_BLOCKING)); +TEST_F(BasicIOTest, write_read_async) +{ + CUstream stream{}; + CUDA_DRIVER_TRY(kvikio::cudaAPI::instance().StreamCreate(&stream, CU_STREAM_NON_BLOCKING)); -// // Default compatibility mode (AUTO) -// { -// kvikio::FileHandle f(_filepath, "w"); -// auto stream_future = f.write_async(_dev_a.ptr, _dev_a.nbytes, 0, 0, stream); -// auto nbytes = stream_future.check_bytes_done(); -// EXPECT_EQ(nbytes, _dev_a.nbytes); -// } + // Default compatibility mode (AUTO) + { + kvikio::FileHandle f(_filepath, "w"); + auto stream_future = f.write_async(_dev_a.ptr, _dev_a.nbytes, 0, 0, stream); + auto nbytes = stream_future.check_bytes_done(); + EXPECT_EQ(nbytes, _dev_a.nbytes); + } -// { -// kvikio::FileHandle f(_filepath, "r"); -// auto stream_future = f.read_async(_dev_b.ptr, _dev_b.nbytes, 0, 0, stream); -// auto nbytes = stream_future.check_bytes_done(); -// EXPECT_EQ(nbytes, _dev_b.nbytes); -// expect_equal(_dev_a, _dev_b); -// } + { + kvikio::FileHandle f(_filepath, "r"); + auto stream_future = f.read_async(_dev_b.ptr, _dev_b.nbytes, 0, 0, stream); + auto nbytes = stream_future.check_bytes_done(); + EXPECT_EQ(nbytes, _dev_b.nbytes); + expect_equal(_dev_a, _dev_b); + } -// // Explicitly set compatibility mode -// std::array compat_modes{kvikio::CompatMode::AUTO, -// kvikio::CompatMode::ON}; for (auto const& compat_mode : compat_modes) { -// { -// kvikio::FileHandle f(_filepath, "w", kvikio::FileHandle::m644, compat_mode); -// auto stream_future = f.write_async(_dev_a.ptr, _dev_a.nbytes, 0, 0, stream); -// auto nbytes = stream_future.check_bytes_done(); -// EXPECT_EQ(nbytes, _dev_a.nbytes); -// } + // Explicitly set compatibility mode + std::array compat_modes{kvikio::CompatMode::AUTO, kvikio::CompatMode::ON}; + for (auto const& compat_mode : compat_modes) { + { + kvikio::FileHandle f(_filepath, "w", kvikio::FileHandle::m644, compat_mode); + auto stream_future = f.write_async(_dev_a.ptr, _dev_a.nbytes, 0, 0, stream); + auto nbytes = stream_future.check_bytes_done(); + EXPECT_EQ(nbytes, _dev_a.nbytes); + } -// { -// kvikio::FileHandle f(_filepath, "r", kvikio::FileHandle::m644, compat_mode); -// auto stream_future = f.read_async(_dev_b.ptr, _dev_b.nbytes, 0, 0, stream); -// auto nbytes = stream_future.check_bytes_done(); -// EXPECT_EQ(nbytes, _dev_b.nbytes); -// expect_equal(_dev_a, _dev_b); -// } -// } + { + kvikio::FileHandle f(_filepath, "r", kvikio::FileHandle::m644, compat_mode); + auto stream_future = f.read_async(_dev_b.ptr, _dev_b.nbytes, 0, 0, stream); + auto nbytes = stream_future.check_bytes_done(); + EXPECT_EQ(nbytes, _dev_b.nbytes); + expect_equal(_dev_a, _dev_b); + } + } -// CUDA_DRIVER_TRY(kvikio::cudaAPI::instance().StreamDestroy(stream)); -// } + CUDA_DRIVER_TRY(kvikio::cudaAPI::instance().StreamDestroy(stream)); +} From 5ec1a40c9c4590de321e11b78bdd99b49e5f046b Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 10 Feb 2025 10:22:24 -0500 Subject: [PATCH 4/9] Add thread renaming. Add comments --- cpp/include/kvikio/nvtx.hpp | 32 ++++++++++++++++--- cpp/include/kvikio/parallel_operation.hpp | 38 ++++++++++++----------- cpp/src/nvtx.cpp | 26 ++++++++++++++-- 3 files changed, 71 insertions(+), 25 deletions(-) diff --git a/cpp/include/kvikio/nvtx.hpp b/cpp/include/kvikio/nvtx.hpp index 9797303a69..6bc9770357 100644 --- a/cpp/include/kvikio/nvtx.hpp +++ b/cpp/include/kvikio/nvtx.hpp @@ -61,8 +61,7 @@ using nvtx_registered_string = nvtx3::registered_string_in; } \ } #define KVIKIO_NVTX_SCOPED_RANGE_IMPL_2(msg, nvtx_payload_v) \ - KVIKIO_NVTX_SCOPED_RANGE_IMPL_3( \ - msg, nvtx_payload_v, kvikio::nvtx_manager::instance().default_color()) + KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(msg, nvtx_payload_v, kvikio::nvtx_manager::default_color()) #define KVIKIO_NVTX_SCOPED_RANGE_SELECTOR(_1, _2, _3, NAME, ...) NAME #define KVIKIO_NVTX_SCOPED_RANGE_IMPL(...) \ KVIKIO_NVTX_SCOPED_RANGE_SELECTOR( \ @@ -81,11 +80,36 @@ using nvtx_color = nvtx3::color; using nvtx_color = int; #endif +/** + * @brief Utility singleton class for NVTX annotation. + */ class nvtx_manager { public: static nvtx_manager& instance() noexcept; - const nvtx_color& default_color() const noexcept; - const nvtx_color& get_color_by_index(std::uint64_t idx) const noexcept; + + /** + * @brief Return the default color. + * + * @return Default color. + */ + static const nvtx_color& default_color() noexcept; + + /** + * @brief Return the color at the given index from the internal color palette whose size n is a + * power of 2. The index may exceed the size of the color palette, in which case it wraps around, + * i.e. (idx mod n). + * + * @param idx The index value. + * @return The color picked from the internal color palette. + */ + static const nvtx_color& get_color_by_index(std::uint64_t idx) noexcept; + + /** + * @brief Rename the current thread. + * + */ + static void rename_current_thread(std::string_view new_name) noexcept; + nvtx_manager(nvtx_manager const&) = delete; nvtx_manager& operator=(nvtx_manager const&) = delete; nvtx_manager(nvtx_manager&&) = delete; diff --git a/cpp/include/kvikio/parallel_operation.hpp b/cpp/include/kvikio/parallel_operation.hpp index 1e883fcbb5..699807bc1f 100644 --- a/cpp/include/kvikio/parallel_operation.hpp +++ b/cpp/include/kvikio/parallel_operation.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -36,22 +37,24 @@ inline const std::pair get_next_color_and_call static std::atomic_uint64_t call_counter{0ull}; auto call_idx = 1ull + std::atomic_fetch_add_explicit(&call_counter, 1ull, std::memory_order_relaxed); - auto& nvtx_color = nvtx_manager::instance().get_color_by_index(call_idx); + auto& nvtx_color = nvtx_manager::get_color_by_index(call_idx); return {nvtx_color, call_idx}; } template -std::future submit_task( - F op, - T buf, - std::size_t size, - std::size_t file_offset, - std::size_t devPtr_offset, - std::uint64_t nvtx_payload = 0ull, - nvtx_color nvtx_color = nvtx_manager::instance().default_color()) +std::future submit_task(F op, + T buf, + std::size_t size, + std::size_t file_offset, + std::size_t devPtr_offset, + std::uint64_t nvtx_payload = 0ull, + nvtx_color nvtx_color = nvtx_manager::default_color()) { return defaults::thread_pool().submit_task([=] { KVIKIO_NVTX_SCOPED_RANGE("task", nvtx_payload, nvtx_color); + thread_local std::once_flag call_once_per_thread; + std::call_once(call_once_per_thread, + [] { nvtx_manager::rename_current_thread("thread pool"); }); return op(buf, size, file_offset, devPtr_offset); }); } @@ -71,15 +74,14 @@ std::future submit_task( * @return A future to be used later to check if the operation has finished its execution. */ template -std::future parallel_io( - F op, - T buf, - std::size_t size, - std::size_t file_offset, - std::size_t task_size, - std::size_t devPtr_offset, - std::uint64_t call_idx = 0, - nvtx_color nvtx_color_v = nvtx_manager::instance().default_color()) +std::future parallel_io(F op, + T buf, + std::size_t size, + std::size_t file_offset, + std::size_t task_size, + std::size_t devPtr_offset, + std::uint64_t call_idx = 0, + nvtx_color nvtx_color_v = nvtx_manager::default_color()) { if (task_size == 0) { throw std::invalid_argument("`task_size` cannot be zero"); } diff --git a/cpp/src/nvtx.cpp b/cpp/src/nvtx.cpp index 4bc6091884..0b26f6c7ac 100644 --- a/cpp/src/nvtx.cpp +++ b/cpp/src/nvtx.cpp @@ -14,8 +14,9 @@ * limitations under the License. */ +#include #include -#include +#include #ifdef KVIKIO_CUDA_FOUND #include @@ -31,7 +32,7 @@ nvtx_manager& nvtx_manager::instance() noexcept return _instance; } -const nvtx_color& nvtx_manager::default_color() const noexcept +const nvtx_color& nvtx_manager::default_color() noexcept { #ifdef KVIKIO_CUDA_FOUND static nvtx_color default_color{nvtx3::argb{0, 255, 255, 255}}; @@ -42,7 +43,7 @@ const nvtx_color& nvtx_manager::default_color() const noexcept #endif } -const nvtx_color& nvtx_manager::get_color_by_index(std::uint64_t idx) const noexcept +const nvtx_color& nvtx_manager::get_color_by_index(std::uint64_t idx) noexcept { #ifdef KVIKIO_CUDA_FOUND constexpr std::size_t num_color{16}; @@ -71,4 +72,23 @@ const nvtx_color& nvtx_manager::get_color_by_index(std::uint64_t idx) const noex #endif } +void nvtx_manager::rename_current_thread(std::string_view new_name) noexcept +{ +#ifdef KVIKIO_CUDA_FOUND + auto tid = syscall(SYS_gettid); + std::stringstream ss; + ss << new_name << " (" << tid << ")"; + + nvtxResourceAttributes_t attribs = {0}; + attribs.version = NVTX_VERSION; + attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE; + attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE; + attribs.identifier.ullValue = tid; + attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; + attribs.message.ascii = ss.str().c_str(); + nvtxResourceHandle_t handle = + nvtxDomainResourceCreate(nvtx3::domain::get(), &attribs); +#endif +} + } // namespace kvikio From 6ceab55d7d2f07b03310054c1e8cd85cde16ae0b Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 10 Feb 2025 11:50:09 -0500 Subject: [PATCH 5/9] Add more comment --- cpp/include/kvikio/nvtx.hpp | 4 +++- cpp/include/kvikio/parallel_operation.hpp | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/cpp/include/kvikio/nvtx.hpp b/cpp/include/kvikio/nvtx.hpp index 6bc9770357..64a5392380 100644 --- a/cpp/include/kvikio/nvtx.hpp +++ b/cpp/include/kvikio/nvtx.hpp @@ -105,8 +105,10 @@ class nvtx_manager { static const nvtx_color& get_color_by_index(std::uint64_t idx) noexcept; /** - * @brief Rename the current thread. + * @brief Rename the current thread under the KvikIO NVTX domain. * + * @note This NVTX feature is currently not supported by the Nsight System profiler. As a result, + * the OS thread will not be renamed in the nsys-ui. */ static void rename_current_thread(std::string_view new_name) noexcept; diff --git a/cpp/include/kvikio/parallel_operation.hpp b/cpp/include/kvikio/parallel_operation.hpp index 699807bc1f..512a8890f9 100644 --- a/cpp/include/kvikio/parallel_operation.hpp +++ b/cpp/include/kvikio/parallel_operation.hpp @@ -32,6 +32,16 @@ namespace kvikio { namespace detail { +/** + * @brief Determine the NVTX color and call index. They are used to identify tasks from different + * pread/pwrite calls. Tasks from the same pread/pwrite call are given the same color and call + * index. The call index is atomically incremented on each pread/pwrite call, and will wrap around + * once it reaches the maximum value the integer type `std::uint64_t` can hold (this overflow + * behavior is well-defined in C++). The color is picked from an internal color palette according to + * the call index value. + * + * @return A pair of NVTX color and call index. + */ inline const std::pair get_next_color_and_call_idx() noexcept { static std::atomic_uint64_t call_counter{0ull}; @@ -52,9 +62,13 @@ std::future submit_task(F op, { return defaults::thread_pool().submit_task([=] { KVIKIO_NVTX_SCOPED_RANGE("task", nvtx_payload, nvtx_color); + + // Rename the worker thread in the thread pool to improve clarity from nsys-ui. + // Note: This NVTX feature is currently not supported by nsys-ui. thread_local std::once_flag call_once_per_thread; std::call_once(call_once_per_thread, [] { nvtx_manager::rename_current_thread("thread pool"); }); + return op(buf, size, file_offset, devPtr_offset); }); } From c1b938bfe921b7bd771b86c0b027be25ff607156 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 10 Feb 2025 12:08:10 -0500 Subject: [PATCH 6/9] Rename vars --- cpp/include/kvikio/nvtx.hpp | 52 ++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/cpp/include/kvikio/nvtx.hpp b/cpp/include/kvikio/nvtx.hpp index 64a5392380..c6e9a08a73 100644 --- a/cpp/include/kvikio/nvtx.hpp +++ b/cpp/include/kvikio/nvtx.hpp @@ -43,34 +43,34 @@ using nvtx_registered_string = nvtx3::registered_string_in; // Macro to create a static, registered string that will not have a name conflict with any // registered string defined in the same scope. -#define KVIKIO_REGISTER_STRING(msg) \ - [](const char* a_msg) -> auto& { \ - static nvtx_registered_string a_reg_str{a_msg}; \ - return a_reg_str; \ +#define KVIKIO_REGISTER_STRING(msg) \ + [](const char* a_msg) -> auto& { \ + static kvikio::nvtx_registered_string a_reg_str{a_msg}; \ + return a_reg_str; \ }(msg) // Macro overloads of KVIKIO_NVTX_FUNC_RANGE -#define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(libkvikio_domain) - -#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(msg, nvtx_payload_v, nvtx_color_v) \ - kvikio::nvtx_scoped_range KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ - { \ - nvtx3::event_attributes \ - { \ - KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(nvtx_payload_v)}, nvtx_color_v \ - } \ +#define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(kvikio::libkvikio_domain) + +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(msg, payload_v, color) \ + kvikio::nvtx_scoped_range KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ + { \ + nvtx3::event_attributes \ + { \ + KVIKIO_REGISTER_STRING(msg), nvtx3::payload{kvikio::convert_to_64bit(payload_v)}, color \ + } \ } -#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_2(msg, nvtx_payload_v) \ - KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(msg, nvtx_payload_v, kvikio::nvtx_manager::default_color()) +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_2(msg, payload) \ + KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(msg, payload, kvikio::nvtx_manager::default_color()) #define KVIKIO_NVTX_SCOPED_RANGE_SELECTOR(_1, _2, _3, NAME, ...) NAME #define KVIKIO_NVTX_SCOPED_RANGE_IMPL(...) \ KVIKIO_NVTX_SCOPED_RANGE_SELECTOR( \ __VA_ARGS__, KVIKIO_NVTX_SCOPED_RANGE_IMPL_3, KVIKIO_NVTX_SCOPED_RANGE_IMPL_2) \ (__VA_ARGS__) -#define KVIKIO_NVTX_MARKER_IMPL(msg, nvtx_payload_v) \ - nvtx3::mark_in(nvtx3::event_attributes{ \ - KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(nvtx_payload_v)}}) +#define KVIKIO_NVTX_MARKER_IMPL(msg, payload_v) \ + nvtx3::mark_in(nvtx3::event_attributes{ \ + KVIKIO_REGISTER_STRING(msg), nvtx3::payload{kvikio::convert_to_64bit(payload_v)}}) #endif @@ -148,7 +148,7 @@ class nvtx_manager { * @brief Convenience macro for generating an NVTX scoped range in the `libkvikio` domain to * annotate a time duration. * - * Takes two arguments (message, nvtx_payload_v). + * Takes two arguments (message, payload). * * Example: * ``` @@ -161,8 +161,8 @@ class nvtx_manager { #ifdef KVIKIO_CUDA_FOUND #define KVIKIO_NVTX_SCOPED_RANGE(...) KVIKIO_NVTX_SCOPED_RANGE_IMPL(__VA_ARGS__) #else -#define KVIKIO_NVTX_SCOPED_RANGE(msg, nvtx_payload_v, ...) \ - do { \ +#define KVIKIO_NVTX_SCOPED_RANGE(msg, payload, ...) \ + do { \ } while (0) #endif @@ -170,8 +170,8 @@ class nvtx_manager { * @brief Convenience macro for generating an NVTX marker in the `libkvikio` domain to annotate a * certain time point. * - * Takes two arguments (message, nvtx_payload_v). Use this macro to annotate asynchronous I/O - * operations, where the nvtx_payload_v refers to the I/O size. + * Takes two arguments (message, payload). Use this macro to annotate asynchronous I/O + * operations, where the payload refers to the I/O size. * * Example: * ``` @@ -184,10 +184,10 @@ class nvtx_manager { * ``` */ #ifdef KVIKIO_CUDA_FOUND -#define KVIKIO_NVTX_MARKER(message, nvtx_payload_v) KVIKIO_NVTX_MARKER_IMPL(message, nvtx_payload_v) +#define KVIKIO_NVTX_MARKER(message, payload) KVIKIO_NVTX_MARKER_IMPL(message, payload) #else -#define KVIKIO_NVTX_MARKER(message, nvtx_payload_v) \ - do { \ +#define KVIKIO_NVTX_MARKER(message, payload) \ + do { \ } while (0) #endif From 6196835a18d6e4525387e721d7ea137f2fd91dcf Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 10 Feb 2025 14:43:51 -0500 Subject: [PATCH 7/9] Rename vars and types --- cpp/include/kvikio/nvtx.hpp | 22 ++++++------ cpp/include/kvikio/parallel_operation.hpp | 14 ++++---- cpp/src/file_handle.cpp | 16 ++++----- cpp/src/nvtx.cpp | 42 +++++++++++------------ cpp/src/remote_handle.cpp | 4 +-- 5 files changed, 49 insertions(+), 49 deletions(-) diff --git a/cpp/include/kvikio/nvtx.hpp b/cpp/include/kvikio/nvtx.hpp index c6e9a08a73..ef0ff7de84 100644 --- a/cpp/include/kvikio/nvtx.hpp +++ b/cpp/include/kvikio/nvtx.hpp @@ -34,8 +34,8 @@ struct libkvikio_domain { static constexpr char const* name{"libkvikio"}; }; -using nvtx_scoped_range = nvtx3::scoped_range_in; -using nvtx_registered_string = nvtx3::registered_string_in; +using nvtx_scoped_range_type = nvtx3::scoped_range_in; +using nvtx_registered_string_type = nvtx3::registered_string_in; // Macro to concatenate two tokens x and y. #define KVIKIO_CONCAT_HELPER(x, y) x##y @@ -43,17 +43,17 @@ using nvtx_registered_string = nvtx3::registered_string_in; // Macro to create a static, registered string that will not have a name conflict with any // registered string defined in the same scope. -#define KVIKIO_REGISTER_STRING(msg) \ - [](const char* a_msg) -> auto& { \ - static kvikio::nvtx_registered_string a_reg_str{a_msg}; \ - return a_reg_str; \ +#define KVIKIO_REGISTER_STRING(msg) \ + [](const char* a_msg) -> auto& { \ + static kvikio::nvtx_registered_string_type a_reg_str{a_msg}; \ + return a_reg_str; \ }(msg) // Macro overloads of KVIKIO_NVTX_FUNC_RANGE #define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(kvikio::libkvikio_domain) #define KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(msg, payload_v, color) \ - kvikio::nvtx_scoped_range KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ + kvikio::nvtx_scoped_range_type KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ { \ nvtx3::event_attributes \ { \ @@ -75,9 +75,9 @@ using nvtx_registered_string = nvtx3::registered_string_in; #endif #ifdef KVIKIO_CUDA_FOUND -using nvtx_color = nvtx3::color; +using nvtx_color_type = nvtx3::color; #else -using nvtx_color = int; +using nvtx_color_type = int; #endif /** @@ -92,7 +92,7 @@ class nvtx_manager { * * @return Default color. */ - static const nvtx_color& default_color() noexcept; + static const nvtx_color_type& default_color() noexcept; /** * @brief Return the color at the given index from the internal color palette whose size n is a @@ -102,7 +102,7 @@ class nvtx_manager { * @param idx The index value. * @return The color picked from the internal color palette. */ - static const nvtx_color& get_color_by_index(std::uint64_t idx) noexcept; + static const nvtx_color_type& get_color_by_index(std::uint64_t idx) noexcept; /** * @brief Rename the current thread under the KvikIO NVTX domain. diff --git a/cpp/include/kvikio/parallel_operation.hpp b/cpp/include/kvikio/parallel_operation.hpp index 512a8890f9..2c59d8ad5b 100644 --- a/cpp/include/kvikio/parallel_operation.hpp +++ b/cpp/include/kvikio/parallel_operation.hpp @@ -42,7 +42,7 @@ namespace detail { * * @return A pair of NVTX color and call index. */ -inline const std::pair get_next_color_and_call_idx() noexcept +inline const std::pair get_next_color_and_call_idx() noexcept { static std::atomic_uint64_t call_counter{0ull}; auto call_idx = @@ -58,7 +58,7 @@ std::future submit_task(F op, std::size_t file_offset, std::size_t devPtr_offset, std::uint64_t nvtx_payload = 0ull, - nvtx_color nvtx_color = nvtx_manager::default_color()) + nvtx_color_type nvtx_color = nvtx_manager::default_color()) { return defaults::thread_pool().submit_task([=] { KVIKIO_NVTX_SCOPED_RANGE("task", nvtx_payload, nvtx_color); @@ -94,14 +94,14 @@ std::future parallel_io(F op, std::size_t file_offset, std::size_t task_size, std::size_t devPtr_offset, - std::uint64_t call_idx = 0, - nvtx_color nvtx_color_v = nvtx_manager::default_color()) + std::uint64_t call_idx = 0, + nvtx_color_type nvtx_color = nvtx_manager::default_color()) { if (task_size == 0) { throw std::invalid_argument("`task_size` cannot be zero"); } // Single-task guard if (task_size >= size || page_size >= size) { - return detail::submit_task(op, buf, size, file_offset, devPtr_offset, call_idx, nvtx_color_v); + return detail::submit_task(op, buf, size, file_offset, devPtr_offset, call_idx, nvtx_color); } // We know an upper bound of the total number of tasks @@ -111,7 +111,7 @@ std::future parallel_io(F op, // 1) Submit `task_size` sized tasks while (size >= task_size) { tasks.push_back( - detail::submit_task(op, buf, task_size, file_offset, devPtr_offset, call_idx, nvtx_color_v)); + detail::submit_task(op, buf, task_size, file_offset, devPtr_offset, call_idx, nvtx_color)); file_offset += task_size; devPtr_offset += task_size; size -= task_size; @@ -120,7 +120,7 @@ std::future parallel_io(F op, // 2) Submit a task for the remainder if (size > 0) { tasks.push_back( - detail::submit_task(op, buf, size, file_offset, devPtr_offset, call_idx, nvtx_color_v)); + detail::submit_task(op, buf, size, file_offset, devPtr_offset, call_idx, nvtx_color)); } // Finally, we sum the result of all tasks. diff --git a/cpp/src/file_handle.cpp b/cpp/src/file_handle.cpp index d310c93037..2b88fb2784 100644 --- a/cpp/src/file_handle.cpp +++ b/cpp/src/file_handle.cpp @@ -194,8 +194,8 @@ std::future FileHandle::pread(void* buf, std::size_t gds_threshold, bool sync_default_stream) { - auto& [nvtx_color_v, call_idx] = detail::get_next_color_and_call_idx(); - KVIKIO_NVTX_SCOPED_RANGE("FileHandle::pread()", size, nvtx_color_v); + auto& [nvtx_color, call_idx] = detail::get_next_color_and_call_idx(); + KVIKIO_NVTX_SCOPED_RANGE("FileHandle::pread()", size, nvtx_color); if (is_host_memory(buf)) { auto op = [this](void* hostPtr_base, std::size_t size, @@ -206,7 +206,7 @@ std::future FileHandle::pread(void* buf, _fd_direct_off.fd(), buf, size, file_offset); }; - return parallel_io(op, buf, size, file_offset, task_size, 0, call_idx, nvtx_color_v); + return parallel_io(op, buf, size, file_offset, task_size, 0, call_idx, nvtx_color); } CUcontext ctx = get_context_from_pointer(buf); @@ -236,7 +236,7 @@ std::future FileHandle::pread(void* buf, }; auto [devPtr_base, base_size, devPtr_offset] = get_alloc_info(buf, &ctx); return parallel_io( - task, devPtr_base, size, file_offset, task_size, devPtr_offset, call_idx, nvtx_color_v); + task, devPtr_base, size, file_offset, task_size, devPtr_offset, call_idx, nvtx_color); } std::future FileHandle::pwrite(void const* buf, @@ -246,8 +246,8 @@ std::future FileHandle::pwrite(void const* buf, std::size_t gds_threshold, bool sync_default_stream) { - auto& [nvtx_color_v, call_idx] = detail::get_next_color_and_call_idx(); - KVIKIO_NVTX_SCOPED_RANGE("FileHandle::pwrite()", size, nvtx_color_v); + auto& [nvtx_color, call_idx] = detail::get_next_color_and_call_idx(); + KVIKIO_NVTX_SCOPED_RANGE("FileHandle::pwrite()", size, nvtx_color); if (is_host_memory(buf)) { auto op = [this](void const* hostPtr_base, std::size_t size, @@ -258,7 +258,7 @@ std::future FileHandle::pwrite(void const* buf, _fd_direct_off.fd(), buf, size, file_offset); }; - return parallel_io(op, buf, size, file_offset, task_size, 0, call_idx, nvtx_color_v); + return parallel_io(op, buf, size, file_offset, task_size, 0, call_idx, nvtx_color); } CUcontext ctx = get_context_from_pointer(buf); @@ -288,7 +288,7 @@ std::future FileHandle::pwrite(void const* buf, }; auto [devPtr_base, base_size, devPtr_offset] = get_alloc_info(buf, &ctx); return parallel_io( - op, devPtr_base, size, file_offset, task_size, devPtr_offset, call_idx, nvtx_color_v); + op, devPtr_base, size, file_offset, task_size, devPtr_offset, call_idx, nvtx_color); } void FileHandle::read_async(void* devPtr_base, diff --git a/cpp/src/nvtx.cpp b/cpp/src/nvtx.cpp index 0b26f6c7ac..8611533a2f 100644 --- a/cpp/src/nvtx.cpp +++ b/cpp/src/nvtx.cpp @@ -32,42 +32,42 @@ nvtx_manager& nvtx_manager::instance() noexcept return _instance; } -const nvtx_color& nvtx_manager::default_color() noexcept +const nvtx_color_type& nvtx_manager::default_color() noexcept { #ifdef KVIKIO_CUDA_FOUND - static nvtx_color default_color{nvtx3::argb{0, 255, 255, 255}}; + static nvtx_color_type default_color{nvtx3::argb{0, 255, 255, 255}}; return default_color; #else - static nvtx_color dummy{}; + static nvtx_color_type dummy{}; return dummy; #endif } -const nvtx_color& nvtx_manager::get_color_by_index(std::uint64_t idx) noexcept +const nvtx_color_type& nvtx_manager::get_color_by_index(std::uint64_t idx) noexcept { #ifdef KVIKIO_CUDA_FOUND constexpr std::size_t num_color{16}; static_assert((num_color & (num_color - 1)) == 0); // Is power of 2 - static std::array color_palette = {nvtx3::rgb{106, 192, 67}, - nvtx3::rgb{191, 73, 203}, - nvtx3::rgb{93, 151, 76}, - nvtx3::rgb{96, 72, 194}, - nvtx3::rgb{179, 170, 71}, - nvtx3::rgb{92, 58, 113}, - nvtx3::rgb{212, 136, 57}, - nvtx3::rgb{96, 144, 194}, - nvtx3::rgb{211, 69, 56}, - nvtx3::rgb{97, 179, 155}, - nvtx3::rgb{203, 69, 131}, - nvtx3::rgb{57, 89, 48}, - nvtx3::rgb{184, 133, 199}, - nvtx3::rgb{128, 102, 51}, - nvtx3::rgb{211, 138, 130}, - nvtx3::rgb{122, 50, 49}}; + static std::array color_palette = {nvtx3::rgb{106, 192, 67}, + nvtx3::rgb{191, 73, 203}, + nvtx3::rgb{93, 151, 76}, + nvtx3::rgb{96, 72, 194}, + nvtx3::rgb{179, 170, 71}, + nvtx3::rgb{92, 58, 113}, + nvtx3::rgb{212, 136, 57}, + nvtx3::rgb{96, 144, 194}, + nvtx3::rgb{211, 69, 56}, + nvtx3::rgb{97, 179, 155}, + nvtx3::rgb{203, 69, 131}, + nvtx3::rgb{57, 89, 48}, + nvtx3::rgb{184, 133, 199}, + nvtx3::rgb{128, 102, 51}, + nvtx3::rgb{211, 138, 130}, + nvtx3::rgb{122, 50, 49}}; auto safe_idx = idx & (num_color - 1); // idx % num_color return color_palette[safe_idx]; #else - static nvtx_color dummy{}; + static nvtx_color_type dummy{}; return dummy; #endif } diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index c6ad0f4d28..1096cfac5a 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -393,7 +393,7 @@ std::future RemoteHandle::pread(void* buf, std::size_t file_offset, std::size_t task_size) { - auto& [nvtx_color_v, call_idx] = detail::get_next_color_and_call_idx(); + auto& [nvtx_color, call_idx] = detail::get_next_color_and_call_idx(); KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle::pread()", size); auto task = [this](void* devPtr_base, std::size_t size, @@ -401,7 +401,7 @@ std::future RemoteHandle::pread(void* buf, std::size_t devPtr_offset) -> std::size_t { return read(static_cast(devPtr_base) + devPtr_offset, size, file_offset); }; - return parallel_io(task, buf, size, file_offset, task_size, 0, call_idx, nvtx_color_v); + return parallel_io(task, buf, size, file_offset, task_size, 0, call_idx, nvtx_color); } } // namespace kvikio From 5725cb2c6b13de800d1a95b3aa7313f46b7457ad Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 10 Feb 2025 14:56:00 -0500 Subject: [PATCH 8/9] Add more comments --- cpp/include/kvikio/nvtx.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/include/kvikio/nvtx.hpp b/cpp/include/kvikio/nvtx.hpp index ef0ff7de84..586cf98017 100644 --- a/cpp/include/kvikio/nvtx.hpp +++ b/cpp/include/kvikio/nvtx.hpp @@ -148,7 +148,10 @@ class nvtx_manager { * @brief Convenience macro for generating an NVTX scoped range in the `libkvikio` domain to * annotate a time duration. * - * Takes two arguments (message, payload). + * @param message String literal for NVTX annotation. To improve profile-time performance, the + * string literal is registered in NVTX. + * @param payload NVTX payload. + * @param color (Optional) NVTX color. If unspecified, a default NVTX color is used. * * Example: * ``` @@ -170,8 +173,9 @@ class nvtx_manager { * @brief Convenience macro for generating an NVTX marker in the `libkvikio` domain to annotate a * certain time point. * - * Takes two arguments (message, payload). Use this macro to annotate asynchronous I/O - * operations, where the payload refers to the I/O size. + * @param message String literal for NVTX annotation. To improve profile-time performance, the + * string literal is registered in NVTX. + * @param payload NVTX payload. * * Example: * ``` From 37df91a6f0c1cb1c9e5d966159bf8ea7b5e5f563 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 10 Feb 2025 15:00:03 -0500 Subject: [PATCH 9/9] Improve comments --- cpp/include/kvikio/nvtx.hpp | 40 +++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/cpp/include/kvikio/nvtx.hpp b/cpp/include/kvikio/nvtx.hpp index 586cf98017..fc401fd38a 100644 --- a/cpp/include/kvikio/nvtx.hpp +++ b/cpp/include/kvikio/nvtx.hpp @@ -43,34 +43,36 @@ using nvtx_registered_string_type = nvtx3::registered_string_in auto& { \ - static kvikio::nvtx_registered_string_type a_reg_str{a_msg}; \ - return a_reg_str; \ - }(msg) +#define KVIKIO_REGISTER_STRING(message) \ + [](const char* a_message) -> auto& { \ + static kvikio::nvtx_registered_string_type a_reg_str{a_message}; \ + return a_reg_str; \ + }(message) -// Macro overloads of KVIKIO_NVTX_FUNC_RANGE +// Implementation of KVIKIO_NVTX_FUNC_RANGE() #define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(kvikio::libkvikio_domain) -#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(msg, payload_v, color) \ - kvikio::nvtx_scoped_range_type KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ - { \ - nvtx3::event_attributes \ - { \ - KVIKIO_REGISTER_STRING(msg), nvtx3::payload{kvikio::convert_to_64bit(payload_v)}, color \ - } \ +// Implementation of KVIKIO_NVTX_SCOPED_RANGE(...) +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(message, payload_v, color) \ + kvikio::nvtx_scoped_range_type KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ + { \ + nvtx3::event_attributes \ + { \ + KVIKIO_REGISTER_STRING(message), nvtx3::payload{kvikio::convert_to_64bit(payload_v)}, color \ + } \ } -#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_2(msg, payload) \ - KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(msg, payload, kvikio::nvtx_manager::default_color()) +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_2(message, payload) \ + KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(message, payload, kvikio::nvtx_manager::default_color()) #define KVIKIO_NVTX_SCOPED_RANGE_SELECTOR(_1, _2, _3, NAME, ...) NAME #define KVIKIO_NVTX_SCOPED_RANGE_IMPL(...) \ KVIKIO_NVTX_SCOPED_RANGE_SELECTOR( \ __VA_ARGS__, KVIKIO_NVTX_SCOPED_RANGE_IMPL_3, KVIKIO_NVTX_SCOPED_RANGE_IMPL_2) \ (__VA_ARGS__) -#define KVIKIO_NVTX_MARKER_IMPL(msg, payload_v) \ +// Implementation of KVIKIO_NVTX_MARKER(message, payload) +#define KVIKIO_NVTX_MARKER_IMPL(message, payload_v) \ nvtx3::mark_in(nvtx3::event_attributes{ \ - KVIKIO_REGISTER_STRING(msg), nvtx3::payload{kvikio::convert_to_64bit(payload_v)}}) + KVIKIO_REGISTER_STRING(message), nvtx3::payload{kvikio::convert_to_64bit(payload_v)}}) #endif @@ -164,8 +166,8 @@ class nvtx_manager { #ifdef KVIKIO_CUDA_FOUND #define KVIKIO_NVTX_SCOPED_RANGE(...) KVIKIO_NVTX_SCOPED_RANGE_IMPL(__VA_ARGS__) #else -#define KVIKIO_NVTX_SCOPED_RANGE(msg, payload, ...) \ - do { \ +#define KVIKIO_NVTX_SCOPED_RANGE(message, payload, ...) \ + do { \ } while (0) #endif