diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4c1daa58c6..fc59f35591 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -144,6 +144,7 @@ set(SOURCES "src/error.cpp" "src/file_handle.cpp" "src/file_utils.cpp" + "src/nvtx.cpp" "src/posix_io.cpp" "src/shim/cuda.cpp" "src/shim/cufile.cpp" diff --git a/cpp/include/kvikio/nvtx.hpp b/cpp/include/kvikio/nvtx.hpp new file mode 100644 index 0000000000..fc401fd38a --- /dev/null +++ b/cpp/include/kvikio/nvtx.hpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#ifdef KVIKIO_CUDA_FOUND +#include +#endif + +#include +#include + +namespace kvikio { + +#ifdef KVIKIO_CUDA_FOUND +/** + * @brief Tag type for libkvikio's NVTX domain. + */ +struct libkvikio_domain { + static constexpr char const* name{"libkvikio"}; +}; + +using nvtx_scoped_range_type = nvtx3::scoped_range_in; +using nvtx_registered_string_type = nvtx3::registered_string_in; + +// Macro to concatenate two tokens x and y. +#define KVIKIO_CONCAT_HELPER(x, y) x##y +#define KVIKIO_CONCAT(x, y) KVIKIO_CONCAT_HELPER(x, y) + +// Macro to create a static, registered string that will not have a name conflict with any +// registered string defined in the same scope. 
+#define KVIKIO_REGISTER_STRING(message) \ + [](const char* a_message) -> auto& { \ + static kvikio::nvtx_registered_string_type a_reg_str{a_message}; \ + return a_reg_str; \ + }(message) + +// Implementation of KVIKIO_NVTX_FUNC_RANGE() +#define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(kvikio::libkvikio_domain) + +// Implementation of KVIKIO_NVTX_SCOPED_RANGE(...) +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(message, payload_v, color) \ + kvikio::nvtx_scoped_range_type KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ + { \ + nvtx3::event_attributes \ + { \ + KVIKIO_REGISTER_STRING(message), nvtx3::payload{kvikio::convert_to_64bit(payload_v)}, color \ + } \ + } +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL_2(message, payload) \ + KVIKIO_NVTX_SCOPED_RANGE_IMPL_3(message, payload, kvikio::nvtx_manager::default_color()) +#define KVIKIO_NVTX_SCOPED_RANGE_SELECTOR(_1, _2, _3, NAME, ...) NAME +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL(...) \ + KVIKIO_NVTX_SCOPED_RANGE_SELECTOR( \ + __VA_ARGS__, KVIKIO_NVTX_SCOPED_RANGE_IMPL_3, KVIKIO_NVTX_SCOPED_RANGE_IMPL_2) \ + (__VA_ARGS__) + +// Implementation of KVIKIO_NVTX_MARKER(message, payload) +#define KVIKIO_NVTX_MARKER_IMPL(message, payload_v) \ + nvtx3::mark_in(nvtx3::event_attributes{ \ + KVIKIO_REGISTER_STRING(message), nvtx3::payload{kvikio::convert_to_64bit(payload_v)}}) + +#endif + +#ifdef KVIKIO_CUDA_FOUND +using nvtx_color_type = nvtx3::color; +#else +using nvtx_color_type = int; +#endif + +/** + * @brief Utility singleton class for NVTX annotation. + */ +class nvtx_manager { + public: + static nvtx_manager& instance() noexcept; + + /** + * @brief Return the default color. + * + * @return Default color. + */ + static const nvtx_color_type& default_color() noexcept; + + /** + * @brief Return the color at the given index from the internal color palette whose size n is a + * power of 2. The index may exceed the size of the color palette, in which case it wraps around, + * i.e. (idx mod n). + * + * @param idx The index value. 
+ * @return The color picked from the internal color palette. + */ + static const nvtx_color_type& get_color_by_index(std::uint64_t idx) noexcept; + + /** + * @brief Rename the current thread under the KvikIO NVTX domain. + * + * @note This NVTX feature is currently not supported by the Nsight System profiler. As a result, + * the OS thread will not be renamed in the nsys-ui. + */ + static void rename_current_thread(std::string_view new_name) noexcept; + + nvtx_manager(nvtx_manager const&) = delete; + nvtx_manager& operator=(nvtx_manager const&) = delete; + nvtx_manager(nvtx_manager&&) = delete; + nvtx_manager& operator=(nvtx_manager&&) = delete; + + private: + nvtx_manager() = default; +}; + +/** + * @brief Convenience macro for generating an NVTX range in the `libkvikio` domain + * from the lifetime of a function. + * + * Takes no argument. The name of the immediately enclosing function returned by `__func__` is used + * as the message. + * + * Example: + * ``` + * void some_function(){ + * KVIKIO_NVTX_FUNC_RANGE(); // The name `some_function` is used as the message + * ... + * } + * ``` + */ +#ifdef KVIKIO_CUDA_FOUND +#define KVIKIO_NVTX_FUNC_RANGE() KVIKIO_NVTX_FUNC_RANGE_IMPL() +#else +#define KVIKIO_NVTX_FUNC_RANGE(...) \ + do { \ + } while (0) +#endif + +/** + * @brief Convenience macro for generating an NVTX scoped range in the `libkvikio` domain to + * annotate a time duration. + * + * @param message String literal for NVTX annotation. To improve profile-time performance, the + * string literal is registered in NVTX. + * @param payload NVTX payload. + * @param color (Optional) NVTX color. If unspecified, a default NVTX color is used. + * + * Example: + * ``` + * void some_function(){ + * KVIKIO_NVTX_SCOPED_RANGE("my function", 42); + * ... + * } + * ``` + */ +#ifdef KVIKIO_CUDA_FOUND +#define KVIKIO_NVTX_SCOPED_RANGE(...) KVIKIO_NVTX_SCOPED_RANGE_IMPL(__VA_ARGS__) +#else +#define KVIKIO_NVTX_SCOPED_RANGE(message, payload, ...) 
\ + do { \ + } while (0) +#endif + +/** + * @brief Convenience macro for generating an NVTX marker in the `libkvikio` domain to annotate a + * certain time point. + * + * @param message String literal for NVTX annotation. To improve profile-time performance, the + * string literal is registered in NVTX. + * @param payload NVTX payload. + * + * Example: + * ``` + * std::future some_function(){ + * size_t io_size{2077}; + * KVIKIO_NVTX_MARKER("I/O operation", io_size); + * perform_async_io_operation(io_size); + * ... + * } + * ``` + */ +#ifdef KVIKIO_CUDA_FOUND +#define KVIKIO_NVTX_MARKER(message, payload) KVIKIO_NVTX_MARKER_IMPL(message, payload) +#else +#define KVIKIO_NVTX_MARKER(message, payload) \ + do { \ + } while (0) +#endif + +} // namespace kvikio diff --git a/cpp/include/kvikio/parallel_operation.hpp b/cpp/include/kvikio/parallel_operation.hpp index f345333c4f..2c59d8ad5b 100644 --- a/cpp/include/kvikio/parallel_operation.hpp +++ b/cpp/include/kvikio/parallel_operation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -24,18 +25,52 @@ #include #include +#include #include namespace kvikio { namespace detail { +/** + * @brief Determine the NVTX color and call index. They are used to identify tasks from different + * pread/pwrite calls. Tasks from the same pread/pwrite call are given the same color and call + * index. The call index is atomically incremented on each pread/pwrite call, and will wrap around + * once it reaches the maximum value the integer type `std::uint64_t` can hold (this overflow + * behavior is well-defined in C++). The color is picked from an internal color palette according to + * the call index value. + * + * @return A pair of NVTX color and call index. 
+ */ +inline const std::pair get_next_color_and_call_idx() noexcept +{ + static std::atomic_uint64_t call_counter{0ull}; + auto call_idx = + 1ull + std::atomic_fetch_add_explicit(&call_counter, 1ull, std::memory_order_relaxed); + auto& nvtx_color = nvtx_manager::get_color_by_index(call_idx); + return {nvtx_color, call_idx}; +} + template -std::future submit_task( - F op, T buf, std::size_t size, std::size_t file_offset, std::size_t devPtr_offset) +std::future submit_task(F op, + T buf, + std::size_t size, + std::size_t file_offset, + std::size_t devPtr_offset, + std::uint64_t nvtx_payload = 0ull, + nvtx_color_type nvtx_color = nvtx_manager::default_color()) { - return defaults::thread_pool().submit_task( - [=] { return op(buf, size, file_offset, devPtr_offset); }); + return defaults::thread_pool().submit_task([=] { + KVIKIO_NVTX_SCOPED_RANGE("task", nvtx_payload, nvtx_color); + + // Rename the worker thread in the thread pool to improve clarity from nsys-ui. + // Note: This NVTX feature is currently not supported by nsys-ui. 
+ thread_local std::once_flag call_once_per_thread; + std::call_once(call_once_per_thread, + [] { nvtx_manager::rename_current_thread("thread pool"); }); + + return op(buf, size, file_offset, devPtr_offset); + }); } } // namespace detail @@ -58,13 +93,15 @@ std::future parallel_io(F op, std::size_t size, std::size_t file_offset, std::size_t task_size, - std::size_t devPtr_offset) + std::size_t devPtr_offset, + std::uint64_t call_idx = 0, + nvtx_color_type nvtx_color = nvtx_manager::default_color()) { if (task_size == 0) { throw std::invalid_argument("`task_size` cannot be zero"); } // Single-task guard if (task_size >= size || page_size >= size) { - return detail::submit_task(op, buf, size, file_offset, devPtr_offset); + return detail::submit_task(op, buf, size, file_offset, devPtr_offset, call_idx, nvtx_color); } // We know an upper bound of the total number of tasks @@ -73,14 +110,18 @@ std::future parallel_io(F op, // 1) Submit `task_size` sized tasks while (size >= task_size) { - tasks.push_back(detail::submit_task(op, buf, task_size, file_offset, devPtr_offset)); + tasks.push_back( + detail::submit_task(op, buf, task_size, file_offset, devPtr_offset, call_idx, nvtx_color)); file_offset += task_size; devPtr_offset += task_size; size -= task_size; } // 2) Submit a task for the remainder - if (size > 0) { tasks.push_back(detail::submit_task(op, buf, size, file_offset, devPtr_offset)); } + if (size > 0) { + tasks.push_back( + detail::submit_task(op, buf, size, file_offset, devPtr_offset, call_idx, nvtx_color)); + } // Finally, we sum the result of all tasks. 
auto gather_tasks = [](std::vector>&& tasks) -> std::size_t { diff --git a/cpp/include/kvikio/posix_io.hpp b/cpp/include/kvikio/posix_io.hpp index 99964315b3..955262066e 100644 --- a/cpp/include/kvikio/posix_io.hpp +++ b/cpp/include/kvikio/posix_io.hpp @@ -23,6 +23,7 @@ #include #include +#include #include #include diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index b10e54c482..4b0c8aa75a 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -23,10 +23,6 @@ #include #include -#ifdef KVIKIO_CUDA_FOUND -#include -#endif - #include namespace kvikio { @@ -54,6 +50,12 @@ template >* = nullptr> return std::int64_t(value); } +/** + * @brief Helper function to allow NVTX payload of type std::uint64_t to pass through without doing + * anything. + */ +[[nodiscard]] inline std::uint64_t convert_to_64bit(std::uint64_t value) { return value; } + /** * @brief Help function to convert value to 64 bit float */ @@ -155,112 +157,4 @@ bool is_future_done(T const& future) return future.wait_for(std::chrono::seconds(0)) != std::future_status::timeout; } -#ifdef KVIKIO_CUDA_FOUND -/** - * @brief Tag type for libkvikio's NVTX domain. - */ -struct libkvikio_domain { - static constexpr char const* name{"libkvikio"}; -}; - -// Macro to concatenate two tokens x and y. -#define KVIKIO_CONCAT_HELPER(x, y) x##y -#define KVIKIO_CONCAT(x, y) KVIKIO_CONCAT_HELPER(x, y) - -// Macro to create a static, registered string that will not have a name conflict with any -// registered string defined in the same scope. 
-#define KVIKIO_REGISTER_STRING(msg) \ - [](const char* a_msg) -> auto& { \ - static nvtx3::registered_string_in a_reg_str{a_msg}; \ - return a_reg_str; \ - }(msg) - -// Macro overloads of KVIKIO_NVTX_FUNC_RANGE -#define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(libkvikio_domain) - -#define KVIKIO_NVTX_SCOPED_RANGE_IMPL(msg, val) \ - nvtx3::scoped_range_in KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ - { \ - nvtx3::event_attributes \ - { \ - KVIKIO_REGISTER_STRING(msg), nvtx3::payload { convert_to_64bit(val) } \ - } \ - } - -#define KVIKIO_NVTX_MARKER_IMPL(msg, val) \ - nvtx3::mark_in( \ - nvtx3::event_attributes{KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(val)}}) - -#endif - -/** - * @brief Convenience macro for generating an NVTX range in the `libkvikio` domain - * from the lifetime of a function. - * - * Takes no argument. The name of the immediately enclosing function returned by `__func__` is used - * as the message. - * - * Example: - * ``` - * void some_function(){ - * KVIKIO_NVTX_FUNC_RANGE(); // The name `some_function` is used as the message - * ... - * } - * ``` - */ -#ifdef KVIKIO_CUDA_FOUND -#define KVIKIO_NVTX_FUNC_RANGE() KVIKIO_NVTX_FUNC_RANGE_IMPL() -#else -#define KVIKIO_NVTX_FUNC_RANGE(...) \ - do { \ - } while (0) -#endif - -/** - * @brief Convenience macro for generating an NVTX scoped range in the `libkvikio` domain to - * annotate a time duration. - * - * Takes two arguments (message, payload). - * - * Example: - * ``` - * void some_function(){ - * KVIKIO_NVTX_SCOPED_RANGE("my function", 42); - * ... - * } - * ``` - */ -#ifdef KVIKIO_CUDA_FOUND -#define KVIKIO_NVTX_SCOPED_RANGE(msg, val) KVIKIO_NVTX_SCOPED_RANGE_IMPL(msg, val) -#else -#define KVIKIO_NVTX_SCOPED_RANGE(msg, val) \ - do { \ - } while (0) -#endif - -/** - * @brief Convenience macro for generating an NVTX marker in the `libkvikio` domain to annotate a - * certain time point. - * - * Takes two arguments (message, payload). 
Use this macro to annotate asynchronous I/O operations, - * where the payload refers to the I/O size. - * - * Example: - * ``` - * std::future some_function(){ - * size_t io_size{2077}; - * KVIKIO_NVTX_MARKER("I/O operation", io_size); - * perform_async_io_operation(io_size); - * ... - * } - * ``` - */ -#ifdef KVIKIO_CUDA_FOUND -#define KVIKIO_NVTX_MARKER(message, payload) KVIKIO_NVTX_MARKER_IMPL(message, payload) -#else -#define KVIKIO_NVTX_MARKER(message, payload) \ - do { \ - } while (0) -#endif - } // namespace kvikio diff --git a/cpp/src/file_handle.cpp b/cpp/src/file_handle.cpp index 0e65afb7fd..2b88fb2784 100644 --- a/cpp/src/file_handle.cpp +++ b/cpp/src/file_handle.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include namespace kvikio { @@ -139,13 +141,13 @@ std::size_t FileHandle::read(void* devPtr_base, std::size_t devPtr_offset, bool sync_default_stream) { + KVIKIO_NVTX_SCOPED_RANGE("FileHandle::read()", size); if (is_compat_mode_preferred()) { return detail::posix_device_read( _fd_direct_off.fd(), devPtr_base, size, file_offset, devPtr_offset); } if (sync_default_stream) { CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } - KVIKIO_NVTX_SCOPED_RANGE("cufileRead()", size); ssize_t ret = cuFileAPI::instance().Read(_cufile_handle.handle(), devPtr_base, size, @@ -161,6 +163,7 @@ std::size_t FileHandle::write(void const* devPtr_base, std::size_t devPtr_offset, bool sync_default_stream) { + KVIKIO_NVTX_SCOPED_RANGE("FileHandle::write()", size); _nbytes = 0; // Invalidate the computed file size if (is_compat_mode_preferred()) { @@ -169,7 +172,6 @@ std::size_t FileHandle::write(void const* devPtr_base, } if (sync_default_stream) { CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } - KVIKIO_NVTX_SCOPED_RANGE("cufileWrite()", size); ssize_t ret = cuFileAPI::instance().Write(_cufile_handle.handle(), devPtr_base, size, @@ -192,7 +194,8 @@ 
std::future FileHandle::pread(void* buf, std::size_t gds_threshold, bool sync_default_stream) { - KVIKIO_NVTX_MARKER("FileHandle::pread()", size); + auto& [nvtx_color, call_idx] = detail::get_next_color_and_call_idx(); + KVIKIO_NVTX_SCOPED_RANGE("FileHandle::pread()", size, nvtx_color); if (is_host_memory(buf)) { auto op = [this](void* hostPtr_base, std::size_t size, @@ -203,7 +206,7 @@ std::future FileHandle::pread(void* buf, _fd_direct_off.fd(), buf, size, file_offset); }; - return parallel_io(op, buf, size, file_offset, task_size, 0); + return parallel_io(op, buf, size, file_offset, task_size, 0, call_idx, nvtx_color); } CUcontext ctx = get_context_from_pointer(buf); @@ -232,7 +235,8 @@ std::future FileHandle::pread(void* buf, return read(devPtr_base, size, file_offset, devPtr_offset, /* sync_default_stream = */ false); }; auto [devPtr_base, base_size, devPtr_offset] = get_alloc_info(buf, &ctx); - return parallel_io(task, devPtr_base, size, file_offset, task_size, devPtr_offset); + return parallel_io( + task, devPtr_base, size, file_offset, task_size, devPtr_offset, call_idx, nvtx_color); } std::future FileHandle::pwrite(void const* buf, @@ -242,7 +246,8 @@ std::future FileHandle::pwrite(void const* buf, std::size_t gds_threshold, bool sync_default_stream) { - KVIKIO_NVTX_MARKER("FileHandle::pwrite()", size); + auto& [nvtx_color, call_idx] = detail::get_next_color_and_call_idx(); + KVIKIO_NVTX_SCOPED_RANGE("FileHandle::pwrite()", size, nvtx_color); if (is_host_memory(buf)) { auto op = [this](void const* hostPtr_base, std::size_t size, @@ -253,7 +258,7 @@ std::future FileHandle::pwrite(void const* buf, _fd_direct_off.fd(), buf, size, file_offset); }; - return parallel_io(op, buf, size, file_offset, task_size, 0); + return parallel_io(op, buf, size, file_offset, task_size, 0, call_idx, nvtx_color); } CUcontext ctx = get_context_from_pointer(buf); @@ -282,7 +287,8 @@ std::future FileHandle::pwrite(void const* buf, return write(devPtr_base, size, file_offset, 
devPtr_offset, /* sync_default_stream = */ false); }; auto [devPtr_base, base_size, devPtr_offset] = get_alloc_info(buf, &ctx); - return parallel_io(op, devPtr_base, size, file_offset, task_size, devPtr_offset); + return parallel_io( + op, devPtr_base, size, file_offset, task_size, devPtr_offset, call_idx, nvtx_color); } void FileHandle::read_async(void* devPtr_base, diff --git a/cpp/src/nvtx.cpp b/cpp/src/nvtx.cpp new file mode 100644 index 0000000000..8611533a2f --- /dev/null +++ b/cpp/src/nvtx.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include
+#include
+#include
+
+#ifdef KVIKIO_CUDA_FOUND
+#include
+#endif
+
+#include
+
+namespace kvikio {
+
+nvtx_manager& nvtx_manager::instance() noexcept
+{
+  static nvtx_manager _instance;
+  return _instance;
+}
+
+const nvtx_color_type& nvtx_manager::default_color() noexcept
+{
+#ifdef KVIKIO_CUDA_FOUND
+  static nvtx_color_type default_color{nvtx3::argb{0, 255, 255, 255}};
+  return default_color;
+#else
+  static nvtx_color_type dummy{};
+  return dummy;
+#endif
+}
+
+const nvtx_color_type& nvtx_manager::get_color_by_index(std::uint64_t idx) noexcept
+{
+#ifdef KVIKIO_CUDA_FOUND
+  constexpr std::size_t num_color{16};
+  static_assert((num_color & (num_color - 1)) == 0);  // Is power of 2
+  static std::array color_palette = {nvtx3::rgb{106, 192, 67},
+                                     nvtx3::rgb{191, 73, 203},
+                                     nvtx3::rgb{93, 151, 76},
+                                     nvtx3::rgb{96, 72, 194},
+                                     nvtx3::rgb{179, 170, 71},
+                                     nvtx3::rgb{92, 58, 113},
+                                     nvtx3::rgb{212, 136, 57},
+                                     nvtx3::rgb{96, 144, 194},
+                                     nvtx3::rgb{211, 69, 56},
+                                     nvtx3::rgb{97, 179, 155},
+                                     nvtx3::rgb{203, 69, 131},
+                                     nvtx3::rgb{57, 89, 48},
+                                     nvtx3::rgb{184, 133, 199},
+                                     nvtx3::rgb{128, 102, 51},
+                                     nvtx3::rgb{211, 138, 130},
+                                     nvtx3::rgb{122, 50, 49}};
+  auto safe_idx = idx & (num_color - 1);  // idx % num_color
+  return color_palette[safe_idx];
+#else
+  static nvtx_color_type dummy{};
+  return dummy;
+#endif
+}
+
+void nvtx_manager::rename_current_thread(std::string_view new_name) noexcept
+{
+#ifdef KVIKIO_CUDA_FOUND
+  auto tid = syscall(SYS_gettid);
+  std::stringstream ss;
+  ss << new_name << " (" << tid << ")";  // Label has the form "<new_name> (<tid>)".
+  auto const thread_label = ss.str();  // Named local: ss.str() is a temporary; keep it alive.
+
+  nvtxResourceAttributes_t attribs = {0};
+  attribs.version             = NVTX_VERSION;
+  attribs.size                = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
+  attribs.identifierType      = NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE;
+  attribs.identifier.ullValue = tid;
+  attribs.messageType         = NVTX_MESSAGE_TYPE_ASCII;
+  attribs.message.ascii       = thread_label.c_str();  // Valid until thread_label leaves scope.
+  nvtxResourceHandle_t handle = nvtxDomainResourceCreate(nvtx3::domain::get(), &attribs);
+#endif
+}
+
+} //
namespace kvikio diff --git a/cpp/src/posix_io.cpp b/cpp/src/posix_io.cpp index 9576f284dc..ed149f5d43 100644 --- a/cpp/src/posix_io.cpp +++ b/cpp/src/posix_io.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 8ca04f94ed..1096cfac5a 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -392,6 +393,7 @@ std::future RemoteHandle::pread(void* buf, std::size_t file_offset, std::size_t task_size) { + auto& [nvtx_color, call_idx] = detail::get_next_color_and_call_idx(); KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle::pread()", size); auto task = [this](void* devPtr_base, std::size_t size, @@ -399,7 +401,7 @@ std::future RemoteHandle::pread(void* buf, std::size_t devPtr_offset) -> std::size_t { return read(static_cast(devPtr_base) + devPtr_offset, size, file_offset); }; - return parallel_io(task, buf, size, file_offset, task_size, 0); + return parallel_io(task, buf, size, file_offset, task_size, 0, call_idx, nvtx_color); } } // namespace kvikio diff --git a/cpp/src/utils.cpp b/cpp/src/utils.cpp index bed2cbafbc..fdc174f791 100644 --- a/cpp/src/utils.cpp +++ b/cpp/src/utils.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -23,10 +22,6 @@ #include #include -#ifdef KVIKIO_CUDA_FOUND -#include -#endif - #include #include #include