Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add from_arrow_device function to cudf interop using nanoarrow #15458

Merged
merged 16 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ add_library(
src/interop/from_arrow.cu
src/interop/to_arrow.cu
src/interop/to_arrow_device.cu
src/interop/from_arrow_device.cu
src/interop/to_arrow_schema.cpp
src/interop/to_arrow_utilities.cpp
src/interop/detail/arrow_allocator.cpp
Expand Down
124 changes: 124 additions & 0 deletions cpp/include/cudf/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -348,5 +348,129 @@ std::unique_ptr<cudf::scalar> from_arrow(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
* @brief Alias for a vector of owning columns, used for conversion from ArrowDeviceArray
*
* Instances of this type are handed to `custom_view_deleter`, which stores them so that the
* columns stay alive for as long as the deleter (and the unique_ptr holding it) does.
*/
using owned_columns_t = std::vector<std::unique_ptr<cudf::column>>;

/**
 * @brief Deleter functor for a `std::unique_ptr` that manages a heap-allocated view
 *
 * A view such as `cudf::table_view` does not own the data it refers to. Converting from an
 * ArrowDeviceArray cannot always be zero-copy (e.g. bools or non-UINT32 dictionary indices),
 * so the columns allocated during such a conversion are stored inside this deleter. They are
 * therefore kept alive for as long as the unique_ptr that carries the deleter.
 *
 * @tparam ViewType Type of the view object released by this deleter
 */
template <typename ViewType>
struct custom_view_deleter {
  owned_columns_t owned_mem_;  ///< Owned columns that must be deleted.

  /**
   * @brief Construct a new custom view deleter object
   *
   * @param owned Vector of owning columns; its contents are moved into the deleter
   */
  explicit custom_view_deleter(owned_columns_t&& owned) : owned_mem_(std::move(owned)) {}

  /**
   * @brief Destroy the view object managed by the unique_ptr
   *
   * @param ptr Pointer to the object to be deleted
   */
  void operator()(ViewType* ptr) const
  {
    delete ptr;
  }
};

/**
* @brief Alias for a unique_ptr to a `cudf::table_view` with a custom deleter
*
* The deleter type is `custom_view_deleter<cudf::table_view>`, which stores the owned columns
* alongside the pointer so they are released together with the view.
*/
using unique_table_view_t =
std::unique_ptr<cudf::table_view, custom_view_deleter<cudf::table_view>>;

/**
* @brief Create `cudf::table_view` from given `ArrowDeviceArray` and `ArrowSchema`
*
* Constructs a non-owning `cudf::table_view` using `ArrowDeviceArray` and `ArrowSchema`,
* data must be accessible to the CUDA device. Because the resulting `cudf::table_view` will
* not own the data, the `ArrowDeviceArray` must be kept alive for the lifetime of the result.
* It is the responsibility of callers to ensure they call the release callback on the
* `ArrowDeviceArray` after it is no longer needed, and that the `cudf::table_view` is not
* accessed after this happens.
*
* Each child of the input struct becomes one column of the resulting table_view.
*
* @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
* or `ARROW_DEVICE_CUDA_MANAGED`
*
* @throws cudf::data_type_error if the input array is not a struct array, non-struct
* arrays should be passed to `from_arrow_device_column` instead.
*
* @throws cudf::data_type_error if the input arrow data type is not supported.
*
* @note The custom deleter used for the unique_ptr to the table_view maintains ownership
* over any memory which is allocated, such as converting boolean columns from the bitmap
* used by Arrow to the 1-byte per value for cudf.
*
* @note If the input `ArrowDeviceArray` contained a non-null sync_event it is assumed
* to be a `cudaEvent_t*` and the passed in stream will have `cudaStreamWaitEvent` called
* on it with the event. This function, however, will not explicitly synchronize on the
* stream.
*
* @param schema `ArrowSchema` pointer to object describing the type of the device array
* @param input `ArrowDeviceArray` pointer to object owning the Arrow data
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to perform any allocations
* @return `unique_table_view_t` holding the `cudf::table_view` generated from the given Arrow
* data; its deleter owns any columns that had to be allocated during conversion
*/
// NOTE(review): `from_arrow` declared just above takes `rmm::device_async_resource_ref mr`,
// while this declaration takes a raw `rmm::mr::device_memory_resource*`. Consider aligning
// the two; the out-of-line definition would have to change in the same commit.
unique_table_view_t from_arrow_device(
ArrowSchema const* schema,
ArrowDeviceArray const* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Alias for a unique_ptr to a `cudf::column_view` with a custom deleter
*
* The deleter type is `custom_view_deleter<cudf::column_view>`, which stores the owned columns
* alongside the pointer so they are released together with the view.
*/
using unique_column_view_t =
std::unique_ptr<cudf::column_view, custom_view_deleter<cudf::column_view>>;

/**
* @brief Create `cudf::column_view` from given `ArrowDeviceArray` and `ArrowSchema`
*
* Constructs a non-owning `cudf::column_view` using `ArrowDeviceArray` and `ArrowSchema`,
* data must be accessible to the CUDA device. Because the resulting `cudf::column_view` will
* not own the data, the `ArrowDeviceArray` must be kept alive for the lifetime of the result.
* It is the responsibility of callers to ensure they call the release callback on the
* `ArrowDeviceArray` after it is no longer needed, and that the `cudf::column_view` is not
* accessed after this happens.
*
* @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
* or `ARROW_DEVICE_CUDA_MANAGED`
*
* @throws cudf::data_type_error if the input arrow data type is not supported.
*
* @note The custom deleter used for the unique_ptr to the column_view maintains ownership
* over any memory which is allocated, such as converting boolean columns from the bitmap
* used by Arrow to the 1-byte per value for cudf.
*
* @note If the input `ArrowDeviceArray` contained a non-null sync_event it is assumed
* to be a `cudaEvent_t*` and the passed in stream will have `cudaStreamWaitEvent` called
* on it with the event. This function, however, will not explicitly synchronize on the
* stream.
*
* @param schema `ArrowSchema` pointer to object describing the type of the device array
* @param input `ArrowDeviceArray` pointer to object owning the Arrow data
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to perform any allocations
* @return `unique_column_view_t` holding the `cudf::column_view` generated from the given
* Arrow data; its deleter owns any columns that had to be allocated during conversion
*/
// NOTE(review): `from_arrow` declared earlier in this header takes
// `rmm::device_async_resource_ref mr`, while this declaration takes a raw
// `rmm::mr::device_memory_resource*`. Consider aligning the two; the out-of-line definition
// would have to change in the same commit.
unique_column_view_t from_arrow_device_column(
ArrowSchema const* schema,
ArrowDeviceArray const* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace cudf
30 changes: 30 additions & 0 deletions cpp/src/interop/arrow_utilities.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

namespace cudf {
namespace detail {

// Buffer indexes of Arrow arrays, shared by the interop sources.

/// Index of the validity buffer within an Arrow array's buffers
static constexpr int validity_buffer_idx{0};

/// Index of the data buffer within a fixed-width Arrow array's buffers
static constexpr int fixed_width_data_buffer_idx{1};

}  // namespace detail
}  // namespace cudf
Loading
Loading