diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4051ff822be..4f838ba3f45 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -60,6 +60,8 @@ repos:
# of dependencies, so we'll have to update this manually.
additional_dependencies:
- cmakelang==0.6.13
+ verbose: true
+ require_serial: true
- id: cmake-lint
name: cmake-lint
entry: ./cpp/scripts/run-cmake-format.sh cmake-lint
@@ -69,6 +71,8 @@ repos:
# of dependencies, so we'll have to update this manually.
additional_dependencies:
- cmakelang==0.6.13
+ verbose: true
+ require_serial: true
- id: copyright-check
name: copyright-check
# This hook's use of Git tools appears to conflict with
diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh
index 5d03a518fcf..5593633640a 100755
--- a/ci/benchmark/build.sh
+++ b/ci/benchmark/build.sh
@@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/"
export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache"
# Dask & Distributed option to install main(nightly) or `conda-forge` packages.
-export INSTALL_DASK_MAIN=1
+export INSTALL_DASK_MAIN=0
function remove_libcudf_kernel_cache_dir {
EXITCODE=$?
@@ -82,8 +82,8 @@ if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then
gpuci_logger "gpuci_mamba_retry update dask"
gpuci_mamba_retry update dask
else
- gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.05.2 conda-forge::distributed>=2022.05.2 conda-forge::dask-core>=2022.05.2 --force-reinstall"
- gpuci_mamba_retry install conda-forge::dask>=2022.05.2 conda-forge::distributed>=2022.05.2 conda-forge::dask-core>=2022.05.2 --force-reinstall
+ gpuci_logger "gpuci_mamba_retry install conda-forge::dask==2022.7.1 conda-forge::distributed==2022.7.1 conda-forge::dask-core==2022.7.1 --force-reinstall"
+ gpuci_mamba_retry install conda-forge::dask==2022.7.1 conda-forge::distributed==2022.7.1 conda-forge::dask-core==2022.7.1 --force-reinstall
fi
# Install the master version of streamz
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 7b26519aa7d..8f215d1bb54 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -32,7 +32,7 @@ export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
unset GIT_DESCRIBE_TAG
# Dask & Distributed option to install main(nightly) or `conda-forge` packages.
-export INSTALL_DASK_MAIN=1
+export INSTALL_DASK_MAIN=0
# ucx-py version
export UCX_PY_VERSION='0.28.*'
@@ -92,8 +92,8 @@ function install_dask {
gpuci_mamba_retry update dask
conda list
else
- gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.05.2 conda-forge::distributed>=2022.05.2 conda-forge::dask-core>=2022.05.2 --force-reinstall"
- gpuci_mamba_retry install conda-forge::dask>=2022.05.2 conda-forge::distributed>=2022.05.2 conda-forge::dask-core>=2022.05.2 --force-reinstall
+ gpuci_logger "gpuci_mamba_retry install conda-forge::dask==2022.7.1 conda-forge::distributed==2022.7.1 conda-forge::dask-core==2022.7.1 --force-reinstall"
+ gpuci_mamba_retry install conda-forge::dask==2022.7.1 conda-forge::distributed==2022.7.1 conda-forge::dask-core==2022.7.1 --force-reinstall
fi
# Install the main version of streamz
gpuci_logger "Install the main version of streamz"
diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml
index 56531a7ae58..1e323182ffd 100644
--- a/conda/environments/cudf_dev_cuda11.5.yml
+++ b/conda/environments/cudf_dev_cuda11.5.yml
@@ -48,8 +48,8 @@ dependencies:
- pydocstyle=6.1.1
- typing_extensions
- pre-commit
- - dask>=2022.05.2
- - distributed>=2022.05.2
+ - dask==2022.7.1
+ - distributed==2022.7.1
- streamz
- arrow-cpp=8
- dlpack>=0.5,<0.6.0a0
diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml
index acf85426d09..118f084b436 100644
--- a/conda/recipes/custreamz/meta.yaml
+++ b/conda/recipes/custreamz/meta.yaml
@@ -29,8 +29,8 @@ requirements:
- python
- streamz
- cudf ={{ version }}
- - dask>=2022.05.2
- - distributed>=2022.05.2
+ - dask==2022.7.1
+ - distributed==2022.7.1
- python-confluent-kafka >=1.7.0,<1.8.0a0
- cudf_kafka ={{ version }}
diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml
index 3d7e7895578..c9a179301b0 100644
--- a/conda/recipes/dask-cudf/meta.yaml
+++ b/conda/recipes/dask-cudf/meta.yaml
@@ -24,14 +24,14 @@ requirements:
host:
- python
- cudf ={{ version }}
- - dask>=2022.05.2
- - distributed>=2022.05.2
+ - dask==2022.7.1
+ - distributed==2022.7.1
- cudatoolkit ={{ cuda_version }}
run:
- python
- cudf ={{ version }}
- - dask>=2022.05.2
- - distributed>=2022.05.2
+ - dask==2022.7.1
+ - distributed==2022.7.1
- {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
test: # [linux64]
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 8e5e2a53692..2f96b6ce9ae 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -28,6 +28,12 @@ project(
VERSION 22.10.00
LANGUAGES C CXX CUDA
)
+if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5)
+ message(
+ FATAL_ERROR
+ "libcudf requires CUDA Toolkit 11.5+ to compile (nvcc ${CMAKE_CUDA_COMPILER_VERSION} provided)"
+ )
+endif()
# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to
# have different values for the `Threads::Threads` target. Setting this flag ensures
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index 116c5442dc3..e0f9a711776 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -273,7 +273,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
endfunction()
-set(CUDF_VERSION_Arrow 8.0.0)
+if(NOT DEFINED CUDF_VERSION_Arrow)
+ set(CUDF_VERSION_Arrow
+ 8.0.0
+ CACHE STRING "The version of Arrow to find (or build)"
+ )
+endif()
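+# CUDF_VERSION_Arrow is a cache entry, so it can be overridden at configure time
+# without editing this file, e.g. `cmake -DCUDF_VERSION_Arrow=8.0.0 ...`.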
find_and_configure_arrow(
${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC}
diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
index e6b12948d85..5f43f5af0e4 100644
--- a/cpp/doxygen/Doxyfile
+++ b/cpp/doxygen/Doxyfile
@@ -1146,7 +1146,7 @@ HTML_FILE_EXTENSION = .html
# of the possible markers and block names see the documentation.
# This tag requires that the tag GENERATE_HTML is set to YES.
-HTML_HEADER =
+HTML_HEADER = header.html
# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
# generated HTML page. If the tag is left blank doxygen will generate a standard
@@ -1156,7 +1156,7 @@ HTML_HEADER =
# that doxygen normally uses.
# This tag requires that the tag GENERATE_HTML is set to YES.
-HTML_FOOTER = footer.html
+HTML_FOOTER =
# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
# sheet that is used by each HTML page. It can be used to fine-tune the look of
diff --git a/cpp/doxygen/footer.html b/cpp/doxygen/footer.html
deleted file mode 100644
index 9bd79eeb539..00000000000
--- a/cpp/doxygen/footer.html
+++ /dev/null
@@ -1,4 +0,0 @@
-
-
-
-
diff --git a/cpp/doxygen/header.html b/cpp/doxygen/header.html
new file mode 100644
index 00000000000..569b8450e3a
--- /dev/null
+++ b/cpp/doxygen/header.html
@@ -0,0 +1,61 @@
+
+
+
+
+
+
+
+
+$projectname: $title
+$title
+
+
+
+$treeview
+$search
+$mathjax
+
+$extrastylesheet
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ $projectname
+ $projectnumber
+
+ $projectbrief
+
+
+
+
+
+ $projectbrief
+
+
+
+
+
+ $searchbox
+
+
+
+
+
+
+
+
diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp
index ac3824dfc21..c5f6d339ae9 100644
--- a/cpp/include/cudf/column/column.hpp
+++ b/cpp/include/cudf/column/column.hpp
@@ -23,6 +23,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
+#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <memory>
@@ -75,6 +76,33 @@ class column {
*/
column(column&& other) noexcept;
+ /**
+ * @brief Construct a new column by taking ownership of the contents of a device_uvector.
+ *
+ * @param other The device_uvector whose contents will be moved into the new column.
+ * @param null_mask Optional, column's null value indicator bitmask. May
+ * be empty if `null_count` is 0 or `UNKNOWN_NULL_COUNT`.
+ * @param null_count Optional, the count of null elements. If unknown, specify
+ * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on
+ * the first invocation of `null_count()`.
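+ *
+ * A minimal usage sketch (assuming a valid `stream` and a numeric element type):
+ * @code
+ * rmm::device_uvector<int32_t> vec(100, stream);
+ * cudf::column col{std::move(vec)};  // takes ownership; no device copy is made
+ * @endcode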
+ */
+ template <typename T, CUDF_ENABLE_IF(cudf::is_numeric<T>() or cudf::is_chrono<T>())>
+ column(rmm::device_uvector<T>&& other,
+ rmm::device_buffer&& null_mask = {},
+ size_type null_count = UNKNOWN_NULL_COUNT)
+ : _type{cudf::data_type{cudf::type_to_id<T>()}},
+ _size{[&]() {
+ CUDF_EXPECTS(
+ other.size() <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
+ "The device_uvector size exceeds the maximum size_type.");
+ return static_cast<size_type>(other.size());
+ }()},
+ _data{other.release()},
+ _null_mask{std::move(null_mask)},
+ _null_count{null_count}
+ {
+ }
+
/**
* @brief Construct a new column from existing device memory.
*
diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp
index 7d8ac5c9325..1a4b8f02dd3 100644
--- a/cpp/include/cudf/detail/structs/utilities.hpp
+++ b/cpp/include/cudf/detail/structs/utilities.hpp
@@ -151,35 +151,6 @@ flattened_table flatten_nested_columns(
std::vector<null_order> const& null_precedence,
column_nullability nullability = column_nullability::MATCH_INCOMING);
-/**
- * @brief Unflatten columns flattened as by `flatten_nested_columns()`,
- * based on the provided `blueprint`.
- *
- * cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector
- * before the child/member columns.
- * E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to:
- * 1. Null Vector for STRUCT_1
- * 2. Null Vector for STRUCT_2
- * 3. Member STRUCT_2::A
- * 4. Member STRUCT_2::B
- * 5. Member STRUCT_1::C
- *
- * `unflatten_nested_columns()` reconstructs nested columns from flattened input that follows
- * the convention above.
- *
- * Note: This function requires a null-mask vector for each STRUCT column, including for nested
- * STRUCT members.
- *
- * @param flattened "Flattened" `table` of input columns, following the conventions in
- * `flatten_nested_columns()`.
- * @param blueprint The exemplar `table_view` with nested columns intact, whose structure defines
- * the nesting of the reconstructed output table.
- * @return std::unique_ptr<cudf::table> Unflattened table (with nested STRUCT columns) reconstructed
- * based on `blueprint`.
- */
-std::unique_ptr<cudf::table> unflatten_nested_columns(std::unique_ptr<cudf::table>&& flattened,
- table_view const& blueprint);
-
/**
* @brief Push down nulls from a parent mask into a child column, using bitwise AND.
*
diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh
index f985135064f..0521418d2d3 100644
--- a/cpp/include/cudf/detail/utilities/device_atomics.cuh
+++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh
@@ -116,18 +116,17 @@ struct genericAtomicOperationImpl<T, Op, 4> {
using T_int = unsigned int;
T old_value = *addr;
- T assumed{old_value};
+ T_int assumed;
+ T_int ret;
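+ // CAS loop: new_value is recomputed from the freshly observed old_value on every
+ // iteration. Comparing raw integer words (assumed != ret) instead of values of
+ // type T lets the loop terminate for floating-point NaNs, where NaN != NaN would
+ // otherwise spin forever.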
do {
- assumed = old_value;
- const T new_value = op(old_value, update_value);
+ T_int const new_value = type_reinterpret<T_int, T>(op(old_value, update_value));
- T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
- type_reinterpret<T_int, T>(assumed),
- type_reinterpret<T_int, T>(new_value));
+ assumed = type_reinterpret<T_int, T>(old_value);
+ ret = atomicCAS(reinterpret_cast<T_int*>(addr), assumed, new_value);
old_value = type_reinterpret<T, T_int>(ret);
- } while (assumed != old_value);
+ } while (assumed != ret);
return old_value;
}
@@ -142,18 +141,17 @@ struct genericAtomicOperationImpl<T, Op, 8> {
static_assert(sizeof(T) == sizeof(T_int));
T old_value = *addr;
- T assumed{old_value};
+ T_int assumed;
+ T_int ret;
do {
- assumed = old_value;
- const T new_value = op(old_value, update_value);
+ T_int const new_value = type_reinterpret<T_int, T>(op(old_value, update_value));
- T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
- type_reinterpret<T_int, T>(assumed),
- type_reinterpret<T_int, T>(new_value));
+ assumed = type_reinterpret<T_int, T>(old_value);
+ ret = atomicCAS(reinterpret_cast<T_int*>(addr), assumed, new_value);
old_value = type_reinterpret<T, T_int>(ret);
- } while (assumed != old_value);
+ } while (assumed != ret);
return old_value;
}
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 30acf80548b..7f3cb95e4b2 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -24,6 +24,7 @@
#include <rmm/mr/device/per_device_resource.hpp>
#include <memory>
+#include <optional>
#include <string>
#include <unordered_map>
#include <vector>
@@ -51,8 +52,8 @@ class orc_reader_options_builder;
class orc_reader_options {
source_info _source;
- // Names of column to read; empty is all
- std::vector<std::string> _columns;
+ // Names of column to read; `nullopt` is all
+ std::optional<std::vector<std::string>> _columns;
// List of individual stripes to read (ignored if empty)
std::vector<std::vector<size_type>> _stripes;
@@ -105,18 +106,18 @@ class orc_reader_options {
[[nodiscard]] source_info const& get_source() const { return _source; }
/**
- * @brief Returns names of the columns to read.
+ * @brief Returns names of the columns to read, if set.
*
- * @return Names of the columns to read
+ * @return Names of the columns to read; `nullopt` if the option is not set
*/
- [[nodiscard]] std::vector<std::string> const& get_columns() const { return _columns; }
+ [[nodiscard]] auto const& get_columns() const { return _columns; }
/**
* @brief Returns vector of vectors, stripes to read for each input source
*
* @return Vector of vectors, stripes to read for each input source
*/
- std::vector<std::vector<size_type>> const& get_stripes() const { return _stripes; }
+ [[nodiscard]] auto const& get_stripes() const { return _stripes; }
/**
* @brief Returns number of rows to skip from the start.
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 10368f84824..19156e01c1e 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -51,7 +51,7 @@ class parquet_reader_options_builder;
class parquet_reader_options {
source_info _source;
- // Path in schema of column to read; empty is all
+ // Path in schema of column to read; `nullopt` is all
std::optional<std::vector<std::string>> _columns;
// List of individual row groups to read (ignored if empty)
@@ -152,17 +152,14 @@ class parquet_reader_options {
*
* @return Names of column to be read; `nullopt` if the option is not set
*/
- [[nodiscard]] std::optional<std::vector<std::string>> const& get_columns() const
- {
- return _columns;
- }
+ [[nodiscard]] auto const& get_columns() const { return _columns; }
/**
* @brief Returns list of individual row groups to be read.
*
* @return List of individual row groups to be read
*/
- std::vector<std::vector<size_type>> const& get_row_groups() const { return _row_groups; }
+ [[nodiscard]] auto const& get_row_groups() const { return _row_groups; }
/**
* @brief Returns timestamp type used to cast timestamp columns.
diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 6504e790677..c31176ab51c 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -23,11 +23,11 @@
#include <cudf/types.hpp>
-#include <thrust/optional.h>
-
#include <map>
#include <memory>
+#include <optional>
#include <string>
+#include <unordered_map>
#include <vector>
// Forward declarations
@@ -383,12 +383,12 @@ class table_input_metadata;
class column_in_metadata {
friend table_input_metadata;
std::string _name = "";
- thrust::optional<bool> _nullable;
+ std::optional<bool> _nullable;
bool _list_column_is_map = false;
bool _use_int96_timestamp = false;
bool _output_as_binary = false;
- thrust::optional<uint8_t> _decimal_precision;
- thrust::optional<int32_t> _parquet_field_id;
+ std::optional<uint8_t> _decimal_precision;
+ std::optional<int32_t> _parquet_field_id;
std::vector<column_in_metadata> children;
public:
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index af66eb32618..573d0c81380 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -373,6 +373,19 @@ constexpr inline bool is_floating_point(data_type type)
return cudf::type_dispatcher(type, is_floating_point_impl{});
}
+/**
+ * @brief Indicates whether `T` is a std::byte type.
+ *
+ * @tparam T The type to verify
+ * @return true `type` is std::byte
+ * @return false `type` is not std::byte
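+ *
+ * For example, `is_byte<std::byte>()` is true while `is_byte<uint8_t>()` is false,
+ * even though both types are one byte wide.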
+ */
+template <typename T>
+constexpr inline bool is_byte()
+{
+ return std::is_same_v<std::remove_cv_t<T>, std::byte>;
+}
+
/**
* @brief Indicates whether `T` is a Boolean type.
*
@@ -561,7 +574,8 @@ constexpr inline bool is_chrono(data_type type)
template <typename T>
constexpr bool is_rep_layout_compatible()
{
- return cudf::is_numeric<T>() or cudf::is_chrono<T>() or cudf::is_boolean<T>();
+ return cudf::is_numeric<T>() or cudf::is_chrono<T>() or cudf::is_boolean<T>() or
+ cudf::is_byte<T>();
}
/**
diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh
index 9c981c6cdaa..b9157c76492 100755
--- a/cpp/scripts/run-cmake-format.sh
+++ b/cpp/scripts/run-cmake-format.sh
@@ -1,5 +1,7 @@
#!/bin/bash
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+
# This script is a wrapper for cmakelang that may be used with pre-commit. The
# wrapping is necessary because RAPIDS libraries split configuration for
# cmakelang linters between a local config file and a second config file that's
@@ -69,5 +71,14 @@ fi
if [[ $1 == "cmake-format" ]]; then
cmake-format -i --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2}
elif [[ $1 == "cmake-lint" ]]; then
- cmake-lint --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2}
+ # Since the pre-commit hook is verbose, we have to be careful to only
+ # present cmake-lint's output (which is quite verbose) if we actually
+ # observe a failure.
+ OUTPUT=$(cmake-lint --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2})
+ status=$?
+
+ if ! [ ${status} -eq 0 ]; then
+ echo "${OUTPUT}"
+ fi
+ exit ${status}
fi
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
index 82765c60c1e..df3dfca5fa9 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.cpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -18,6 +18,7 @@
#include <algorithm>
#include <numeric>
+#include <optional>
namespace cudf::io::orc::detail {
@@ -249,17 +250,17 @@ std::vector<metadata::stripe_source_mapping> aggregate_orc_metadata::select_stripes(
}
column_hierarchy aggregate_orc_metadata::select_columns(
- std::vector<std::string> const& column_paths)
+ std::optional<std::vector<std::string>> const& column_paths)
{
auto const& pfm = per_file_metadata[0];
column_hierarchy::nesting_map selected_columns;
- if (column_paths.empty()) {
+ if (not column_paths.has_value()) {
for (auto const& col_id : pfm.ff.types[0].subtypes) {
add_column_to_mapping(selected_columns, pfm, col_id);
}
} else {
- for (const auto& path : column_paths) {
+ for (const auto& path : column_paths.value()) {
bool name_found = false;
for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
if (pfm.column_path(col_id) == path) {
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp
index 9d2380c0097..3ce1a922f31 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.hpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -17,6 +17,7 @@
#include "orc.hpp"
#include <memory>
+#include <optional>
#include <vector>
namespace cudf::io::orc::detail {
@@ -126,10 +127,11 @@ class aggregate_orc_metadata {
* Paths are in format "grandparent_col.parent_col.child_col", where the root ORC column is
* omitted to match the cuDF table hierarchy.
*
- * @param column_paths List of full column names (i.e. paths) to select from the ORC file
+ * @param column_paths List of full column names (i.e. paths) to select from the ORC file;
+ * `nullopt` if the user did not select any columns to read
* @return Columns hierarchy - lists of children columns and sorted columns in each nesting level
*/
- column_hierarchy select_columns(std::vector<std::string> const& column_paths);
+ column_hierarchy select_columns(std::optional<std::vector<std::string>> const& column_paths);
};
} // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index ff278f63366..74565b2f244 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -18,10 +18,9 @@
#include "parquet.hpp"
-#include <thrust/optional.h>
-
#include <algorithm>
#include <cstddef>
+#include <optional>
#include <string>
#include <vector>
@@ -264,10 +263,10 @@ class ParquetFieldInt32 {
*/
class ParquetFieldOptionalInt32 {
int field_val;
- thrust::optional<int32_t>& val;
+ std::optional<int32_t>& val;
public:
- ParquetFieldOptionalInt32(int f, thrust::optional<int32_t>& v) : field_val(f), val(v) {}
+ ParquetFieldOptionalInt32(int f, std::optional<int32_t>& v) : field_val(f), val(v) {}
inline bool operator()(CompactProtocolReader* cpr, int field_type)
{
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index b03ba23737e..a03fdf27953 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -18,9 +18,8 @@
#include "parquet_common.hpp"
-#include <thrust/optional.h>
-
#include <cstdint>
+#include <optional>
#include <string>
#include <vector>
@@ -147,7 +146,7 @@ struct SchemaElement {
int32_t num_children = 0;
int32_t decimal_scale = 0;
int32_t decimal_precision = 0;
- thrust::optional<int32_t> field_id = thrust::nullopt;
+ std::optional<int32_t> field_id = std::nullopt;
bool output_as_byte_array = false;
// The following fields are filled in later during schema initialization
diff --git a/cpp/src/io/statistics/byte_array_view.cuh b/cpp/src/io/statistics/byte_array_view.cuh
index 315e753a732..c1958780321 100644
--- a/cpp/src/io/statistics/byte_array_view.cuh
+++ b/cpp/src/io/statistics/byte_array_view.cuh
@@ -28,7 +28,7 @@ namespace cudf::io::statistics {
*/
class byte_array_view {
public:
- using element_type = uint8_t const; ///< The type of the elements in the byte array
+ using element_type = std::byte const; ///< The type of the elements in the byte array
constexpr byte_array_view() noexcept {}
/**
diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh
index ab6674e4328..f2611f7cc26 100644
--- a/cpp/src/io/statistics/statistics.cuh
+++ b/cpp/src/io/statistics/statistics.cuh
@@ -85,7 +85,8 @@ struct t_array_stats {
__host__ __device__ __forceinline__ operator ReturnType() { return ReturnType(ptr, length); }
};
using string_stats = t_array_stats<string_view>;
-using byte_array_stats = t_array_stats<statistics::byte_array_view>;
+using byte_array_view = statistics::byte_array_view;
+using byte_array_stats = t_array_stats<byte_array_view>;
union statistics_val {
string_stats str_val; //!< string columns
@@ -129,10 +130,10 @@ template <typename T>
auto const* d_offsets = col.child(lists_column_view::offsets_column_index).data<offset_type>();
auto const* d_data = col.child(lists_column_view::child_column_index).data<typename T::element_type>();
- offset_type offset = d_offsets[index];
+ auto const offset = d_offsets[index];
return T(d_data + offset, d_offsets[index + 1] - offset);
}
diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 50d641c9a74..bc6bdd9dc7b 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -60,7 +60,7 @@ static reclass cclass_S(NCCLASS_S); // \S
static reclass cclass_D(NCCLASS_D); // \D
// Tables for analyzing quantifiers
-const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}};
+const std::array<int, 5> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL}};
const std::array<char, 5> quantifiers{{'*', '?', '+', '{', '|'}};
// Valid regex characters that can be escaped and used as literals
const std::array escapable_chars{
@@ -459,16 +459,42 @@ class regex_parser {
}
// The quantifiers require at least one "real" previous item.
- // We are throwing an error in these two if-checks for invalid quantifiers.
+ // We are throwing errors for invalid quantifiers.
// Another option is to just return CHAR silently here which effectively
// treats the chr character as a literal instead as a quantifier.
// This could lead to confusion where sometimes unescaped quantifier characters
// are treated as regex expressions and sometimes they are not.
if (_items.empty()) { CUDF_FAIL("invalid regex pattern: nothing to repeat at position 0"); }
+ // Check that the previous item can be used with quantifiers.
+ // If the previous item is a capture group, we need to check items inside the
+ // capture group can be used with quantifiers too.
+ // (Note that capture groups can be nested).
+ auto previous_type = _items.back().type;
+ if (previous_type == RBRA) { // previous item is a capture group
+ // look for matching LBRA
+ auto nested_count = 1;
+ auto lbra_itr =
+ std::find_if(_items.rbegin(), _items.rend(), [nested_count](auto const& item) mutable {
+ auto const is_closing = (item.type == RBRA);
+ auto const is_opening = (item.type == LBRA || item.type == LBRA_NC);
+ nested_count += is_closing - is_opening;
+ return is_opening && (nested_count == 0);
+ });
+ // search for the first valid item within the LBRA-RBRA range
+ auto first_valid = std::find_first_of(
+ _items.rbegin() + 1,
+ lbra_itr,
+ valid_preceding_inst_types.begin(),
+ valid_preceding_inst_types.end(),
+ [](auto const item, auto const valid_type) { return item.type == valid_type; });
+ // set previous_type to be checked in next if-statement
+ previous_type = (first_valid == lbra_itr) ? (--lbra_itr)->type : first_valid->type;
+ }
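+ // For example, the '+' in "(ab)+" is valid because the group contains CHAR items,
+ // while "(^)+" is rejected: '^' is not a repeatable item even inside a group.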
+
if (std::find(valid_preceding_inst_types.begin(),
valid_preceding_inst_types.end(),
- _items.back().type) == valid_preceding_inst_types.end()) {
+ previous_type) == valid_preceding_inst_types.end()) {
CUDF_FAIL("invalid regex pattern: nothing to repeat at position " +
std::to_string(_expr_ptr - _pattern_begin - 1));
}
diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp
index 1d5ebfaa7fc..bf4216b6983 100644
--- a/cpp/src/structs/utilities.cpp
+++ b/cpp/src/structs/utilities.cpp
@@ -209,98 +209,6 @@ flattened_table flatten_nested_columns(table_view const& input,
return table_flattener{input, column_order, null_precedence, nullability}();
}
-namespace {
-using vector_of_columns = std::vector<std::unique_ptr<cudf::column>>;
-using column_index_t = typename vector_of_columns::size_type;
-
-// Forward declaration, to enable recursion via `unflattener`.
-std::unique_ptr<cudf::column> unflatten_struct(vector_of_columns& flattened,
- column_index_t& current_index,
- cudf::column_view const& blueprint);
-
-/**
- * @brief Helper functor to reconstruct STRUCT columns from its flattened member columns.
- *
- */
-class unflattener {
- public:
- unflattener(vector_of_columns& flattened_, column_index_t& current_index_)
- : flattened{flattened_}, current_index{current_index_}
- {
- }
-
- auto operator()(column_view const& blueprint)
- {
- return is_struct(blueprint) ? unflatten_struct(flattened, current_index, blueprint)
- : std::move(flattened[current_index++]);
- }
-
- private:
- vector_of_columns& flattened;
- column_index_t& current_index;
-
-}; // class unflattener;
-
-std::unique_ptr<cudf::column> unflatten_struct(vector_of_columns& flattened,
- column_index_t& current_index,
- cudf::column_view const& blueprint)
-{
- // "Consume" columns from `flattened`, starting at `current_index`,
- // based on the provided `blueprint` struct col. Recurse for struct children.
- CUDF_EXPECTS(blueprint.type().id() == type_id::STRUCT,
- "Expected blueprint column to be a STRUCT column.");
-
- CUDF_EXPECTS(current_index < flattened.size(), "STRUCT column can't have 0 children.");
-
- auto const num_rows = flattened[current_index]->size();
-
- // cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector
- // before the child/member columns.
- // E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to:
- // 1. Null Vector for STRUCT_1
- // 2. Null Vector for STRUCT_2
- // 3. Member STRUCT_2::A
- // 4. Member STRUCT_2::B
- // 5. Member STRUCT_1::C
- //
- // Extract null-vector *before* child columns are constructed.
- auto struct_null_column_contents = flattened[current_index++]->release();
- auto unflattening_iter =
- thrust::make_transform_iterator(blueprint.child_begin(), unflattener{flattened, current_index});
-
- return cudf::make_structs_column(
- num_rows,
- vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_children()},
- UNKNOWN_NULL_COUNT, // Do count?
- std::move(*struct_null_column_contents.null_mask));
-}
-} // namespace
-
-std::unique_ptr<cudf::table> unflatten_nested_columns(std::unique_ptr<cudf::table>&& flattened,
- table_view const& blueprint)
-{
- // Bail, if LISTs are present.
- auto const has_lists = std::any_of(blueprint.begin(), blueprint.end(), is_or_has_nested_lists);
- CUDF_EXPECTS(not has_lists, "Unflattening LIST columns is not supported.");
-
- // If there are no STRUCTs, unflattening is a NOOP.
- auto const has_structs = std::any_of(blueprint.begin(), blueprint.end(), is_struct);
- if (not has_structs) {
- return std::move(flattened); // Unchanged.
- }
-
- // There be struct columns.
- // Note: Requires null vectors for all struct input columns.
- auto flattened_columns = flattened->release();
- auto current_idx = column_index_t{0};
-
- auto unflattening_iter =
- thrust::make_transform_iterator(blueprint.begin(), unflattener{flattened_columns, current_idx});
-
- return std::make_unique<cudf::table>(
- vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_columns()});
-}
-
// Helper function to superimpose validity of parent struct
// over the specified member (child) column.
void superimpose_parent_nulls(bitmask_type const* parent_null_mask,
diff --git a/cpp/tests/column/column_test.cu b/cpp/tests/column/column_test.cu
index 6fcabbcf823..801cee285b6 100644
--- a/cpp/tests/column/column_test.cu
+++ b/cpp/tests/column/column_test.cu
@@ -345,6 +345,42 @@ TYPED_TEST(TypedColumnTest, MoveConstructorWithMask)
EXPECT_EQ(original_mask, moved_to_view.null_mask());
}
+TYPED_TEST(TypedColumnTest, DeviceUvectorConstructorNoMask)
+{
+ rmm::device_uvector<TypeParam> original{static_cast<std::size_t>(this->num_elements()),
+ cudf::default_stream_value};
+ thrust::copy(thrust::device,
+ static_cast<TypeParam*>(this->data.data()),
+ static_cast<TypeParam*>(this->data.data()) + this->num_elements(),
+ original.begin());
+ auto original_data = original.data();
+ cudf::column moved_to{std::move(original)};
+ verify_column_views(moved_to);
+
+ // Verify move
+ cudf::column_view moved_to_view = moved_to;
+ EXPECT_EQ(original_data, moved_to_view.head());
+}
+
+TYPED_TEST(TypedColumnTest, DeviceUvectorConstructorWithMask)
+{
+ rmm::device_uvector<TypeParam> original{static_cast<std::size_t>(this->num_elements()),
+ cudf::default_stream_value};
+ thrust::copy(thrust::device,
+ static_cast<TypeParam*>(this->data.data()),
+ static_cast<TypeParam*>(this->data.data()) + this->num_elements(),
+ original.begin());
+ auto original_data = original.data();
+ auto original_mask = this->all_valid_mask.data();
+ cudf::column moved_to{std::move(original), std::move(this->all_valid_mask)};
+ verify_column_views(moved_to);
+
+ // Verify move
+ cudf::column_view moved_to_view = moved_to;
+ EXPECT_EQ(original_data, moved_to_view.head());
+ EXPECT_EQ(original_mask, moved_to_view.null_mask());
+}
+
TYPED_TEST(TypedColumnTest, ConstructWithChildren)
{
std::vector<std::unique_ptr<cudf::column>> children;
diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp
index 0b7e0d13c24..1d2c8c489f3 100644
--- a/cpp/tests/groupby/max_tests.cpp
+++ b/cpp/tests/groupby/max_tests.cpp
@@ -459,5 +459,30 @@ TYPED_TEST(groupby_max_floating_point_test, values_with_infinity)
keys, vals, expected_keys, expected_vals, std::move(agg), force_use_sort_impl::YES);
}
+TYPED_TEST(groupby_max_floating_point_test, values_with_nan)
+{
+ using T = TypeParam;
+ using int32s_col = fixed_width_column_wrapper<int32_t>;
+ using floats_col = fixed_width_column_wrapper<T>;
+
+ auto constexpr nan = std::numeric_limits<T>::quiet_NaN();
+
+ auto const keys = int32s_col{1, 1};
+ auto const vals = floats_col{nan, nan};
+
+ std::vector<groupby::aggregation_request> requests;
+ requests.emplace_back(groupby::aggregation_request());
+ requests[0].values = vals;
+ requests[0].aggregations.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
+
+ // Without properly handling NaN, this will hang forever in hash-based aggregate (which is the
+ // default back-end for min/max in groupby context).
+ // This test is just to verify that the aggregate operation does not hang.
+ auto gb_obj = groupby::groupby(table_view({keys}));
+ auto const result = gb_obj.aggregate(requests);
+
+ EXPECT_EQ(result.first->num_rows(), 1);
+}
+
} // namespace test
} // namespace cudf
diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp
index a12ec7c8739..9606c8c55ee 100644
--- a/cpp/tests/groupby/min_tests.cpp
+++ b/cpp/tests/groupby/min_tests.cpp
@@ -458,5 +458,30 @@ TYPED_TEST(groupby_min_floating_point_test, values_with_infinity)
keys, vals, expected_keys, expected_vals, std::move(agg), force_use_sort_impl::YES);
}
+TYPED_TEST(groupby_min_floating_point_test, values_with_nan)
+{
+ using T = TypeParam;
+ using int32s_col = fixed_width_column_wrapper<int32_t>;
+ using floats_col = fixed_width_column_wrapper<T>;
+
+ auto constexpr nan = std::numeric_limits<T>::quiet_NaN();
+
+ auto const keys = int32s_col{1, 1};
+ auto const vals = floats_col{nan, nan};
+
+ std::vector<groupby::aggregation_request> requests;
+ requests.emplace_back(groupby::aggregation_request());
+ requests[0].values = vals;
+ requests[0].aggregations.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
+
+ // Without properly handling NaN, this will hang forever in hash-based aggregate (which is the
+ // default back-end for min/max in groupby context).
+ // This test is just to verify that the aggregate operation does not hang.
+ auto gb_obj = groupby::groupby(table_view({keys}));
+ auto const result = gb_obj.aggregate(requests);
+
+ EXPECT_EQ(result.first->num_rows(), 1);
+}
+
} // namespace test
} // namespace cudf
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index b6a6270ca8b..c8aefece94f 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -915,4 +915,13 @@ TEST_F(JsonReaderTest, BadDtypeParams)
EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error);
}
+TEST_F(JsonReaderTest, ExperimentalParam)
+{
+ cudf_io::json_reader_options const options =
+ cudf_io::json_reader_options::builder(cudf_io::source_info{nullptr, 0}).experimental(true);
+
+ // should throw for now
+ EXPECT_THROW(cudf_io::read_json(options), cudf::logic_error);
+}
+
CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index b3df2c8a8dd..76ffc92e243 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1514,4 +1514,23 @@ TEST_F(OrcWriterTest, DecimalOptionsNested)
result.tbl->view().column(0).child(1).child(0).child(1));
}
+TEST_F(OrcReaderTest, EmptyColumnsParam)
+{
+ srand(31337);
+ auto const expected = create_random_fixed_table<int>(2, 4, false);
+
+ std::vector<char> out_buffer;
+ cudf_io::orc_writer_options args =
+ cudf_io::orc_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected);
+ cudf_io::write_orc(args);
+
+ cudf_io::orc_reader_options read_opts =
+ cudf_io::orc_reader_options::builder(cudf_io::source_info{out_buffer.data(), out_buffer.size()})
+ .columns({});
+ auto const result = cudf_io::read_orc(read_opts);
+
+ EXPECT_EQ(result.tbl->num_columns(), 0);
+ EXPECT_EQ(result.tbl->num_rows(), 0);
+}
+
CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp
index 70f28aa139d..d725f3d5dd0 100644
--- a/cpp/tests/strings/contains_tests.cpp
+++ b/cpp/tests/strings/contains_tests.cpp
@@ -424,6 +424,25 @@ TEST_F(StringsContainsTests, FixedQuantifier)
}
}
+TEST_F(StringsContainsTests, QuantifierErrors)
+{
+ auto input = cudf::test::strings_column_wrapper({"a", "aa", "aaa", "aaaa", "aaaaa", "aaaaaa"});
+ auto sv = cudf::strings_column_view(input);
+
+ EXPECT_THROW(cudf::strings::contains_re(sv, "^+"), cudf::logic_error);
+ EXPECT_THROW(cudf::strings::count_re(sv, "$+"), cudf::logic_error);
+ EXPECT_THROW(cudf::strings::count_re(sv, "(^)+"), cudf::logic_error);
+ EXPECT_THROW(cudf::strings::contains_re(sv, "($)+"), cudf::logic_error);
+ EXPECT_THROW(cudf::strings::count_re(sv, "\\A+"), cudf::logic_error);
+ EXPECT_THROW(cudf::strings::count_re(sv, "\\Z+"), cudf::logic_error);
+ EXPECT_THROW(cudf::strings::contains_re(sv, "(\\A)+"), cudf::logic_error);
+ EXPECT_THROW(cudf::strings::contains_re(sv, "(\\Z)+"), cudf::logic_error);
+
+ EXPECT_THROW(cudf::strings::contains_re(sv, "(^($))+"), cudf::logic_error);
+ EXPECT_NO_THROW(cudf::strings::contains_re(sv, "(^a($))+"));
+ EXPECT_NO_THROW(cudf::strings::count_re(sv, "(^(a$))+"));
+}
+
TEST_F(StringsContainsTests, OverlappedClasses)
{
auto input = cudf::test::strings_column_wrapper({"abcdefg", "defghí", "", "éééééé", "ghijkl"});
diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp
index b26ea87c5b8..d58568cd1b5 100644
--- a/cpp/tests/structs/utilities_tests.cpp
+++ b/cpp/tests/structs/utilities_tests.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -30,26 +30,13 @@
namespace cudf::test {
-/**
- * @brief Round-trip input table through flatten/unflatten,
- * verify that the table remains equivalent.
- */
-void flatten_unflatten_compare(table_view const& input_table)
-{
- using namespace cudf::structs::detail;
-
- auto flattened = flatten_nested_columns(input_table, {}, {}, column_nullability::FORCE);
- auto unflattened =
- unflatten_nested_columns(std::make_unique<cudf::table>(flattened), input_table);
-
- CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, unflattened->view());
-}
-
using namespace cudf;
using namespace iterators;
+using namespace cudf::structs::detail;
using strings = strings_column_wrapper;
using dictionary = dictionary_column_wrapper;
using structs = structs_column_wrapper;
+using bools = fixed_width_column_wrapper<bool>;
template <typename T>
using nums = fixed_width_column_wrapper<T>;
@@ -66,7 +53,7 @@ struct TypedStructUtilitiesTest : StructUtilitiesTest {
TYPED_TEST_SUITE(TypedStructUtilitiesTest, FixedWidthTypes);
-TYPED_TEST(TypedStructUtilitiesTest, ListsAtTopLevelUnsupported)
+TYPED_TEST(TypedStructUtilitiesTest, ListsAtTopLevel)
{
using T = TypeParam;
using lists = lists_column_wrapper<T>;
@@ -75,8 +62,10 @@ TYPED_TEST(TypedStructUtilitiesTest, ListsAtTopLevelUnsupported)
auto lists_col = lists{{0, 1}, {22, 33}, {44, 55, 66}};
auto nums_col = nums<T>{{0, 1, 2}, null_at(6)};
- EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{lists_col, nums_col}}),
- cudf::logic_error);
+ auto table = cudf::table_view{{lists_col, nums_col}};
+
+ CUDF_TEST_EXPECT_TABLES_EQUAL(table,
+ flatten_nested_columns(table, {}, {}, column_nullability::FORCE));
}
TYPED_TEST(TypedStructUtilitiesTest, NestedListsUnsupported)
@@ -88,10 +77,10 @@ TYPED_TEST(TypedStructUtilitiesTest, NestedListsUnsupported)
auto lists_member = lists{{0, 1}, {22, 33}, {44, 55, 66}};
auto nums_member = nums<T>{{0, 1, 2}, null_at(6)};
auto structs_col = structs{{nums_member, lists_member}};
+ auto nums_col = nums<T>{{0, 1, 2}, null_at(6)};
- auto nums_col = nums<T>{{0, 1, 2}, null_at(6)};
-
- EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}}),
+ EXPECT_THROW(flatten_nested_columns(
+ cudf::table_view{{nums_col, structs_col}}, {}, {}, column_nullability::FORCE),
cudf::logic_error);
}
@@ -104,7 +93,10 @@ TYPED_TEST(TypedStructUtilitiesTest, NoStructs)
auto strings_col = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)};
auto nuther_nums_col = nums<T>{{0, 1, 2, 3, 4, 5, 6}, null_at(6)};
- flatten_unflatten_compare(cudf::table_view{{nums_col, strings_col, nuther_nums_col}});
+ auto table = cudf::table_view{{nums_col, strings_col, nuther_nums_col}};
+
+ CUDF_TEST_EXPECT_TABLES_EQUAL(table,
+ flatten_nested_columns(table, {}, {}, column_nullability::FORCE));
}
TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStruct)
@@ -116,8 +108,19 @@ TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStruct)
auto strings_member = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)};
auto structs_col = structs{{nums_member, strings_member}};
auto nums_col = nums<T>{{0, 1, 2, 3, 4, 5, 6}, null_at(6)};
-
- flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}});
+ auto table = cudf::table_view{{nums_col, structs_col}};
+
+ auto expected_nums_col_1 = cudf::column(nums_col);
+ auto expected_structs_col = bools{{1, 1, 1, 1, 1, 1, 1}};
+ auto expected_nums_col_2 =
+ cudf::column(static_cast<cudf::structs_column_view>(structs_col).get_sliced_child(0));
+ auto expected_strings_col =
+ cudf::column(static_cast<cudf::structs_column_view>(structs_col).get_sliced_child(1));
+ auto expected = cudf::table_view{
+ {expected_nums_col_1, expected_structs_col, expected_nums_col_2, expected_strings_col}};
+
+ CUDF_TEST_EXPECT_TABLES_EQUAL(expected,
+ flatten_nested_columns(table, {}, {}, column_nullability::FORCE));
}
TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStructWithNulls)
@@ -129,8 +132,19 @@ TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStructWithNulls)
auto strings_member = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)};
auto structs_col = structs{{nums_member, strings_member}, null_at(2)};
auto nums_col = nums<T>{{0, 1, 2, 3, 4, 5, 6}, null_at(6)};
-
- flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}});
+ auto table = cudf::table_view{{nums_col, structs_col}};
+
+ auto expected_nums_col_1 = cudf::column(nums_col);
+ auto expected_structs_col = bools{{1, 1, 0, 1, 1, 1, 1}, null_at(2)};
+ auto expected_nums_col_2 =
+ cudf::column(static_cast<cudf::structs_column_view>(structs_col).get_sliced_child(0));
+ auto expected_strings_col =
+ cudf::column(static_cast<cudf::structs_column_view>(structs_col).get_sliced_child(1));
+ auto expected = cudf::table_view{
+ {expected_nums_col_1, expected_structs_col, expected_nums_col_2, expected_strings_col}};
+
+ CUDF_TEST_EXPECT_TABLES_EQUAL(expected,
+ flatten_nested_columns(table, {}, {}, column_nullability::FORCE));
}
TYPED_TEST(TypedStructUtilitiesTest, StructOfStruct)
@@ -147,8 +161,26 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStruct)
auto struct_1_nums_member = nums<T>{{0, 1, 22, 33, 44, 55, 66}, null_at(3)};
auto struct_of_structs_col = structs{{struct_1_nums_member, structs_1_structs_member}};
-
- flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}});
+ auto table = cudf::table_view{{nums_col, struct_of_structs_col}};
+
+ auto expected_nums_col_1 = cudf::column(nums_col);
+ auto expected_structs_col_1 = bools{{1, 1, 1, 1, 1, 1, 1}};
+ auto expected_nums_col_2 =
+ cudf::column(static_cast<cudf::structs_column_view>(struct_of_structs_col).get_sliced_child(0));
+ auto expected_structs_col_2 = bools{{1, 1, 1, 1, 1, 1, 1}};
+ auto expected_nums_col_3 = cudf::column(
+ static_cast<cudf::structs_column_view>(struct_of_structs_col).get_sliced_child(1).child(0));
+ auto expected_strings_col = cudf::column(
+ static_cast<cudf::structs_column_view>(struct_of_structs_col).get_sliced_child(1).child(1));
+ auto expected = cudf::table_view{{expected_nums_col_1,
+ expected_structs_col_1,
+ expected_nums_col_2,
+ expected_structs_col_2,
+ expected_nums_col_3,
+ expected_strings_col}};
+
+ CUDF_TEST_EXPECT_TABLES_EQUAL(expected,
+ flatten_nested_columns(table, {}, {}, column_nullability::FORCE));
}
TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtLeafLevel)
@@ -166,8 +198,26 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtLeafLevel)
auto struct_1_nums_member = nums<T>{{0, 1, 22, 33, 44, 55, 66}, null_at(3)};
auto struct_of_structs_col = structs{{struct_1_nums_member, structs_1_structs_member}};
-
- flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}});
+ auto table = cudf::table_view{{nums_col, struct_of_structs_col}};
+
+ auto expected_nums_col_1 = cudf::column(nums_col);
+ auto expected_structs_col_1 = bools{{1, 1, 1, 1, 1, 1, 1}};
+ auto expected_nums_col_2 =
+ cudf::column(static_cast<cudf::structs_column_view>(struct_of_structs_col).get_sliced_child(0));
+ auto expected_structs_col_2 = bools{{1, 1, 0, 1, 1, 1, 1}, null_at(2)};
+ auto expected_nums_col_3 = cudf::column(
+ static_cast<cudf::structs_column_view>(struct_of_structs_col).get_sliced_child(1).child(0));
+ auto expected_strings_col = cudf::column(
+ static_cast<cudf::structs_column_view>(struct_of_structs_col).get_sliced_child(1).child(1));
+ auto expected = cudf::table_view{{expected_nums_col_1,
+ expected_structs_col_1,
+ expected_nums_col_2,
+ expected_structs_col_2,
+ expected_nums_col_3,
+ expected_strings_col}};
+
+ CUDF_TEST_EXPECT_TABLES_EQUAL(expected,
+ flatten_nested_columns(table, {}, {}, column_nullability::FORCE));
}
TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtTopLevel)
@@ -185,8 +235,26 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtTopLevel)
auto struct_1_nums_member = nums<T>{{0, 1, 22, 33, 44, 55, 66}, null_at(3)};
auto struct_of_structs_col =
structs{{struct_1_nums_member, structs_1_structs_member}, null_at(4)};
-
- flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}});
+ auto table = cudf::table_view{{nums_col, struct_of_structs_col}};
+
+ auto expected_nums_col_1 = cudf::column(nums_col);
+ auto expected_structs_col_1 = bools{{1, 1, 1, 1, 0, 1, 1}, null_at(4)};
+ auto expected_nums_col_2 =
+ cudf::column(static_cast<cudf::structs_column_view>(struct_of_structs_col).get_sliced_child(0));
+ auto expected_structs_col_2 = bools{{1, 1, 1, 1, 0, 1, 1}, null_at(4)};
+ auto expected_nums_col_3 = cudf::column(
+ static_cast<cudf::structs_column_view>(struct_of_structs_col).get_sliced_child(1).child(0));
+ auto expected_strings_col = cudf::column(
+ static_cast<cudf::structs_column_view>(struct_of_structs_col).get_sliced_child(1).child(1));
+ auto expected = cudf::table_view{{expected_nums_col_1,
+ expected_structs_col_1,
+ expected_nums_col_2,
+ expected_structs_col_2,
+ expected_nums_col_3,
+ expected_strings_col}};
+
+ CUDF_TEST_EXPECT_TABLES_EQUAL(expected,
+ flatten_nested_columns(table, {}, {}, column_nullability::FORCE));
}
TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtAllLevels)
@@ -205,8 +273,26 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtAllLevels)
auto struct_1_nums_member = nums<T>{{0, 1, 22, 33, 44, 55, 66}, null_at(3)};
auto struct_of_structs_col =
structs{{struct_1_nums_member, structs_1_structs_member}, null_at(4)};
-
- flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}});
+ auto table = cudf::table_view{{nums_col, struct_of_structs_col}};
+
+ auto expected_nums_col_1 = cudf::column(nums_col);
+ auto expected_structs_col_1 = bools{{1, 1, 1, 1, 0, 1, 1}, null_at(4)};
+ auto expected_nums_col_2 =
+ cudf::column(static_cast<cudf::structs_column_view>(struct_of_structs_col).get_sliced_child(0));
+ auto expected_structs_col_2 = bools{{1, 1, 0, 1, 0, 1, 1}, {1, 1, 0, 1, 0, 1, 1}};
+ auto expected_nums_col_3 = cudf::column(
+ static_cast<cudf::structs_column_view>(struct_of_structs_col).get_sliced_child(1).child(0));
+ auto expected_strings_col = cudf::column(
+ static_cast<cudf::structs_column_view>(struct_of_structs_col).get_sliced_child(1).child(1));
+ auto expected = cudf::table_view{{expected_nums_col_1,
+ expected_structs_col_1,
+ expected_nums_col_2,
+ expected_structs_col_2,
+ expected_nums_col_3,
+ expected_strings_col}};
+
+ CUDF_TEST_EXPECT_TABLES_EQUAL(expected,
+ flatten_nested_columns(table, {}, {}, column_nullability::FORCE));
}
TYPED_TEST(TypedStructUtilitiesTest, ListsAreUnsupported)
@@ -222,7 +308,8 @@ TYPED_TEST(TypedStructUtilitiesTest, ListsAreUnsupported)
auto structs_with_lists_col = structs{lists_member, ints_member};
- EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{structs_with_lists_col}}),
+ EXPECT_THROW(flatten_nested_columns(
+ cudf::table_view{{structs_with_lists_col}}, {}, {}, column_nullability::FORCE),
cudf::logic_error);
}
diff --git a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java
index dd771cab7ea..1ae1b91b962 100644
--- a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java
@@ -18,6 +18,10 @@
package ai.rapids.cudf;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
/**
* Options for reading a parquet file
*/
@@ -26,24 +30,32 @@ public class ParquetOptions extends ColumnFilterOptions {
public static ParquetOptions DEFAULT = new ParquetOptions(new Builder());
private final DType unit;
-
-
+ private final boolean[] readBinaryAsString;
private ParquetOptions(Builder builder) {
super(builder);
unit = builder.unit;
+ readBinaryAsString = new boolean[builder.binaryAsStringColumns.size()];
+ for (int i = 0 ; i < builder.binaryAsStringColumns.size() ; i++) {
+ readBinaryAsString[i] = builder.binaryAsStringColumns.get(i);
+ }
}
DType timeUnit() {
return unit;
}
- public static Builder builder() {
+ boolean[] getReadBinaryAsString() {
+ return readBinaryAsString;
+ }
+
+ public static ParquetOptions.Builder builder() {
return new Builder();
}
public static class Builder extends ColumnFilterOptions.Builder<Builder> {
private DType unit = DType.EMPTY;
+ final List<Boolean> binaryAsStringColumns = new ArrayList<>();
/**
* Specify the time unit to use when returning timestamps.
@@ -56,6 +68,43 @@ public Builder withTimeUnit(DType unit) {
return this;
}
+ /**
+ * Include one or more specific columns. Any column not included will not be read.
+ * @param names the name of the column, or more than one if you want.
+ */
+ @Override
+ public Builder includeColumn(String... names) {
+ super.includeColumn(names);
+ for (int i = 0 ; i < names.length ; i++) {
+ binaryAsStringColumns.add(true);
+ }
+ return this;
+ }
+
+ /**
+ * Include this column.
+ * @param name the name of the column
+ * @param isBinary whether this column is to be read in as binary
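+ * <p>For example, {@code builder().includeColumn("payload", true)} requests that the
+ * (hypothetical) column "payload" be read as binary rather than as strings.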
+ */
+ public Builder includeColumn(String name, boolean isBinary) {
+ includeColumnNames.add(name);
+ binaryAsStringColumns.add(!isBinary);
+ return this;
+ }
+
+ /**
+ * Include one or more specific columns. Any column not included will not be read.
+ * @param names the name of the column, or more than one if you want.
+ */
+ @Override
+ public Builder includeColumn(Collection<String> names) {
+ super.includeColumn(names);
+ for (int i = 0 ; i < names.size() ; i++) {
+ binaryAsStringColumns.add(true);
+ }
+ return this;
+ }
+
public ParquetOptions build() {
return new ParquetOptions(this);
}
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index c8f842fcc63..e5194b8b7eb 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -239,16 +239,20 @@ private static native long[] readJSON(String[] columnNames,
String filePath, long address, long length,
boolean dayFirst, boolean lines) throws CudfException;
+ private static native long readAndInferJSON(long address, long length,
+ boolean dayFirst, boolean lines) throws CudfException;
+
/**
* Read in Parquet formatted data.
* @param filterColumnNames name of the columns to read, or an empty array if we want to read
* all of them
+ * @param binaryToString whether to convert each column to String if it is binary
* @param filePath the path of the file to read, or null if no path should be read.
* @param address the address of the buffer to read from or 0 if we should not.
* @param length the length of the buffer to read from.
* @param timeUnit return type of TimeStamp in units
*/
- private static native long[] readParquet(String[] filterColumnNames, String filePath,
+ private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath,
long address, long length, int timeUnit) throws CudfException;
/**
@@ -918,6 +922,26 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon
}
}
+ /**
+ * Read JSON formatted data and infer the column names and schema.
+ * @param opts various JSON parsing options.
+ * @param buffer raw UTF8 formatted bytes.
+ * @param offset the starting offset into buffer.
+ * @param len the number of bytes to parse.
+ * @return the data parsed as a table on the GPU and the metadata for the table returned.
+ */
+ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
+ long offset, long len) {
+ if (len <= 0) {
+ len = buffer.length - offset;
+ }
+ assert len > 0;
+ assert len <= buffer.length - offset;
+ assert offset >= 0 && offset < buffer.length;
+ return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
+ opts.isDayFirst(), opts.isLines()));
+ }
+
/**
* Read JSON formatted data.
* @param schema the schema of the data. You may use Schema.INFERRED to infer the schema.
@@ -956,7 +980,7 @@ public static Table readParquet(File path) {
* @return the file parsed as a table on the GPU.
*/
public static Table readParquet(ParquetOptions opts, File path) {
- return new Table(readParquet(opts.getIncludeColumnNames(),
+ return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(),
path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()));
}
@@ -1016,7 +1040,7 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer,
assert len > 0;
assert len <= buffer.getLength() - offset;
assert offset >= 0 && offset < buffer.length;
- return new Table(readParquet(opts.getIncludeColumnNames(),
+ return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(),
null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()));
}
diff --git a/java/src/main/java/ai/rapids/cudf/TableWithMeta.java b/java/src/main/java/ai/rapids/cudf/TableWithMeta.java
new file mode 100644
index 00000000000..9baa127d39d
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/TableWithMeta.java
@@ -0,0 +1,67 @@
+/*
+ *
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+package ai.rapids.cudf;
+
+/**
+ * A table along with some metadata about the table. This is typically returned when
+ * reading data from an input file where the metadata can be important.
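+ * <p>Illustrative usage, assuming JSON options {@code opts} and a host buffer
+ * {@code buf} (names are placeholders):
+ * <pre>{@code
+ * try (TableWithMeta twm = Table.readJSON(opts, buf, 0, 0)) {
+ *   String[] names = twm.getColumnNames();
+ *   Table table = twm.releaseTable();
+ * }
+ * }</pre>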
+ */
+public class TableWithMeta implements AutoCloseable {
+ private long handle;
+
+ TableWithMeta(long handle) {
+ this.handle = handle;
+ }
+
+ /**
+ * Get the table out of this metadata. Note that this can only be called once. Later calls
+ * will return null.
+ */
+ public Table releaseTable() {
+ long[] ptr = releaseTable(handle);
+ if (ptr == null) {
+ return null;
+ } else {
+ return new Table(ptr);
+ }
+ }
+
+ /**
+ * Get the names of the top level columns. In the future new APIs can be added to get
+ * names of child columns.
+ */
+ public String[] getColumnNames() {
+ return getColumnNames(handle);
+ }
+
+ @Override
+ public void close() throws Exception {
+ if (handle != 0) {
+ close(handle);
+ handle = 0;
+ }
+ }
+
+ private static native void close(long handle);
+
+ private static native long[] releaseTable(long handle);
+
+ private static native String[] getColumnNames(long handle);
+}
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index d511512431b..44c08aec110 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1314,6 +1314,77 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(
CATCH_STD(env, NULL);
}
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
+ JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
+
+ JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
+ if (buffer_length <= 0) {
+ JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "An empty buffer is not supported", 0);
+ }
+
+ try {
+ cudf::jni::auto_set_device(env);
+
+ auto source = cudf::io::source_info{reinterpret_cast<char *>(buffer),
+ static_cast<std::size_t>(buffer_length)};
+
+ cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
+ .dayfirst(static_cast<bool>(day_first))
+ .lines(static_cast<bool>(lines));
+
+ auto result =
+ std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
+
+ return reinterpret_cast<jlong>(result.release());
+ }
+ CATCH_STD(env, 0);
+}
+
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_TableWithMeta_close(JNIEnv *env, jclass, jlong handle) {
+ JNI_NULL_CHECK(env, handle, "handle is null", );
+
+ try {
+ cudf::jni::auto_set_device(env);
+ delete reinterpret_cast<cudf::io::table_with_metadata *>(handle);
+ }
+ CATCH_STD(env, );
+}
+
+JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_TableWithMeta_getColumnNames(JNIEnv *env, jclass,
+ jlong handle) {
+ JNI_NULL_CHECK(env, handle, "handle is null", nullptr);
+
+ try {
+ cudf::jni::auto_set_device(env);
+ auto ptr = reinterpret_cast<cudf::io::table_with_metadata *>(handle);
+ auto length = ptr->metadata.column_names.size();
+ auto ret = static_cast<jobjectArray>(
+ env->NewObjectArray(length, env->FindClass("java/lang/String"), nullptr));
+ for (size_t i = 0; i < length; i++) {
+ env->SetObjectArrayElement(ret, i, env->NewStringUTF(ptr->metadata.column_names[i].c_str()));
+ }
+
+ return ret;
+ }
+ CATCH_STD(env, nullptr);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIEnv *env, jclass,
+ jlong handle) {
+ JNI_NULL_CHECK(env, handle, "handle is null", nullptr);
+
+ try {
+ cudf::jni::auto_set_device(env);
+ auto ptr = reinterpret_cast<cudf::io::table_with_metadata *>(handle);
+ if (ptr->tbl) {
+ return convert_table_for_return(env, ptr->tbl);
+ } else {
+ return nullptr;
+ }
+ }
+ CATCH_STD(env, nullptr);
+}
+
JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON(
JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
@@ -1428,11 +1499,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON(
CATCH_STD(env, NULL);
}
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env, jclass,
- jobjectArray filter_col_names,
- jstring inputfilepath,
- jlong buffer,
- jlong buffer_length, jint unit) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(
+ JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read,
+ jstring inputfilepath, jlong buffer, jlong buffer_length, jint unit) {
+
+ JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0);
bool read_buffer = true;
if (buffer == 0) {
JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
@@ -1454,6 +1525,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env,
}
cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
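+ // One flag per filtered column: it selects whether that column is returned as
+ // raw binary or converted to a string (see convert_binary_to_strings below).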
+ cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
auto source = read_buffer ? cudf::io::source_info(reinterpret_cast<char *>(buffer),
static_cast<std::size_t>(buffer_length)) :
@@ -1461,7 +1533,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env,
auto builder = cudf::io::parquet_reader_options::builder(source);
if (n_filter_col_names.size() > 0) {
- builder = builder.columns(n_filter_col_names.as_cpp_vector());
+ builder = builder.columns(n_filter_col_names.as_cpp_vector())
+ .convert_binary_to_strings(n_col_binary_read.to_vector());
}
cudf::io::parquet_reader_options opts =
@@ -1678,10 +1751,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC(
cudf::io::source_info(reinterpret_cast<char *>(buffer), buffer_length) :
cudf::io::source_info(filename.get());
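+ // Apply the column selection only when filter columns were supplied; leaving
+ // the selection unset reads all columns.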
+ auto builder = cudf::io::orc_reader_options::builder(source);
+ if (n_filter_col_names.size() > 0) {
+ builder = builder.columns(n_filter_col_names.as_cpp_vector());
+ }
+
cudf::io::orc_reader_options opts =
- cudf::io::orc_reader_options::builder(source)
- .columns(n_filter_col_names.as_cpp_vector())
- .use_index(false)
+ builder.use_index(false)
.use_np_dtypes(static_cast<bool>(usingNumPyTypes))
.timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
.decimal128_columns(n_dec128_col_names.as_cpp_vector())
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 7ef47d6a7cc..c7e6fecea26 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -78,6 +78,7 @@
public class TableTest extends CudfTestBase {
private static final File TEST_PARQUET_FILE = TestUtils.getResourceAsFile("acq.parquet");
+ private static final File TEST_PARQUET_FILE_BINARY = TestUtils.getResourceAsFile("binary.parquet");
private static final File TEST_ORC_FILE = TestUtils.getResourceAsFile("TestOrcFile.orc");
private static final File TEST_ORC_TIMESTAMP_DATE_FILE = TestUtils.getResourceAsFile("timestamp-date-test.orc");
private static final File TEST_DECIMAL_PARQUET_FILE = TestUtils.getResourceAsFile("decimal.parquet");
@@ -566,6 +567,19 @@ void testReadParquet() {
}
}
+ @Test
+ void testReadParquetBinary() {
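+    // value1 is requested as binary and comes back as a LIST of INT8; value2 is
+    // converted to a STRING, matching the includeColumn flags below.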
+ ParquetOptions opts = ParquetOptions.builder()
+ .includeColumn("value1", true)
+ .includeColumn("value2", false)
+ .build();
+ try (Table table = Table.readParquet(opts, TEST_PARQUET_FILE_BINARY)) {
+ assertTableTypes(new DType[]{DType.LIST, DType.STRING}, table);
+ ColumnView columnView = table.getColumn(0);
+ assertEquals(DType.INT8, columnView.getChildColumnView(0).getType());
+ }
+ }
+
@Test
void testReadParquetBuffer() throws IOException {
ParquetOptions opts = ParquetOptions.builder()
diff --git a/java/src/test/resources/binary.parquet b/java/src/test/resources/binary.parquet
new file mode 100644
index 00000000000..b72be9f36cc
Binary files /dev/null and b/java/src/test/resources/binary.parquet differ
diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd
index 2c65e329bb0..bc9d87a5cbf 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/json.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
from libc.stdint cimport uint8_t
from libcpp cimport bool
@@ -24,6 +24,7 @@ cdef extern from "cudf/io/json.hpp" \
size_type get_byte_range_size() except+
bool is_enabled_lines() except+
bool is_enabled_dayfirst() except+
+ bool is_enabled_experimental() except+
# setter
void set_dtypes(vector[data_type] types) except+
@@ -35,6 +36,7 @@ cdef extern from "cudf/io/json.hpp" \
void set_byte_range_size(size_type size) except+
void enable_lines(bool val) except+
void enable_dayfirst(bool val) except+
+ void enable_experimental(bool val) except+
@staticmethod
json_reader_options_builder builder(
@@ -70,6 +72,9 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& dayfirst(
bool val
) except+
+ json_reader_options_builder& experimental(
+ bool val
+ ) except+
json_reader_options build() except+
diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd
index 62ff5eb4f53..3e44ef98348 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -19,7 +19,6 @@ cdef extern from "cudf/io/orc.hpp" \
orc_reader_options() except+
cudf_io_types.source_info get_source() except+
- vector[string] get_columns() except+
vector[vector[size_type]] get_stripes() except+
size_type get_skip_rows() except+
size_type get_num_rows() except+
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 9c820a56104..0ee6062e7f2 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -31,7 +31,8 @@ cpdef read_json(object filepaths_or_buffers,
object dtype,
bool lines,
object compression,
- object byte_range):
+ object byte_range,
+ bool experimental):
"""
Cython function to call into libcudf API, see `read_json`.
@@ -98,6 +99,7 @@ cpdef read_json(object filepaths_or_buffers,
.lines(c_lines)
.byte_range_offset(c_range_offset)
.byte_range_size(c_range_size)
+ .experimental(experimental)
.build()
)
if is_list_like_dtypes:
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 4d1090d8434..11c70317a39 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -103,7 +103,7 @@ cpdef read_orc(object filepaths_or_buffers,
"""
cdef orc_reader_options c_orc_reader_options = make_orc_reader_options(
filepaths_or_buffers,
- columns or [],
+ columns,
stripes or [],
get_size_t_arg(skip_rows, "skip_rows"),
get_size_t_arg(num_rows, "num_rows"),
@@ -325,16 +325,11 @@ cdef orc_reader_options make_orc_reader_options(
for i, datasource in enumerate(filepaths_or_buffers):
if isinstance(datasource, NativeFile):
filepaths_or_buffers[i] = NativeFileDatasource(datasource)
- cdef vector[string] c_column_names
cdef vector[vector[size_type]] strps = stripes
- c_column_names.reserve(len(column_names))
- for col in column_names:
- c_column_names.push_back(str(col).encode())
cdef orc_reader_options opts
cdef source_info src = make_source_info(filepaths_or_buffers)
opts = move(
orc_reader_options.builder(src)
- .columns(c_column_names)
.stripes(strps)
.skip_rows(skip_rows)
.num_rows(num_rows)
@@ -343,6 +338,13 @@ cdef orc_reader_options make_orc_reader_options(
.build()
)
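+ # Set the column selection after build(): columns=None leaves it unset and
+ # reads all columns, while an explicit (possibly empty) list restricts it.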
+ cdef vector[string] c_column_names
+ if column_names is not None:
+ c_column_names.reserve(len(column_names))
+ for col in column_names:
+ c_column_names.push_back(str(col).encode())
+ opts.set_columns(c_column_names)
+
return opts
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 264b1fb507b..c25360b307d 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -177,9 +177,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
allow_range_index = True
if columns is not None:
cpp_columns.reserve(len(columns))
- if len(cpp_columns) == 0:
- allow_range_index = False
- for col in columns or []:
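+        # Any explicit column selection disables RangeIndex reconstruction,
+        # since the selection may not include the stored index columns.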
+ allow_range_index = False
+ for col in columns:
cpp_columns.push_back(str(col).encode())
args.set_columns(cpp_columns)
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 85f024e2420..e1e8e7cdb3d 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -30,7 +30,7 @@ def read_json(
raise ValueError("cudf engine only supports JSON Lines format")
if engine == "auto":
engine = "cudf" if lines else "pandas"
- if engine == "cudf":
+ if engine == "cudf" or engine == "cudf_experimental":
# Multiple sources are passed as a list. If a single source is passed,
# wrap it in a list for unified processing downstream.
if not is_list_like(path_or_buf):
@@ -58,7 +58,12 @@ def read_json(
filepaths_or_buffers.append(tmp_source)
df = libjson.read_json(
- filepaths_or_buffers, dtype, lines, compression, byte_range
+ filepaths_or_buffers,
+ dtype,
+ lines,
+ compression,
+ byte_range,
+ engine == "cudf_experimental",
)
else:
warnings.warn(
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 358687d36c3..236fd619b8e 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -2674,7 +2674,7 @@ def test_rangeindex_join_user_option(default_integer_bitwidth):
idx1 = cudf.RangeIndex(0, 10)
idx2 = cudf.RangeIndex(5, 15)
- actual = idx1.join(idx2, how="inner")
+ actual = idx1.join(idx2, how="inner", sort=True)
expected = cudf.Index(
[5, 6, 7, 8, 9], dtype=f"int{default_integer_bitwidth}", name=0
)
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 84cf5872219..800ed68e8a4 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -573,3 +573,9 @@ def test_default_float_bitwidth(default_float_bitwidth):
)
assert df["a"].dtype == np.dtype(f"f{default_float_bitwidth//8}")
assert df["b"].dtype == np.dtype(f"f{default_float_bitwidth//8}")
+
+
+def test_json_experimental():
+ # the experimental engine is not implemented yet, so this should raise for now
+ with pytest.raises(RuntimeError):
+ cudf.read_json("", engine="cudf_experimental")
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 132eb528cd0..4373ef9afdf 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1758,3 +1758,25 @@ def test_orc_writer_zlib_compression(list_struct_buff):
pytest.mark.xfail(reason="nvcomp build doesn't have deflate")
else:
raise e
+
+
+@pytest.mark.parametrize("index", [True, False, None])
+@pytest.mark.parametrize("columns", [None, [], ["b", "a"]])
+def test_orc_columns_and_index_param(index, columns):
+ buffer = BytesIO()
+ df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+ df.to_orc(buffer, index=index)
+
+ expected = pd.read_orc(buffer, columns=columns)
+ got = cudf.read_orc(buffer, columns=columns)
+
+ if columns:
+ # TODO: Remove workaround after this issue is fixed:
+ # https://github.com/pandas-dev/pandas/issues/47944
+ assert_eq(
+ expected.sort_index(axis=1),
+ got.sort_index(axis=1),
+ check_index_type=True,
+ )
+ else:
+ assert_eq(expected, got, check_index_type=True)
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 3771587eb47..d3c41de842a 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -463,7 +463,7 @@
function or `StringIO`). Multiple inputs may be provided as a list. If a
list is specified each list entry may be of a different input type as long
as each input is of a valid type and all input JSON schema(s) match.
-engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto'
+engine : {{ 'auto', 'cudf', 'cudf_experimental', 'pandas' }}, default 'auto'
Parser engine to use. If 'auto' is passed, the engine will be
automatically selected based on the other parameters.
orient : string,
diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py
index 575683bc5fa..7d8a6d7c3a3 100644
--- a/python/dask_cudf/setup.py
+++ b/python/dask_cudf/setup.py
@@ -10,8 +10,8 @@
install_requires = [
"cudf",
- "dask>=2022.05.2",
- "distributed>=2022.05.2",
+ "dask==2022.7.1",
+ "distributed==2022.7.1",
"fsspec>=0.6.0",
"numpy",
"pandas>=1.0,<1.5.0dev0",