diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4051ff822be..4f838ba3f45 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -60,6 +60,8 @@ repos: # of dependencies, so we'll have to update this manually. additional_dependencies: - cmakelang==0.6.13 + verbose: true + require_serial: true - id: cmake-lint name: cmake-lint entry: ./cpp/scripts/run-cmake-format.sh cmake-lint @@ -69,6 +71,8 @@ repos: # of dependencies, so we'll have to update this manually. additional_dependencies: - cmakelang==0.6.13 + verbose: true + require_serial: true - id: copyright-check name: copyright-check # This hook's use of Git tools appears to conflict with diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 5d03a518fcf..5593633640a 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" # Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 function remove_libcudf_kernel_cache_dir { EXITCODE=$? @@ -82,8 +82,8 @@ if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then gpuci_logger "gpuci_mamba_retry update dask" gpuci_mamba_retry update dask else - gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.05.2 conda-forge::distributed>=2022.05.2 conda-forge::dask-core>=2022.05.2 --force-reinstall" - gpuci_mamba_retry install conda-forge::dask>=2022.05.2 conda-forge::distributed>=2022.05.2 conda-forge::dask-core>=2022.05.2 --force-reinstall + gpuci_logger "gpuci_mamba_retry install conda-forge::dask==2022.7.1 conda-forge::distributed==2022.7.1 conda-forge::dask-core==2022.7.1 --force-reinstall" + gpuci_mamba_retry install conda-forge::dask==2022.7.1 conda-forge::distributed==2022.7.1 conda-forge::dask-core==2022.7.1 --force-reinstall fi # Install the master version of streamz diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 7b26519aa7d..8f215d1bb54 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -32,7 +32,7 @@ export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` unset GIT_DESCRIBE_TAG # Dask & Distributed option to install main(nightly) or `conda-forge` packages. 
-export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 # ucx-py version export UCX_PY_VERSION='0.28.*' @@ -92,8 +92,8 @@ function install_dask { gpuci_mamba_retry update dask conda list else - gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.05.2 conda-forge::distributed>=2022.05.2 conda-forge::dask-core>=2022.05.2 --force-reinstall" - gpuci_mamba_retry install conda-forge::dask>=2022.05.2 conda-forge::distributed>=2022.05.2 conda-forge::dask-core>=2022.05.2 --force-reinstall + gpuci_logger "gpuci_mamba_retry install conda-forge::dask==2022.7.1 conda-forge::distributed==2022.7.1 conda-forge::dask-core==2022.7.1 --force-reinstall" + gpuci_mamba_retry install conda-forge::dask==2022.7.1 conda-forge::distributed==2022.7.1 conda-forge::dask-core==2022.7.1 --force-reinstall fi # Install the main version of streamz gpuci_logger "Install the main version of streamz" diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 56531a7ae58..1e323182ffd 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -48,8 +48,8 @@ dependencies: - pydocstyle=6.1.1 - typing_extensions - pre-commit - - dask>=2022.05.2 - - distributed>=2022.05.2 + - dask==2022.7.1 + - distributed==2022.7.1 - streamz - arrow-cpp=8 - dlpack>=0.5,<0.6.0a0 diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index acf85426d09..118f084b436 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -29,8 +29,8 @@ requirements: - python - streamz - cudf ={{ version }} - - dask>=2022.05.2 - - distributed>=2022.05.2 + - dask==2022.7.1 + - distributed==2022.7.1 - python-confluent-kafka >=1.7.0,<1.8.0a0 - cudf_kafka ={{ version }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 3d7e7895578..c9a179301b0 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -24,14 +24,14 @@ requirements: host: - python - cudf ={{ version }} - - dask>=2022.05.2 - - distributed>=2022.05.2 + - dask==2022.7.1 + - distributed==2022.7.1 - cudatoolkit ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask>=2022.05.2 - - distributed>=2022.05.2 + - dask==2022.7.1 + - distributed==2022.7.1 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} test: # [linux64] diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8e5e2a53692..2f96b6ce9ae 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,6 +28,12 @@ project( VERSION 22.10.00 LANGUAGES C CXX CUDA ) +if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) + message( + FATAL_ERROR + "libcudf requires CUDA Toolkit 11.5+ to compile (nvcc ${CMAKE_CUDA_COMPILER_VERSION} provided)" + ) +endif() # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to # have different values for the `Threads::Threads` target. 
Setting this flag ensures
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index 116c5442dc3..e0f9a711776 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -273,7 +273,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
 endfunction()
 
-set(CUDF_VERSION_Arrow 8.0.0)
+if(NOT DEFINED CUDF_VERSION_Arrow)
+  set(CUDF_VERSION_Arrow
+      8.0.0
+      CACHE STRING "The version of Arrow to find (or build)"
+  )
+endif()
 
 find_and_configure_arrow(
   ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC}
diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
index e6b12948d85..5f43f5af0e4 100644
--- a/cpp/doxygen/Doxyfile
+++ b/cpp/doxygen/Doxyfile
@@ -1146,7 +1146,7 @@ HTML_FILE_EXTENSION    = .html
 # of the possible markers and block names see the documentation.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_HEADER            =
+HTML_HEADER            = header.html
 
 # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
 # generated HTML page. If the tag is left blank doxygen will generate a standard
@@ -1156,7 +1156,7 @@ HTML_HEADER            =
 # that doxygen normally uses.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_FOOTER            = footer.html
+HTML_FOOTER            =
 
 # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
 # sheet that is used by each HTML page. It can be used to fine-tune the look of
diff --git a/cpp/doxygen/footer.html b/cpp/doxygen/footer.html
deleted file mode 100644
index 9bd79eeb539..00000000000
--- a/cpp/doxygen/footer.html
+++ /dev/null
@@ -1,4 +0,0 @@
-[4-line custom HTML footer; its markup is not preserved in this excerpt]
diff --git a/cpp/doxygen/header.html b/cpp/doxygen/header.html
new file mode 100644
index 00000000000..569b8450e3a
--- /dev/null
+++ b/cpp/doxygen/header.html
@@ -0,0 +1,61 @@
+[61-line custom Doxygen HTML header template: a title block ($projectname: $title),
+ the $treeview/$search/$mathjax script hooks and $extrastylesheet, and a banner table
+ containing $projectname, $projectnumber, $projectbrief, and $searchbox; the HTML
+ markup itself is not preserved in this excerpt]
diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp
index ac3824dfc21..c5f6d339ae9 100644
--- a/cpp/include/cudf/column/column.hpp
+++ b/cpp/include/cudf/column/column.hpp
@@ -23,6 +23,7 @@
 #include
 #include
+#include <rmm/device_uvector.hpp>
 #include
 #include
 
@@ -75,6 +76,33 @@ class column {
    */
   column(column&& other) noexcept;
 
+  /**
+   * @brief Construct a new column by taking ownership of the contents of a device_uvector.
+   *
+   * @param other The device_uvector whose contents will be moved into the new column.
+   * @param null_mask Optional, column's null value indicator bitmask. May
+   * be empty if `null_count` is 0 or `UNKNOWN_NULL_COUNT`.
+   * @param null_count Optional, the count of null elements. If unknown, specify
+   * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on
+   * the first invocation of `null_count()`.
+   */
+  template <typename T, CUDF_ENABLE_IF(cudf::is_numeric<T>() or cudf::is_chrono<T>())>
+  column(rmm::device_uvector<T>&& other,
+         rmm::device_buffer&& null_mask = {},
+         size_type null_count           = UNKNOWN_NULL_COUNT)
+    : _type{cudf::data_type{cudf::type_to_id<T>()}},
+      _size{[&]() {
+        CUDF_EXPECTS(
+          other.size() <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
+          "The device_uvector size exceeds the maximum size_type.");
+        return static_cast<size_type>(other.size());
+      }()},
+      _data{other.release()},
+      _null_mask{std::move(null_mask)},
+      _null_count{null_count}
+  {
+  }
+
   /**
    * @brief Construct a new column from existing device memory.
    *
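The constructor above lets device buffers produced by RMM or Thrust algorithms be promoted into a cudf::column without a device-side copy. A minimal sketch of the intended usage (the fill step and the stream argument are illustrative assumptions, not part of this diff):

// Sketch: build a cudf::column directly from an rmm::device_uvector<int32_t>;
// ownership of the allocation moves into the column, so no copy is made.
#include <cudf/column/column.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <memory>
#include <utility>

std::unique_ptr<cudf::column> make_column(rmm::cuda_stream_view stream)
{
  rmm::device_uvector<int32_t> data(100, stream);
  // ... fill `data` on the device (e.g. with thrust::sequence) ...
  return std::make_unique<cudf::column>(std::move(data));
}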
diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp
index 7d8ac5c9325..1a4b8f02dd3 100644
--- a/cpp/include/cudf/detail/structs/utilities.hpp
+++ b/cpp/include/cudf/detail/structs/utilities.hpp
@@ -151,35 +151,6 @@ flattened_table flatten_nested_columns(
   std::vector<null_order> const& null_precedence,
   column_nullability nullability = column_nullability::MATCH_INCOMING);
 
-/**
- * @brief Unflatten columns flattened as by `flatten_nested_columns()`,
- * based on the provided `blueprint`.
- *
- * cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector
- * before the child/member columns.
- * E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to:
- * 1. Null Vector for STRUCT_1
- * 2. Null Vector for STRUCT_2
- * 3. Member STRUCT_2::A
- * 4. Member STRUCT_2::B
- * 5. Member STRUCT_1::C
- *
- * `unflatten_nested_columns()` reconstructs nested columns from flattened input that follows
- * the convention above.
- *
- * Note: This function requires a null-mask vector for each STRUCT column, including for nested
- * STRUCT members.
- *
- * @param flattened "Flattened" `table` of input columns, following the conventions in
- * `flatten_nested_columns()`.
- * @param blueprint The exemplar `table_view` with nested columns intact, whose structure defines
- * the nesting of the reconstructed output table.
- * @return std::unique_ptr<cudf::table> Unflattened table (with nested STRUCT columns) reconstructed
- * based on `blueprint`.
- */
-std::unique_ptr<cudf::table> unflatten_nested_columns(std::unique_ptr<cudf::table>&& flattened,
-                                                      table_view const& blueprint);
-
 /**
  * @brief Push down nulls from a parent mask into a child column, using bitwise AND.
  *
diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh
index f985135064f..0521418d2d3 100644
--- a/cpp/include/cudf/detail/utilities/device_atomics.cuh
+++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh
@@ -116,18 +116,17 @@ struct genericAtomicOperationImpl<T, Op, 4> {
     using T_int = unsigned int;
 
     T old_value = *addr;
-    T assumed{old_value};
+    T_int assumed;
+    T_int ret;
 
     do {
-      assumed           = old_value;
-      const T new_value = op(old_value, update_value);
+      T_int const new_value = type_reinterpret<T_int, T>(op(old_value, update_value));
 
-      T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
-                            type_reinterpret<T_int, T>(assumed),
-                            type_reinterpret<T_int, T>(new_value));
+      assumed   = type_reinterpret<T_int, T>(old_value);
+      ret       = atomicCAS(reinterpret_cast<T_int*>(addr), assumed, new_value);
       old_value = type_reinterpret<T, T_int>(ret);
-    } while (assumed != old_value);
+    } while (assumed != ret);
 
     return old_value;
   }
@@ -142,18 +141,17 @@ struct genericAtomicOperationImpl<T, Op, 8> {
     static_assert(sizeof(T) == sizeof(T_int));
 
     T old_value = *addr;
-    T assumed{old_value};
+    T_int assumed;
+    T_int ret;
 
     do {
-      assumed           = old_value;
-      const T new_value = op(old_value, update_value);
+      T_int const new_value = type_reinterpret<T_int, T>(op(old_value, update_value));
 
-      T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
-                            type_reinterpret<T_int, T>(assumed),
-                            type_reinterpret<T_int, T>(new_value));
+      assumed   = type_reinterpret<T_int, T>(old_value);
+      ret       = atomicCAS(reinterpret_cast<T_int*>(addr), assumed, new_value);
       old_value = type_reinterpret<T, T_int>(ret);
-    } while (assumed != old_value);
+    } while (assumed != ret);
 
     return old_value;
   }
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 30acf80548b..7f3cb95e4b2 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -24,6 +24,7 @@
 #include
 #include
+#include <optional>
 #include
 #include
 #include
@@ -51,8 +52,8 @@ class orc_reader_options_builder;
 
 class orc_reader_options {
   source_info _source;
 
-  // Names of column to read; empty is all
-  std::vector<std::string> _columns;
+  // Names of column to read; `nullopt` is all
+  std::optional<std::vector<std::string>> _columns;
 
   // List of individual stripes to read (ignored if empty)
   std::vector<std::vector<size_type>> _stripes;
@@ -105,18 +106,18 @@
   [[nodiscard]] source_info const& get_source() const { return _source; }
 
   /**
-   * @brief Returns names of the columns to read.
+   * @brief Returns names of the columns to read, if set.
    *
-   * @return Names of the columns to read
+   * @return Names of the columns to read; `nullopt` if the option is not set
    */
-  [[nodiscard]] std::vector<std::string> const& get_columns() const { return _columns; }
+  [[nodiscard]] auto const& get_columns() const { return _columns; }
 
   /**
    * @brief Returns vector of vectors, stripes to read for each input source
    *
    * @return Vector of vectors, stripes to read for each input source
    */
-  std::vector<std::vector<size_type>> const& get_stripes() const { return _stripes; }
+  [[nodiscard]] auto const& get_stripes() const { return _stripes; }
 
   /**
    * @brief Returns number of rows to skip from the start.
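Because `_columns` is now a `std::optional`, the ORC reader can distinguish "no selection was made" from "an explicitly empty selection"; the new `OrcReaderTest.EmptyColumnsParam` test further down depends on exactly this. A minimal sketch of the two call patterns (the file name is a hypothetical placeholder):

#include <cudf/io/orc.hpp>

void orc_column_selection()
{
  auto const src = cudf::io::source_info{"example.orc"};  // hypothetical input

  // No .columns() call: _columns stays nullopt, so every column is read.
  auto all = cudf::io::read_orc(cudf::io::orc_reader_options::builder(src).build());

  // An explicit empty list is now an empty selection: zero columns are read.
  auto none =
    cudf::io::read_orc(cudf::io::orc_reader_options::builder(src).columns({}).build());
}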
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 10368f84824..19156e01c1e 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -51,7 +51,7 @@ class parquet_reader_options_builder;
 
 class parquet_reader_options {
   source_info _source;
 
-  // Path in schema of column to read; empty is all
+  // Path in schema of column to read; `nullopt` is all
   std::optional<std::vector<std::string>> _columns;
 
   // List of individual row groups to read (ignored if empty)
@@ -152,17 +152,14 @@ class parquet_reader_options {
    *
    * @return Names of column to be read; `nullopt` if the option is not set
    */
-  [[nodiscard]] std::optional<std::vector<std::string>> const& get_columns() const
-  {
-    return _columns;
-  }
+  [[nodiscard]] auto const& get_columns() const { return _columns; }
 
   /**
    * @brief Returns list of individual row groups to be read.
    *
    * @return List of individual row groups to be read
    */
-  std::vector<std::vector<size_type>> const& get_row_groups() const { return _row_groups; }
+  [[nodiscard]] auto const& get_row_groups() const { return _row_groups; }
 
   /**
    * @brief Returns timestamp type used to cast timestamp columns.
diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 6504e790677..c31176ab51c 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -23,11 +23,11 @@
 #include
 
-#include <thrust/optional.h>
-
 #include
 #include
+#include <optional>
 #include
+#include
 #include
 
 // Forward declarations
@@ -383,12 +383,12 @@ class table_input_metadata;
 
 class column_in_metadata {
   friend table_input_metadata;
   std::string _name = "";
-  thrust::optional<bool> _nullable;
+  std::optional<bool> _nullable;
   bool _list_column_is_map  = false;
   bool _use_int96_timestamp = false;
   bool _output_as_binary    = false;
-  thrust::optional<uint8_t> _decimal_precision;
-  thrust::optional<int32_t> _parquet_field_id;
+  std::optional<uint8_t> _decimal_precision;
+  std::optional<int32_t> _parquet_field_id;
   std::vector<column_in_metadata> children;
 
  public:
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index af66eb32618..573d0c81380 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -373,6 +373,19 @@ constexpr inline bool is_floating_point(data_type type)
   return cudf::type_dispatcher(type, is_floating_point_impl{});
 }
 
+/**
+ * @brief Indicates whether `T` is a std::byte type.
+ *
+ * @tparam T The type to verify
+ * @return true `type` is std::byte
+ * @return false `type` is not std::byte
+ */
+template <typename T>
+constexpr inline bool is_byte()
+{
+  return std::is_same_v<std::remove_cv_t<T>, std::byte>;
+}
+
 /**
  * @brief Indicates whether `T` is a Boolean type.
  *
@@ -561,7 +574,8 @@ constexpr inline bool is_chrono(data_type type)
 template <typename T>
 constexpr bool is_rep_layout_compatible()
 {
-  return cudf::is_numeric<T>() or cudf::is_chrono<T>() or cudf::is_boolean<T>();
+  return cudf::is_numeric<T>() or cudf::is_chrono<T>() or cudf::is_boolean<T>() or
+         cudf::is_byte<T>();
 }
 
 /**
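The new trait slots std::byte into the fixed-width representation layout alongside numeric, chrono, and Boolean types, which is what allows the byte_array_view element-type change further down. A few compile-time checks restating the definitions above (a sketch, not part of this diff):

#include <cudf/utilities/traits.hpp>

#include <cstddef>
#include <cstdint>

static_assert(cudf::is_byte<std::byte>());
static_assert(cudf::is_byte<std::byte const>());  // cv-qualifiers are stripped first
static_assert(not cudf::is_byte<uint8_t>());      // uint8_t is numeric, not std::byte
static_assert(cudf::is_rep_layout_compatible<std::byte>());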
diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh
index 9c981c6cdaa..b9157c76492 100755
--- a/cpp/scripts/run-cmake-format.sh
+++ b/cpp/scripts/run-cmake-format.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+
 # This script is a wrapper for cmakelang that may be used with pre-commit. The
 # wrapping is necessary because RAPIDS libraries split configuration for
 # cmakelang linters between a local config file and a second config file that's
@@ -69,5 +71,14 @@ fi
 if [[ $1 == "cmake-format" ]]; then
   cmake-format -i --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2}
 elif [[ $1 == "cmake-lint" ]]; then
-  cmake-lint --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2}
+  # Since the pre-commit hook is verbose, we have to be careful to only
+  # present cmake-lint's output (which is quite verbose) if we actually
+  # observe a failure.
+  OUTPUT=$(cmake-lint --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2})
+  status=$?
+
+  if ! [ ${status} -eq 0 ]; then
+    echo "${OUTPUT}"
+  fi
+  exit ${status}
 fi
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
index 82765c60c1e..df3dfca5fa9 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.cpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -18,6 +18,7 @@
 #include
 #include
+#include <optional>
 
 namespace cudf::io::orc::detail {
 
@@ -249,17 +250,17 @@ std::vector<metadata::stripe_source_mapping> aggregate_orc_metadata::select_stripes(
 }
 
 column_hierarchy aggregate_orc_metadata::select_columns(
-  std::vector<std::string> const& column_paths)
+  std::optional<std::vector<std::string>> const& column_paths)
 {
   auto const& pfm = per_file_metadata[0];
   column_hierarchy::nesting_map selected_columns;
 
-  if (column_paths.empty()) {
+  if (not column_paths.has_value()) {
     for (auto const& col_id : pfm.ff.types[0].subtypes) {
       add_column_to_mapping(selected_columns, pfm, col_id);
     }
   } else {
-    for (const auto& path : column_paths) {
+    for (const auto& path : column_paths.value()) {
       bool name_found = false;
       for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
         if (pfm.column_path(col_id) == path) {
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp
index 9d2380c0097..3ce1a922f31 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.hpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -17,6 +17,7 @@
 #include "orc.hpp"
 
 #include
+#include <optional>
 #include
 
 namespace cudf::io::orc::detail {
 
@@ -126,10 +127,11 @@ class aggregate_orc_metadata {
    * Paths are in format "grandparent_col.parent_col.child_col", where the root ORC column is
    * omitted to match the cuDF table hierarchy.
    *
-   * @param column_paths List of full column names (i.e. paths) to select from the ORC file
+   * @param column_paths List of full column names (i.e. paths) to select from the ORC file;
+   * `nullopt` if user did not select columns to read
    * @return Columns hierarchy - lists of children columns and sorted columns in each nesting level
    */
-  column_hierarchy select_columns(std::vector<std::string> const& column_paths);
+  column_hierarchy select_columns(std::optional<std::vector<std::string>> const& column_paths);
 };
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index ff278f63366..74565b2f244 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -18,10 +18,9 @@
 
 #include "parquet.hpp"
 
-#include <thrust/optional.h>
-
 #include
 #include
+#include <optional>
 #include
 #include
 
@@ -264,10 +263,10 @@ class ParquetFieldInt32 {
  */
 class ParquetFieldOptionalInt32 {
   int field_val;
-  thrust::optional<int32_t>& val;
+  std::optional<int32_t>& val;
 
  public:
-  ParquetFieldOptionalInt32(int f, thrust::optional<int32_t>& v) : field_val(f), val(v) {}
+  ParquetFieldOptionalInt32(int f, std::optional<int32_t>& v) : field_val(f), val(v) {}
 
   inline bool operator()(CompactProtocolReader* cpr, int field_type)
   {
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index b03ba23737e..a03fdf27953 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -18,9 +18,8 @@
 
 #include "parquet_common.hpp"
 
-#include <thrust/optional.h>
-
 #include
+#include <optional>
 #include
 #include
 
@@ -147,7 +146,7 @@ struct SchemaElement {
   int32_t num_children      = 0;
   int32_t decimal_scale     = 0;
   int32_t decimal_precision = 0;
-  thrust::optional<int32_t> field_id = thrust::nullopt;
+  std::optional<int32_t> field_id = std::nullopt;
   bool output_as_byte_array = false;
 
   // The following fields are filled in later during schema initialization
diff --git a/cpp/src/io/statistics/byte_array_view.cuh b/cpp/src/io/statistics/byte_array_view.cuh
index 315e753a732..c1958780321 100644
--- a/cpp/src/io/statistics/byte_array_view.cuh
+++ b/cpp/src/io/statistics/byte_array_view.cuh
@@ -28,7 +28,7 @@ namespace cudf::io::statistics {
  */
 class byte_array_view {
  public:
-  using element_type = uint8_t const;  ///< The type of the elements in the byte array
+  using element_type = std::byte const;  ///< The type of the elements in the byte array
 
   constexpr byte_array_view() noexcept {}
   /**
diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh
index ab6674e4328..f2611f7cc26 100644
--- a/cpp/src/io/statistics/statistics.cuh
+++ b/cpp/src/io/statistics/statistics.cuh
@@ -85,7 +85,8 @@ struct t_array_stats {
   __host__ __device__ __forceinline__ operator ReturnType() { return ReturnType(ptr, length); }
 };
 using string_stats = t_array_stats;
-using byte_array_stats = t_array_stats;
+using byte_array_view  = statistics::byte_array_view;
+using byte_array_stats = t_array_stats;
 
 union statistics_val {
   string_stats str_val;  //!< string columns
@@ -129,10 +130,10 @@ template <typename T>
   auto const* d_offsets = col.child(lists_column_view::offsets_column_index).data<offset_type>();
   auto const* d_data = col.child(lists_column_view::child_column_index).data();
-  offset_type offset = d_offsets[index];
+  auto const offset = d_offsets[index];
   return T(d_data + offset, d_offsets[index + 1] - offset);
 }
diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 50d641c9a74..bc6bdd9dc7b 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -60,7 +60,7 @@ static reclass cclass_S(NCCLASS_S);  // \S
 static reclass cclass_D(NCCLASS_D);  // \D
 
 // Tables for analyzing quantifiers
-const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}};
+const std::array<int, 5> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL}};
 const std::array<char, 5> quantifiers{{'*', '?', '+', '{', '|'}};
 // Valid regex characters that can be escaped and used as literals
 const std::array escapable_chars{
@@ -459,16 +459,42 @@ class regex_parser {
     }
 
     // The quantifiers require at least one "real" previous item.
-    // We are throwing an error in these two if-checks for invalid quantifiers.
+    // We are throwing errors for invalid quantifiers.
     // Another option is to just return CHAR silently here which effectively
     // treats the chr character as a literal instead as a quantifier.
     // This could lead to confusion where sometimes unescaped quantifier characters
     // are treated as regex expressions and sometimes they are not.
     if (_items.empty()) { CUDF_FAIL("invalid regex pattern: nothing to repeat at position 0"); }
 
+    // Check that the previous item can be used with quantifiers.
+    // If the previous item is a capture group, we need to check items inside the
+    // capture group can be used with quantifiers too.
+    // (Note that capture groups can be nested).
+    auto previous_type = _items.back().type;
+    if (previous_type == RBRA) {  // previous item is a capture group
+      // look for matching LBRA
+      auto nested_count = 1;
+      auto lbra_itr =
+        std::find_if(_items.rbegin(), _items.rend(), [nested_count](auto const& item) mutable {
+          auto const is_closing = (item.type == RBRA);
+          auto const is_opening = (item.type == LBRA || item.type == LBRA_NC);
+          nested_count += is_closing - is_opening;
+          return is_opening && (nested_count == 0);
+        });
+      // search for the first valid item within the LBRA-RBRA range
+      auto first_valid = std::find_first_of(
+        _items.rbegin() + 1,
+        lbra_itr,
+        valid_preceding_inst_types.begin(),
+        valid_preceding_inst_types.end(),
+        [](auto const item, auto const valid_type) { return item.type == valid_type; });
+      // set previous_type to be checked in next if-statement
+      previous_type = (first_valid == lbra_itr) ? (--lbra_itr)->type : first_valid->type;
+    }
+
     if (std::find(valid_preceding_inst_types.begin(),
                   valid_preceding_inst_types.end(),
-                  _items.back().type) == valid_preceding_inst_types.end()) {
+                  previous_type) == valid_preceding_inst_types.end()) {
       CUDF_FAIL("invalid regex pattern: nothing to repeat at position " +
                 std::to_string(_expr_ptr - _pattern_begin - 1));
     }
diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp
index 1d5ebfaa7fc..bf4216b6983 100644
--- a/cpp/src/structs/utilities.cpp
+++ b/cpp/src/structs/utilities.cpp
@@ -209,98 +209,6 @@ flattened_table flatten_nested_columns(table_view const& input,
   return table_flattener{input, column_order, null_precedence, nullability}();
 }
 
-namespace {
-using vector_of_columns = std::vector<std::unique_ptr<cudf::column>>;
-using column_index_t    = typename vector_of_columns::size_type;
-
-// Forward declaration, to enable recursion via `unflattener`.
-std::unique_ptr<cudf::column> unflatten_struct(vector_of_columns& flattened,
-                                               column_index_t& current_index,
-                                               cudf::column_view const& blueprint);
-
-/**
- * @brief Helper functor to reconstruct STRUCT columns from its flattened member columns.
- *
- */
-class unflattener {
- public:
-  unflattener(vector_of_columns& flattened_, column_index_t& current_index_)
-    : flattened{flattened_}, current_index{current_index_}
-  {
-  }
-
-  auto operator()(column_view const& blueprint)
-  {
-    return is_struct(blueprint) ? unflatten_struct(flattened, current_index, blueprint)
-                                : std::move(flattened[current_index++]);
-  }
-
- private:
-  vector_of_columns& flattened;
-  column_index_t& current_index;
-
-};  // class unflattener;
-
-std::unique_ptr<cudf::column> unflatten_struct(vector_of_columns& flattened,
-                                               column_index_t& current_index,
-                                               cudf::column_view const& blueprint)
-{
-  // "Consume" columns from `flattened`, starting at `current_index`,
-  // based on the provided `blueprint` struct col. Recurse for struct children.
-  CUDF_EXPECTS(blueprint.type().id() == type_id::STRUCT,
-               "Expected blueprint column to be a STRUCT column.");
-
-  CUDF_EXPECTS(current_index < flattened.size(), "STRUCT column can't have 0 children.");
-
-  auto const num_rows = flattened[current_index]->size();
-
-  // cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector
-  // before the child/member columns.
-  // E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to:
-  // 1. Null Vector for STRUCT_1
-  // 2. Null Vector for STRUCT_2
-  // 3. Member STRUCT_2::A
-  // 4. Member STRUCT_2::B
-  // 5. Member STRUCT_1::C
-  //
-  // Extract null-vector *before* child columns are constructed.
-  auto struct_null_column_contents = flattened[current_index++]->release();
-  auto unflattening_iter =
-    thrust::make_transform_iterator(blueprint.child_begin(), unflattener{flattened, current_index});
-
-  return cudf::make_structs_column(
-    num_rows,
-    vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_children()},
-    UNKNOWN_NULL_COUNT,  // Do count?
-    std::move(*struct_null_column_contents.null_mask));
-}
-}  // namespace
-
-std::unique_ptr<cudf::table> unflatten_nested_columns(std::unique_ptr<cudf::table>&& flattened,
-                                                      table_view const& blueprint)
-{
-  // Bail, if LISTs are present.
-  auto const has_lists = std::any_of(blueprint.begin(), blueprint.end(), is_or_has_nested_lists);
-  CUDF_EXPECTS(not has_lists, "Unflattening LIST columns is not supported.");
-
-  // If there are no STRUCTs, unflattening is a NOOP.
-  auto const has_structs = std::any_of(blueprint.begin(), blueprint.end(), is_struct);
-  if (not has_structs) {
-    return std::move(flattened);  // Unchanged.
-  }
-
-  // There be struct columns.
-  // Note: Requires null vectors for all struct input columns.
-  auto flattened_columns = flattened->release();
-  auto current_idx       = column_index_t{0};
-
-  auto unflattening_iter =
-    thrust::make_transform_iterator(blueprint.begin(), unflattener{flattened_columns, current_idx});
-
-  return std::make_unique<cudf::table>(
-    vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_columns()});
-}
-
 // Helper function to superimpose validity of parent struct
 // over the specified member (child) column.
 void superimpose_parent_nulls(bitmask_type const* parent_null_mask,
diff --git a/cpp/tests/column/column_test.cu b/cpp/tests/column/column_test.cu
index 6fcabbcf823..801cee285b6 100644
--- a/cpp/tests/column/column_test.cu
+++ b/cpp/tests/column/column_test.cu
@@ -345,6 +345,42 @@ TYPED_TEST(TypedColumnTest, MoveConstructorWithMask)
   EXPECT_EQ(original_mask, moved_to_view.null_mask());
 }
 
+TYPED_TEST(TypedColumnTest, DeviceUvectorConstructorNoMask)
+{
+  rmm::device_uvector<TypeParam> original{static_cast<std::size_t>(this->num_elements()),
+                                          cudf::default_stream_value};
+  thrust::copy(thrust::device,
+               static_cast<TypeParam*>(this->data.data()),
+               static_cast<TypeParam*>(this->data.data()) + this->num_elements(),
+               original.begin());
+  auto original_data = original.data();
+  cudf::column moved_to{std::move(original)};
+  verify_column_views(moved_to);
+
+  // Verify move
+  cudf::column_view moved_to_view = moved_to;
+  EXPECT_EQ(original_data, moved_to_view.head());
+}
+
+TYPED_TEST(TypedColumnTest, DeviceUvectorConstructorWithMask)
+{
+  rmm::device_uvector<TypeParam> original{static_cast<std::size_t>(this->num_elements()),
+                                          cudf::default_stream_value};
+  thrust::copy(thrust::device,
+               static_cast<TypeParam*>(this->data.data()),
+               static_cast<TypeParam*>(this->data.data()) + this->num_elements(),
+               original.begin());
+  auto original_data = original.data();
+  auto original_mask = this->all_valid_mask.data();
+  cudf::column moved_to{std::move(original), std::move(this->all_valid_mask)};
+  verify_column_views(moved_to);
+
+  // Verify move
+  cudf::column_view moved_to_view = moved_to;
+  EXPECT_EQ(original_data, moved_to_view.head());
+  EXPECT_EQ(original_mask, moved_to_view.null_mask());
+}
+
 TYPED_TEST(TypedColumnTest, ConstructWithChildren)
 {
   std::vector<std::unique_ptr<cudf::column>> children;
diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp
index 0b7e0d13c24..1d2c8c489f3 100644
--- a/cpp/tests/groupby/max_tests.cpp
+++ b/cpp/tests/groupby/max_tests.cpp
@@ -459,5 +459,30 @@ TYPED_TEST(groupby_max_floating_point_test, values_with_infinity)
     keys, vals, expected_keys, expected_vals, std::move(agg), force_use_sort_impl::YES);
 }
 
+TYPED_TEST(groupby_max_floating_point_test, values_with_nan)
+{
+  using T          = TypeParam;
+  using int32s_col = fixed_width_column_wrapper<int32_t>;
+  using floats_col = fixed_width_column_wrapper<T>;
+
+  auto constexpr nan = std::numeric_limits<T>::quiet_NaN();
+
+  auto const keys = int32s_col{1, 1};
+  auto const vals = floats_col{nan, nan};
+
+  std::vector<groupby::aggregation_request> requests;
+  requests.emplace_back(groupby::aggregation_request());
+  requests[0].values = vals;
+  requests[0].aggregations.emplace_back(cudf::make_max_aggregation<groupby_aggregation>());
+
+  // Without properly handling NaN, this will hang forever in hash-based aggregate (which is the
+  // default back-end for min/max in groupby context).
+  // This test is just to verify that the aggregate operation does not hang.
+  auto gb_obj       = groupby::groupby(table_view({keys}));
+  auto const result = gb_obj.aggregate(requests);
+
+  EXPECT_EQ(result.first->num_rows(), 1);
+}
+
 }  // namespace test
 }  // namespace cudf
diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp
index a12ec7c8739..9606c8c55ee 100644
--- a/cpp/tests/groupby/min_tests.cpp
+++ b/cpp/tests/groupby/min_tests.cpp
@@ -458,5 +458,30 @@ TYPED_TEST(groupby_min_floating_point_test, values_with_infinity)
     keys, vals, expected_keys, expected_vals, std::move(agg), force_use_sort_impl::YES);
 }
 
+TYPED_TEST(groupby_min_floating_point_test, values_with_nan)
+{
+  using T          = TypeParam;
+  using int32s_col = fixed_width_column_wrapper<int32_t>;
+  using floats_col = fixed_width_column_wrapper<T>;
+
+  auto constexpr nan = std::numeric_limits<T>::quiet_NaN();
+
+  auto const keys = int32s_col{1, 1};
+  auto const vals = floats_col{nan, nan};
+
+  std::vector<groupby::aggregation_request> requests;
+  requests.emplace_back(groupby::aggregation_request());
+  requests[0].values = vals;
+  requests[0].aggregations.emplace_back(cudf::make_min_aggregation<groupby_aggregation>());
+
+  // Without properly handling NaN, this will hang forever in hash-based aggregate (which is the
+  // default back-end for min/max in groupby context).
+  // This test is just to verify that the aggregate operation does not hang.
+  auto gb_obj       = groupby::groupby(table_view({keys}));
+  auto const result = gb_obj.aggregate(requests);
+
+  EXPECT_EQ(result.first->num_rows(), 1);
+}
+
 }  // namespace test
 }  // namespace cudf
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index b6a6270ca8b..c8aefece94f 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -915,4 +915,13 @@ TEST_F(JsonReaderTest, BadDtypeParams)
   EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error);
 }
 
+TEST_F(JsonReaderTest, ExperimentalParam)
+{
+  cudf_io::json_reader_options const options =
+    cudf_io::json_reader_options::builder(cudf_io::source_info{nullptr, 0}).experimental(true);
+
+  // should throw for now
+  EXPECT_THROW(cudf_io::read_json(options), cudf::logic_error);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index b3df2c8a8dd..76ffc92e243 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1514,4 +1514,23 @@ TEST_F(OrcWriterTest, DecimalOptionsNested)
                                  result.tbl->view().column(0).child(1).child(0).child(1));
 }
 
+TEST_F(OrcReaderTest, EmptyColumnsParam)
+{
+  srand(31337);
+  auto const expected = create_random_fixed_table<int>(2, 4, false);
+
+  std::vector<char> out_buffer;
+  cudf_io::orc_writer_options args =
+    cudf_io::orc_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected);
+  cudf_io::write_orc(args);
+
+  cudf_io::orc_reader_options read_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{out_buffer.data(), out_buffer.size()})
+      .columns({});
+  auto const result = cudf_io::read_orc(read_opts);
+
+  EXPECT_EQ(result.tbl->num_columns(), 0);
+  EXPECT_EQ(result.tbl->num_rows(), 0);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp
index 70f28aa139d..d725f3d5dd0 100644
--- a/cpp/tests/strings/contains_tests.cpp
+++ b/cpp/tests/strings/contains_tests.cpp
@@ -424,6 +424,25 @@ TEST_F(StringsContainsTests, FixedQuantifier)
   }
 }
 
+TEST_F(StringsContainsTests, QuantifierErrors)
+{
+  auto input = cudf::test::strings_column_wrapper({"a", "aa", "aaa", "aaaa", "aaaaa", "aaaaaa"});
+  auto sv    = cudf::strings_column_view(input);
+
+  EXPECT_THROW(cudf::strings::contains_re(sv, "^+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::count_re(sv, "$+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::count_re(sv, "(^)+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::contains_re(sv, "($)+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::count_re(sv, "\\A+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::count_re(sv, "\\Z+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::contains_re(sv, "(\\A)+"), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::contains_re(sv, "(\\Z)+"), cudf::logic_error);
+
+  EXPECT_THROW(cudf::strings::contains_re(sv, "(^($))+"), cudf::logic_error);
+  EXPECT_NO_THROW(cudf::strings::contains_re(sv, "(^a($))+"));
+  EXPECT_NO_THROW(cudf::strings::count_re(sv, "(^(a$))+"));
+}
+
 TEST_F(StringsContainsTests, OverlappedClasses)
 {
   auto input = cudf::test::strings_column_wrapper({"abcdefg", "defghí", "", "éééééé", "ghijkl"});
diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp
index b26ea87c5b8..d58568cd1b5 100644
--- a/cpp/tests/structs/utilities_tests.cpp
+++ b/cpp/tests/structs/utilities_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,26 +30,13 @@
 
 namespace cudf::test {
 
-/**
- * @brief Round-trip input table through flatten/unflatten,
- * verify that the table remains equivalent.
- */
-void flatten_unflatten_compare(table_view const& input_table)
-{
-  using namespace cudf::structs::detail;
-
-  auto flattened = flatten_nested_columns(input_table, {}, {}, column_nullability::FORCE);
-  auto unflattened =
-    unflatten_nested_columns(std::make_unique<cudf::table>(flattened), input_table);
-
-  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, unflattened->view());
-}
-
 using namespace cudf;
 using namespace iterators;
+using namespace cudf::structs::detail;
 using strings    = strings_column_wrapper;
 using dictionary = dictionary_column_wrapper;
 using structs    = structs_column_wrapper;
+using bools      = fixed_width_column_wrapper<bool>;
 
 template <typename T>
 using nums = fixed_width_column_wrapper<T, int32_t>;
@@ -66,7 +53,7 @@ struct TypedStructUtilitiesTest : StructUtilitiesTest {
 
 TYPED_TEST_SUITE(TypedStructUtilitiesTest, FixedWidthTypes);
 
-TYPED_TEST(TypedStructUtilitiesTest, ListsAtTopLevelUnsupported)
+TYPED_TEST(TypedStructUtilitiesTest, ListsAtTopLevel)
 {
   using T     = TypeParam;
   using lists = lists_column_wrapper<T, int32_t>;
 
   auto lists_col = lists{{0, 1}, {22, 33}, {44, 55, 66}};
   auto nums_col  = nums<T>{{0, 1, 2}, null_at(6)};
 
-  EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{lists_col, nums_col}}),
-               cudf::logic_error);
+  auto table = cudf::table_view{{lists_col, nums_col}};
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(table,
+                                flatten_nested_columns(table, {}, {}, column_nullability::FORCE));
 }
 
 TYPED_TEST(TypedStructUtilitiesTest, NestedListsUnsupported)
 {
   using T     = TypeParam;
   using lists = lists_column_wrapper<T, int32_t>;
 
   auto lists_member = lists{{0, 1}, {22, 33}, {44, 55, 66}};
   auto nums_member  = nums<T>{{0, 1, 2}, null_at(6)};
   auto structs_col  = structs{{nums_member, lists_member}};
+  auto nums_col     = nums<T>{{0, 1, 2}, null_at(6)};
 
-  auto nums_col = nums<T>{{0, 1, 2}, null_at(6)};
-
-  EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}}),
+  EXPECT_THROW(flatten_nested_columns(
+                 cudf::table_view{{nums_col, structs_col}}, {}, {},
column_nullability::FORCE), cudf::logic_error); } @@ -104,7 +93,10 @@ TYPED_TEST(TypedStructUtilitiesTest, NoStructs) auto strings_col = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; auto nuther_nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; - flatten_unflatten_compare(cudf::table_view{{nums_col, strings_col, nuther_nums_col}}); + auto table = cudf::table_view{{nums_col, strings_col, nuther_nums_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(table, + flatten_nested_columns(table, {}, {}, column_nullability::FORCE)); } TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStruct) @@ -116,8 +108,19 @@ TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStruct) auto strings_member = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; auto structs_col = structs{{nums_member, strings_member}}; auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; - - flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}}); + auto table = cudf::table_view{{nums_col, structs_col}}; + + auto expected_nums_col_1 = cudf::column(nums_col); + auto expected_structs_col = bools{{1, 1, 1, 1, 1, 1, 1}}; + auto expected_nums_col_2 = + cudf::column(static_cast(structs_col).get_sliced_child(0)); + auto expected_strings_col = + cudf::column(static_cast(structs_col).get_sliced_child(1)); + auto expected = cudf::table_view{ + {expected_nums_col_1, expected_structs_col, expected_nums_col_2, expected_strings_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, + flatten_nested_columns(table, {}, {}, column_nullability::FORCE)); } TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStructWithNulls) @@ -129,8 +132,19 @@ TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStructWithNulls) auto strings_member = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; auto structs_col = structs{{nums_member, strings_member}, null_at(2)}; auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; - - flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}}); + auto table = cudf::table_view{{nums_col, structs_col}}; + + auto expected_nums_col_1 = cudf::column(nums_col); + auto expected_structs_col = bools{{1, 1, 0, 1, 1, 1, 1}, null_at(2)}; + auto expected_nums_col_2 = + cudf::column(static_cast(structs_col).get_sliced_child(0)); + auto expected_strings_col = + cudf::column(static_cast(structs_col).get_sliced_child(1)); + auto expected = cudf::table_view{ + {expected_nums_col_1, expected_structs_col, expected_nums_col_2, expected_strings_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, + flatten_nested_columns(table, {}, {}, column_nullability::FORCE)); } TYPED_TEST(TypedStructUtilitiesTest, StructOfStruct) @@ -147,8 +161,26 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStruct) auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)}; auto struct_of_structs_col = structs{{struct_1_nums_member, structs_1_structs_member}}; - - flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}}); + auto table = cudf::table_view{{nums_col, struct_of_structs_col}}; + + auto expected_nums_col_1 = cudf::column(nums_col); + auto expected_structs_col_1 = bools{{1, 1, 1, 1, 1, 1, 1}}; + auto expected_nums_col_2 = + cudf::column(static_cast(struct_of_structs_col).get_sliced_child(0)); + auto expected_structs_col_2 = bools{{1, 1, 1, 1, 1, 1, 1}}; + auto expected_nums_col_3 = cudf::column( + static_cast(struct_of_structs_col).get_sliced_child(1).child(0)); + auto expected_strings_col = cudf::column( + 
static_cast(struct_of_structs_col).get_sliced_child(1).child(1)); + auto expected = cudf::table_view{{expected_nums_col_1, + expected_structs_col_1, + expected_nums_col_2, + expected_structs_col_2, + expected_nums_col_3, + expected_strings_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, + flatten_nested_columns(table, {}, {}, column_nullability::FORCE)); } TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtLeafLevel) @@ -166,8 +198,26 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtLeafLevel) auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)}; auto struct_of_structs_col = structs{{struct_1_nums_member, structs_1_structs_member}}; - - flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}}); + auto table = cudf::table_view{{nums_col, struct_of_structs_col}}; + + auto expected_nums_col_1 = cudf::column(nums_col); + auto expected_structs_col_1 = bools{{1, 1, 1, 1, 1, 1, 1}}; + auto expected_nums_col_2 = + cudf::column(static_cast(struct_of_structs_col).get_sliced_child(0)); + auto expected_structs_col_2 = bools{{1, 1, 0, 1, 1, 1, 1}, null_at(2)}; + auto expected_nums_col_3 = cudf::column( + static_cast(struct_of_structs_col).get_sliced_child(1).child(0)); + auto expected_strings_col = cudf::column( + static_cast(struct_of_structs_col).get_sliced_child(1).child(1)); + auto expected = cudf::table_view{{expected_nums_col_1, + expected_structs_col_1, + expected_nums_col_2, + expected_structs_col_2, + expected_nums_col_3, + expected_strings_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, + flatten_nested_columns(table, {}, {}, column_nullability::FORCE)); } TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtTopLevel) @@ -185,8 +235,26 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtTopLevel) auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)}; auto struct_of_structs_col = structs{{struct_1_nums_member, structs_1_structs_member}, null_at(4)}; - - flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}}); + auto table = cudf::table_view{{nums_col, struct_of_structs_col}}; + + auto expected_nums_col_1 = cudf::column(nums_col); + auto expected_structs_col_1 = bools{{1, 1, 1, 1, 0, 1, 1}, null_at(4)}; + auto expected_nums_col_2 = + cudf::column(static_cast(struct_of_structs_col).get_sliced_child(0)); + auto expected_structs_col_2 = bools{{1, 1, 1, 1, 0, 1, 1}, null_at(4)}; + auto expected_nums_col_3 = cudf::column( + static_cast(struct_of_structs_col).get_sliced_child(1).child(0)); + auto expected_strings_col = cudf::column( + static_cast(struct_of_structs_col).get_sliced_child(1).child(1)); + auto expected = cudf::table_view{{expected_nums_col_1, + expected_structs_col_1, + expected_nums_col_2, + expected_structs_col_2, + expected_nums_col_3, + expected_strings_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, + flatten_nested_columns(table, {}, {}, column_nullability::FORCE)); } TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtAllLevels) @@ -205,8 +273,26 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtAllLevels) auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)}; auto struct_of_structs_col = structs{{struct_1_nums_member, structs_1_structs_member}, null_at(4)}; - - flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}}); + auto table = cudf::table_view{{nums_col, struct_of_structs_col}}; + + auto expected_nums_col_1 = cudf::column(nums_col); + auto expected_structs_col_1 = 
bools{{1, 1, 1, 1, 0, 1, 1}, null_at(4)}; + auto expected_nums_col_2 = + cudf::column(static_cast(struct_of_structs_col).get_sliced_child(0)); + auto expected_structs_col_2 = bools{{1, 1, 0, 1, 0, 1, 1}, {1, 1, 0, 1, 0, 1, 1}}; + auto expected_nums_col_3 = cudf::column( + static_cast(struct_of_structs_col).get_sliced_child(1).child(0)); + auto expected_strings_col = cudf::column( + static_cast(struct_of_structs_col).get_sliced_child(1).child(1)); + auto expected = cudf::table_view{{expected_nums_col_1, + expected_structs_col_1, + expected_nums_col_2, + expected_structs_col_2, + expected_nums_col_3, + expected_strings_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, + flatten_nested_columns(table, {}, {}, column_nullability::FORCE)); } TYPED_TEST(TypedStructUtilitiesTest, ListsAreUnsupported) @@ -222,7 +308,8 @@ TYPED_TEST(TypedStructUtilitiesTest, ListsAreUnsupported) auto structs_with_lists_col = structs{lists_member, ints_member}; - EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{structs_with_lists_col}}), + EXPECT_THROW(flatten_nested_columns( + cudf::table_view{{structs_with_lists_col}}, {}, {}, column_nullability::FORCE), cudf::logic_error); } diff --git a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java index dd771cab7ea..1ae1b91b962 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java @@ -18,6 +18,10 @@ package ai.rapids.cudf; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + /** * Options for reading a parquet file */ @@ -26,24 +30,32 @@ public class ParquetOptions extends ColumnFilterOptions { public static ParquetOptions DEFAULT = new ParquetOptions(new Builder()); private final DType unit; - - + private final boolean[] readBinaryAsString; private ParquetOptions(Builder builder) { super(builder); unit = builder.unit; + readBinaryAsString = new boolean[builder.binaryAsStringColumns.size()]; + for (int i = 0 ; i < builder.binaryAsStringColumns.size() ; i++) { + readBinaryAsString[i] = builder.binaryAsStringColumns.get(i); + } } DType timeUnit() { return unit; } - public static Builder builder() { + boolean[] getReadBinaryAsString() { + return readBinaryAsString; + } + + public static ParquetOptions.Builder builder() { return new Builder(); } public static class Builder extends ColumnFilterOptions.Builder { private DType unit = DType.EMPTY; + final List binaryAsStringColumns = new ArrayList<>(); /** * Specify the time unit to use when returning timestamps. @@ -56,6 +68,43 @@ public Builder withTimeUnit(DType unit) { return this; } + /** + * Include one or more specific columns. Any column not included will not be read. + * @param names the name of the column, or more than one if you want. + */ + @Override + public Builder includeColumn(String... names) { + super.includeColumn(names); + for (int i = 0 ; i < names.length ; i++) { + binaryAsStringColumns.add(true); + } + return this; + } + + /** + * Include this column. + * @param name the name of the column + * @param isBinary whether this column is to be read in as binary + */ + public Builder includeColumn(String name, boolean isBinary) { + includeColumnNames.add(name); + binaryAsStringColumns.add(!isBinary); + return this; + } + + /** + * Include one or more specific columns. Any column not included will not be read. + * @param names the name of the column, or more than one if you want. 
+ */ + @Override + public Builder includeColumn(Collection names) { + super.includeColumn(names); + for (int i = 0 ; i < names.size() ; i++) { + binaryAsStringColumns.add(true); + } + return this; + } + public ParquetOptions build() { return new ParquetOptions(this); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index c8f842fcc63..e5194b8b7eb 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -239,16 +239,20 @@ private static native long[] readJSON(String[] columnNames, String filePath, long address, long length, boolean dayFirst, boolean lines) throws CudfException; + private static native long readAndInferJSON(long address, long length, + boolean dayFirst, boolean lines) throws CudfException; + /** * Read in Parquet formatted data. * @param filterColumnNames name of the columns to read, or an empty array if we want to read * all of them + * @param binaryToString whether to convert this column to String if binary * @param filePath the path of the file to read, or null if no path should be read. * @param address the address of the buffer to read from or 0 if we should not. * @param length the length of the buffer to read from. * @param timeUnit return type of TimeStamp in units */ - private static native long[] readParquet(String[] filterColumnNames, String filePath, + private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, long address, long length, int timeUnit) throws CudfException; /** @@ -918,6 +922,26 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon } } + /** + * Read JSON formatted data and infer the column names and schema. + * @param opts various JSON parsing options. + * @param buffer raw UTF8 formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @return the data parsed as a table on the GPU and the metadata for the table returned. + */ + public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer, + long offset, long len) { + if (len <= 0) { + len = buffer.length - offset; + } + assert len > 0; + assert len <= buffer.length - offset; + assert offset >= 0 && offset < buffer.length; + return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len, + opts.isDayFirst(), opts.isLines())); + } + /** * Read JSON formatted data. * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. @@ -956,7 +980,7 @@ public static Table readParquet(File path) { * @return the file parsed as a table on the GPU. 
*/ public static Table readParquet(ParquetOptions opts, File path) { - return new Table(readParquet(opts.getIncludeColumnNames(), + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId())); } @@ -1016,7 +1040,7 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, assert len > 0; assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; - return new Table(readParquet(opts.getIncludeColumnNames(), + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); } diff --git a/java/src/main/java/ai/rapids/cudf/TableWithMeta.java b/java/src/main/java/ai/rapids/cudf/TableWithMeta.java new file mode 100644 index 00000000000..9baa127d39d --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/TableWithMeta.java @@ -0,0 +1,67 @@ +/* + * + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +package ai.rapids.cudf; + +/** + * A table along with some metadata about the table. This is typically returned when + * reading data from an input file where the metadata can be important. + */ +public class TableWithMeta implements AutoCloseable { + private long handle; + + TableWithMeta(long handle) { + this.handle = handle; + } + + /** + * Get the table out of this metadata. Note that this can only be called once. Later calls + * will return a null. + */ + public Table releaseTable() { + long[] ptr = releaseTable(handle); + if (ptr == null) { + return null; + } else { + return new Table(ptr); + } + } + + /** + * Get the names of the top level columns. In the future new APIs can be added to get + * names of child columns. 
+ */ + public String[] getColumnNames() { + return getColumnNames(handle); + } + + @Override + public void close() throws Exception { + if (handle != 0) { + close(handle); + handle = 0; + } + } + + private static native void close(long handle); + + private static native long[] releaseTable(long handle); + + private static native String[] getColumnNames(long handle); +} diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index d511512431b..44c08aec110 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1314,6 +1314,77 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( CATCH_STD(env, NULL); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( + JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) { + + JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); + if (buffer_length <= 0) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "An empty buffer is not supported", 0); + } + + try { + cudf::jni::auto_set_device(env); + + auto source = cudf::io::source_info{reinterpret_cast(buffer), + static_cast(buffer_length)}; + + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)); + + auto result = + std::make_unique(cudf::io::read_json(opts.build())); + + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_TableWithMeta_close(JNIEnv *env, jclass, jlong handle) { + JNI_NULL_CHECK(env, handle, "handle is null", ); + + try { + cudf::jni::auto_set_device(env); + delete reinterpret_cast(handle); + } + CATCH_STD(env, ); +} + +JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_TableWithMeta_getColumnNames(JNIEnv *env, jclass, + jlong handle) { + JNI_NULL_CHECK(env, handle, "handle is null", nullptr); + + try { + cudf::jni::auto_set_device(env); + auto ptr = reinterpret_cast(handle); + auto length = ptr->metadata.column_names.size(); + auto ret = static_cast( + env->NewObjectArray(length, env->FindClass("java/lang/String"), nullptr)); + for (size_t i = 0; i < length; i++) { + env->SetObjectArrayElement(ret, i, env->NewStringUTF(ptr->metadata.column_names[i].c_str())); + } + + return ret; + } + CATCH_STD(env, nullptr); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIEnv *env, jclass, + jlong handle) { + JNI_NULL_CHECK(env, handle, "handle is null", nullptr); + + try { + cudf::jni::auto_set_device(env); + auto ptr = reinterpret_cast(handle); + if (ptr->tbl) { + return convert_table_for_return(env, ptr->tbl); + } else { + return nullptr; + } + } + CATCH_STD(env, nullptr); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) { @@ -1428,11 +1499,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( CATCH_STD(env, NULL); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env, jclass, - jobjectArray filter_col_names, - jstring inputfilepath, - jlong buffer, - jlong buffer_length, jint unit) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( + JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, + jstring inputfilepath, jlong buffer, jlong buffer_length, jint 
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index d511512431b..44c08aec110 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1314,6 +1314,77 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(
   CATCH_STD(env, NULL);
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
+    JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
+
+  JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
+  if (buffer_length <= 0) {
+    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "An empty buffer is not supported", 0);
+  }
+
+  try {
+    cudf::jni::auto_set_device(env);
+
+    auto source = cudf::io::source_info{reinterpret_cast<char *>(buffer),
+                                        static_cast<std::size_t>(buffer_length)};
+
+    cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
+                                                     .dayfirst(static_cast<bool>(day_first))
+                                                     .lines(static_cast<bool>(lines));
+
+    auto result =
+        std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
+
+    return reinterpret_cast<jlong>(result.release());
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_TableWithMeta_close(JNIEnv *env, jclass, jlong handle) {
+  JNI_NULL_CHECK(env, handle, "handle is null", );
+
+  try {
+    cudf::jni::auto_set_device(env);
+    delete reinterpret_cast<cudf::io::table_with_metadata *>(handle);
+  }
+  CATCH_STD(env, );
+}
+
+JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_TableWithMeta_getColumnNames(JNIEnv *env, jclass,
+                                                                                jlong handle) {
+  JNI_NULL_CHECK(env, handle, "handle is null", nullptr);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    auto ptr = reinterpret_cast<cudf::io::table_with_metadata *>(handle);
+    auto length = ptr->metadata.column_names.size();
+    auto ret = static_cast<jobjectArray>(
+        env->NewObjectArray(length, env->FindClass("java/lang/String"), nullptr));
+    for (size_t i = 0; i < length; i++) {
+      env->SetObjectArrayElement(ret, i, env->NewStringUTF(ptr->metadata.column_names[i].c_str()));
+    }
+
+    return ret;
+  }
+  CATCH_STD(env, nullptr);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIEnv *env, jclass,
+                                                                            jlong handle) {
+  JNI_NULL_CHECK(env, handle, "handle is null", nullptr);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    auto ptr = reinterpret_cast<cudf::io::table_with_metadata *>(handle);
+    if (ptr->tbl) {
+      return convert_table_for_return(env, ptr->tbl);
+    } else {
+      return nullptr;
+    }
+  }
+  CATCH_STD(env, nullptr);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON(
     JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
     jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
@@ -1428,11 +1499,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON(
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env, jclass,
-                                                                   jobjectArray filter_col_names,
-                                                                   jstring inputfilepath,
-                                                                   jlong buffer,
-                                                                   jlong buffer_length, jint unit) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(
+    JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read,
+    jstring inputfilepath, jlong buffer, jlong buffer_length, jint unit) {
+
+  JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0);
   bool read_buffer = true;
   if (buffer == 0) {
     JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
@@ -1454,6 +1525,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env,
   }
 
   cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
+  cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
 
   auto source = read_buffer ? cudf::io::source_info(reinterpret_cast<char *>(buffer),
                                                     static_cast<std::size_t>(buffer_length)) :
@@ -1461,7 +1533,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env,
 
   auto builder = cudf::io::parquet_reader_options::builder(source);
   if (n_filter_col_names.size() > 0) {
-    builder = builder.columns(n_filter_col_names.as_cpp_vector());
+    builder = builder.columns(n_filter_col_names.as_cpp_vector())
+                  .convert_binary_to_strings(n_col_binary_read.to_vector());
   }
 
   cudf::io::parquet_reader_options opts =
@@ -1678,10 +1751,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC(
                       cudf::io::source_info(reinterpret_cast<char *>(buffer), buffer_length) :
                       cudf::io::source_info(filename.get());
 
+    auto builder = cudf::io::orc_reader_options::builder(source);
+    if (n_filter_col_names.size() > 0) {
+      builder = builder.columns(n_filter_col_names.as_cpp_vector());
+    }
+
     cudf::io::orc_reader_options opts =
-        cudf::io::orc_reader_options::builder(source)
-            .columns(n_filter_col_names.as_cpp_vector())
-            .use_index(false)
+        builder.use_index(false)
             .use_np_dtypes(static_cast<bool>(usingNumPyTypes))
             .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
             .decimal128_columns(n_dec128_col_names.as_cpp_vector())
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 7ef47d6a7cc..c7e6fecea26 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -78,6 +78,7 @@ public class TableTest extends CudfTestBase {
   private static final File TEST_PARQUET_FILE = TestUtils.getResourceAsFile("acq.parquet");
+  private static final File TEST_PARQUET_FILE_BINARY = TestUtils.getResourceAsFile("binary.parquet");
   private static final File TEST_ORC_FILE = TestUtils.getResourceAsFile("TestOrcFile.orc");
   private static final File TEST_ORC_TIMESTAMP_DATE_FILE = TestUtils.getResourceAsFile("timestamp-date-test.orc");
   private static final File TEST_DECIMAL_PARQUET_FILE = TestUtils.getResourceAsFile("decimal.parquet");
@@ -566,6 +567,19 @@ void testReadParquet() {
     }
   }
 
+  @Test
+  void testReadParquetBinary() {
+    ParquetOptions opts = ParquetOptions.builder()
+        .includeColumn("value1", true)
+        .includeColumn("value2", false)
+        .build();
+    try (Table table = Table.readParquet(opts, TEST_PARQUET_FILE_BINARY)) {
+      assertTableTypes(new DType[]{DType.LIST, DType.STRING}, table);
+      ColumnView columnView = table.getColumn(0);
+      assertEquals(DType.INT8, columnView.getChildColumnView(0).getType());
+    }
+  }
+
   @Test
   void testReadParquetBuffer() throws IOException {
     ParquetOptions opts = ParquetOptions.builder()
diff --git a/java/src/test/resources/binary.parquet b/java/src/test/resources/binary.parquet
new file mode 100644
index 00000000000..b72be9f36cc
Binary files /dev/null and b/java/src/test/resources/binary.parquet differ
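One behavioral note on the ORC hunk above: `columns(...)` is now applied only when the filter list is non-empty, so a read without any column filter keeps every column instead of selecting none. A minimal sketch, assuming default options and a placeholder file name:

```java
import ai.rapids.cudf.ORCOptions;
import ai.rapids.cudf.Table;

import java.io.File;

public class OrcReadAllColumnsSketch {
  public static void main(String[] args) {
    // No column filter: with the change above, the builder skips columns(...)
    // entirely and the reader returns all columns in the file.
    try (Table table = Table.readORC(ORCOptions.DEFAULT, new File("example.orc"))) {
      System.out.println(table.getNumberOfColumns() + " columns read");
    }
  }
}
```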
diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd
index 2c65e329bb0..bc9d87a5cbf 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/json.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint8_t
 from libcpp cimport bool
@@ -24,6 +24,7 @@ cdef extern from "cudf/io/json.hpp" \
         size_type get_byte_range_size() except+
         bool is_enabled_lines() except+
         bool is_enabled_dayfirst() except+
+        bool is_enabled_experimental() except+
 
         # setter
         void set_dtypes(vector[data_type] types) except+
@@ -35,6 +36,7 @@ cdef extern from "cudf/io/json.hpp" \
         void set_byte_range_size(size_type size) except+
         void enable_lines(bool val) except+
         void enable_dayfirst(bool val) except+
+        void enable_experimental(bool val) except+
 
         @staticmethod
         json_reader_options_builder builder(
@@ -70,6 +72,9 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& dayfirst(
             bool val
         ) except+
+        json_reader_options_builder& experimental(
+            bool val
+        ) except+
 
         json_reader_options build() except+
diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd
index 62ff5eb4f53..3e44ef98348 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -19,7 +19,6 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_reader_options() except+
 
         cudf_io_types.source_info get_source() except+
-        vector[string] get_columns() except+
         vector[vector[size_type]] get_stripes() except+
         size_type get_skip_rows() except+
         size_type get_num_rows() except+
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 9c820a56104..0ee6062e7f2 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -31,7 +31,8 @@ cpdef read_json(object filepaths_or_buffers,
                 object dtype,
                 bool lines,
                 object compression,
-                object byte_range):
+                object byte_range,
+                bool experimental):
     """
     Cython function to call into libcudf API, see `read_json`.
@@ -98,6 +99,7 @@ cpdef read_json(object filepaths_or_buffers,
             .lines(c_lines)
             .byte_range_offset(c_range_offset)
             .byte_range_size(c_range_size)
+            .experimental(experimental)
             .build()
         )
     if is_list_like_dtypes:
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 4d1090d8434..11c70317a39 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -103,7 +103,7 @@ cpdef read_orc(object filepaths_or_buffers,
     """
     cdef orc_reader_options c_orc_reader_options = make_orc_reader_options(
         filepaths_or_buffers,
-        columns or [],
+        columns,
         stripes or [],
         get_size_t_arg(skip_rows, "skip_rows"),
         get_size_t_arg(num_rows, "num_rows"),
@@ -325,16 +325,11 @@ cdef orc_reader_options make_orc_reader_options(
     for i, datasource in enumerate(filepaths_or_buffers):
         if isinstance(datasource, NativeFile):
             filepaths_or_buffers[i] = NativeFileDatasource(datasource)
-    cdef vector[string] c_column_names
     cdef vector[vector[size_type]] strps = stripes
-    c_column_names.reserve(len(column_names))
-    for col in column_names:
-        c_column_names.push_back(str(col).encode())
     cdef orc_reader_options opts
     cdef source_info src = make_source_info(filepaths_or_buffers)
     opts = move(
         orc_reader_options.builder(src)
-        .columns(c_column_names)
         .stripes(strps)
         .skip_rows(skip_rows)
         .num_rows(num_rows)
@@ -343,6 +338,13 @@ cdef orc_reader_options make_orc_reader_options(
         .build()
     )
 
+    cdef vector[string] c_column_names
+    if column_names is not None:
+        c_column_names.reserve(len(column_names))
+        for col in column_names:
+            c_column_names.push_back(str(col).encode())
+        opts.set_columns(c_column_names)
+
     return opts
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 264b1fb507b..c25360b307d 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -177,9 +177,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     allow_range_index = True
     if columns is not None:
         cpp_columns.reserve(len(columns))
-        if len(cpp_columns) == 0:
-            allow_range_index = False
-        for col in columns or []:
+        allow_range_index = False
+        for col in columns:
             cpp_columns.push_back(str(col).encode())
         args.set_columns(cpp_columns)
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 85f024e2420..e1e8e7cdb3d 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -30,7 +30,7 @@ def read_json(
         raise ValueError("cudf engine only supports JSON Lines format")
     if engine == "auto":
         engine = "cudf" if lines else "pandas"
-    if engine == "cudf":
+    if engine == "cudf" or engine == "cudf_experimental":
         # Multiple sources are passed as a list. If a single source is passed,
         # wrap it in a list for unified processing downstream.
         if not is_list_like(path_or_buf):
@@ -58,7 +58,12 @@ def read_json(
                 filepaths_or_buffers.append(tmp_source)
 
         df = libjson.read_json(
-            filepaths_or_buffers, dtype, lines, compression, byte_range
+            filepaths_or_buffers,
+            dtype,
+            lines,
+            compression,
+            byte_range,
+            engine == "cudf_experimental",
         )
     else:
         warnings.warn(
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 358687d36c3..236fd619b8e 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -2674,7 +2674,7 @@ def test_rangeindex_join_user_option(default_integer_bitwidth):
     idx1 = cudf.RangeIndex(0, 10)
     idx2 = cudf.RangeIndex(5, 15)
 
-    actual = idx1.join(idx2, how="inner")
+    actual = idx1.join(idx2, how="inner", sort=True)
     expected = cudf.Index(
         [5, 6, 7, 8, 9], dtype=f"int{default_integer_bitwidth}", name=0
     )
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 84cf5872219..800ed68e8a4 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -573,3 +573,9 @@ def test_default_float_bitwidth(default_float_bitwidth):
     )
     assert df["a"].dtype == np.dtype(f"f{default_float_bitwidth//8}")
     assert df["b"].dtype == np.dtype(f"f{default_float_bitwidth//8}")
+
+
+def test_json_experimental():
+    # should raise an exception, for now
+    with pytest.raises(RuntimeError):
+        cudf.read_json("", engine="cudf_experimental")
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 132eb528cd0..4373ef9afdf 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1758,3 +1758,25 @@ def test_orc_writer_zlib_compression(list_struct_buff):
             pytest.mark.xfail(reason="nvcomp build doesn't have deflate")
         else:
             raise e
+
+
+@pytest.mark.parametrize("index", [True, False, None])
+@pytest.mark.parametrize("columns", [None, [], ["b", "a"]])
+def test_orc_columns_and_index_param(index, columns):
+    buffer = BytesIO()
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    df.to_orc(buffer, index=index)
+
+    expected = pd.read_orc(buffer, columns=columns)
+    got = cudf.read_orc(buffer, columns=columns)
+
+    if columns:
+        # TODO: Remove workaround after this issue is fixed:
+        # https://github.com/pandas-dev/pandas/issues/47944
+        assert_eq(
+            expected.sort_index(axis=1),
+            got.sort_index(axis=1),
+            check_index_type=True,
+        )
+    else:
+        assert_eq(expected, got, check_index_type=True)
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 3771587eb47..d3c41de842a 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -463,7 +463,7 @@
 function or `StringIO`). Multiple inputs may be provided as a list. If a list
 is specified each list entry may be of a different input type as long as
 each input is of a valid type and all input JSON schema(s) match.
-engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto'
+engine : {{ 'auto', 'cudf', 'cudf_experimental', 'pandas' }}, default 'auto'
     Parser engine to use. If 'auto' is passed, the engine will be
     automatically selected based on the other parameters.
 orient : string,
diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py
index 575683bc5fa..7d8a6d7c3a3 100644
--- a/python/dask_cudf/setup.py
+++ b/python/dask_cudf/setup.py
@@ -10,8 +10,8 @@
 
 install_requires = [
     "cudf",
-    "dask>=2022.05.2",
-    "distributed>=2022.05.2",
+    "dask==2022.7.1",
+    "distributed==2022.7.1",
     "fsspec>=0.6.0",
     "numpy",
     "pandas>=1.0,<1.5.0dev0",