From c77075ee900eff68afccfac78a5d2229df5b9a2c Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Fri, 26 May 2023 18:49:49 +0800 Subject: [PATCH 01/17] Add pairwise diff function --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/compute/api_vector.cc | 19 ++ cpp/src/arrow/compute/api_vector.h | 31 +++ cpp/src/arrow/compute/kernels/CMakeLists.txt | 1 + .../arrow/compute/kernels/vector_pairwise.cc | 256 ++++++++++++++++++ .../compute/kernels/vector_pairwise_test.cc | 197 ++++++++++++++ cpp/src/arrow/compute/registry.cc | 1 + cpp/src/arrow/compute/registry_internal.h | 2 +- docs/source/cpp/compute.rst | 17 ++ docs/source/python/api/compute.rst | 9 + python/pyarrow/_compute.pyx | 19 ++ python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 6 + python/pyarrow/tests/test_compute.py | 31 +++ 14 files changed, 590 insertions(+), 1 deletion(-) create mode 100644 cpp/src/arrow/compute/kernels/vector_pairwise.cc create mode 100644 cpp/src/arrow/compute/kernels/vector_pairwise_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 00cf899349aa1..e283fd7ee92b1 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -456,6 +456,7 @@ if(ARROW_COMPUTE) compute/kernels/scalar_validity.cc compute/kernels/vector_array_sort.cc compute/kernels/vector_cumulative_ops.cc + compute/kernels/vector_pairwise.cc compute/kernels/vector_nested.cc compute/kernels/vector_rank.cc compute/kernels/vector_replace.cc diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index b33e3feb72993..8abd2686c4adc 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -35,6 +35,7 @@ #include "arrow/result.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" +#include "arrow/util/reflection_internal.h" namespace arrow { @@ -150,6 +151,9 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("sort_keys", &RankOptions::sort_keys), DataMember("null_placement", &RankOptions::null_placement), DataMember("tiebreaker", &RankOptions::tiebreaker)); +static auto kPairwiseDiffOptionsType = GetFunctionOptionsType( + DataMember("periods", &PairwiseDiffOptions::periods), + DataMember("check_overflow", &PairwiseDiffOptions::check_overflow)); } // namespace } // namespace internal @@ -217,6 +221,12 @@ RankOptions::RankOptions(std::vector sort_keys, NullPlacement null_plac tiebreaker(tiebreaker) {} constexpr char RankOptions::kTypeName[]; +PairwiseDiffOptions::PairwiseDiffOptions(double periods, bool check_overflow) + : FunctionOptions(internal::kPairwiseDiffOptionsType), + periods(periods), + check_overflow(check_overflow) {} +constexpr char PairwiseDiffOptions::kTypeName[]; + namespace internal { void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType)); @@ -229,6 +239,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kSelectKOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseDiffOptionsType)); } } // namespace internal @@ -338,6 +349,14 @@ Result> ValueCounts(const Datum& value, ExecContext return checked_pointer_cast(result.make_array()); } +Result> PairwiseDiff(const Array& array, + const PairwiseDiffOptions& options, + ExecContext* ctx) { + ARROW_ASSIGN_OR_RAISE(Datum result, + CallFunction("pairwise_diff", {Datum(array)}, &options, ctx)); + return result.make_array(); +} + // ---------------------------------------------------------------------- // Filter- and take-related selection functions diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 56bccb38c2b53..ff8ae46b9cfeb 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -234,6 +234,21 @@ class ARROW_EXPORT CumulativeOptions : public FunctionOptions { }; using CumulativeSumOptions = CumulativeOptions; // For backward compatibility +/// \brief Options for pairwise functions +class ARROW_EXPORT PairwiseDiffOptions : public FunctionOptions { + public: + explicit PairwiseDiffOptions(double periods = 1, bool check_overflow = false); + static constexpr char const kTypeName[] = "PairwiseDiffOptions"; + static PairwiseDiffOptions Defaults() { return PairwiseDiffOptions(); } + + /// Periods to shift for applying the binary operation, accepts negative values. + int64_t periods = 1; + + /// When true, returns an Invalid Status when overflow is detected, otherwise result is + /// wrapped around. + bool check_overflow = false; +}; + /// @} /// \brief Filter with a boolean selection filter @@ -650,6 +665,22 @@ Result CumulativeMin( const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), ExecContext* ctx = NULLPTR); +/// \brief Return the first order difference of an array. +/// +/// Computes the first order difference of an array, i.e. output[i] = input[i] - input[i - +/// p] if i >= p, otherwise output[i] = null, where p is the period. For example, with p = +/// 1, Diff([1, 4, 9, 10, 15]) = [null, 3, 5, 1, 5]. With p = 2, Diff([1, 4, 9, 10, 15]) = +/// [null, null, 8, 6, 6] +/// +/// \param[in] datum array input +/// \param[in] options options, specifying overflow behavior and period +/// \param[in] ctx the function execution context, optional +/// \return result as array +ARROW_EXPORT +Result> PairwiseDiff(const Array& array, + const PairwiseDiffOptions& options, + ExecContext* ctx = NULLPTR); + // ---------------------------------------------------------------------- // Deprecated functions diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index dcb024089475c..1afeb419c4958 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -69,6 +69,7 @@ add_arrow_benchmark(scalar_temporal_benchmark PREFIX "arrow-compute") add_arrow_compute_test(vector_test SOURCES vector_cumulative_ops_test.cc + vector_pairwise_test.cc vector_hash_test.cc vector_nested_test.cc vector_replace_test.cc diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise.cc b/cpp/src/arrow/compute/kernels/vector_pairwise.cc new file mode 100644 index 0000000000000..97ea6110e757e --- /dev/null +++ b/cpp/src/arrow/compute/kernels/vector_pairwise.cc @@ -0,0 +1,256 @@ +#include +#include "arrow/builder.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/exec.h" +#include "arrow/compute/function.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/base_arithmetic_internal.h" +#include "arrow/compute/kernels/codegen_internal.h" +#include "arrow/compute/registry.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" +#include "arrow/visit_type_inline.h" + +namespace arrow::compute::internal { + +template +Result> GetDiffOutputType( + const std::shared_ptr& type) { + std::shared_ptr output_type; + if constexpr (is_timestamp_type::value) { // timestamp -> duration with same + // time unit + const auto* real_type = checked_cast(type.get()); + return std::make_shared(real_type->unit()); + } else if constexpr (is_time_type::value) { // time -> duration with same + // time unit + const auto* real_type = checked_cast(type.get()); + return std::make_shared(real_type->unit()); + } else if constexpr (is_date_type::value) { // date -> duration + if constexpr (InputType::type_id == Type::DATE32) { // date32 -> second + return duration(TimeUnit::SECOND); + } else { // date64 -> millisecond + return duration(TimeUnit::MILLI); + } + } else if constexpr (is_decimal_type::value) { // decimal -> decimal with + // precision + 1 + const auto* real_type = checked_cast(type.get()); + if constexpr (InputType::type_id == Type::DECIMAL128) { + return Decimal128Type::Make(real_type->precision() + 1, real_type->scale()); + } else { + return Decimal256Type::Make(real_type->precision() + 1, real_type->scale()); + } + } else { + return type; + } +} + +/// A generic pairwise implementation that can be reused by different Ops. +template +Status PairwiseKernelImpl(const ArraySpan& input, int64_t periods, + const std::shared_ptr& output_type, + std::shared_ptr* result) { + typename TypeTraits::BuilderType builder(output_type, + default_memory_pool()); + RETURN_NOT_OK(builder.Reserve(input.length)); + + Status status; + auto valid_func = [&](typename GetViewType::T left, + typename GetViewType::T right) { + auto result = Op::template Call::T>( + nullptr, left, right, &status); + builder.UnsafeAppend(result); + }; + auto null_func = [&]() { builder.UnsafeAppendNull(); }; + + if (periods > 0) { + periods = std::min(periods, input.length); + RETURN_NOT_OK(builder.AppendNulls(periods)); + ArraySpan left(input); + left.SetSlice(periods, input.length - periods); + ArraySpan right(input); + right.SetSlice(0, input.length - periods); + VisitTwoArrayValuesInline(left, right, valid_func, null_func); + RETURN_NOT_OK(status); + } else { + periods = std::max(periods, -input.length); + ArraySpan left(input); + left.SetSlice(0, input.length + periods); + ArraySpan right(input); + right.SetSlice(-periods, input.length + periods); + VisitTwoArrayValuesInline(left, right, valid_func, null_func); + RETURN_NOT_OK(status); + RETURN_NOT_OK(builder.AppendNulls(-periods)); + } + RETURN_NOT_OK(builder.FinishInternal(result)); + return Status::OK(); +} + +template +Status PairwiseDiffKernel(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const PairwiseDiffOptions& options = OptionsWrapper::Get(ctx); + std::shared_ptr result; + auto input = batch[0].array; + ARROW_ASSIGN_OR_RAISE(auto output_type, + GetDiffOutputType(input.type->GetSharedPtr())); + if (options.check_overflow) { + RETURN_NOT_OK((PairwiseKernelImpl( + batch[0].array, options.periods, output_type, &result))); + } else { + RETURN_NOT_OK((PairwiseKernelImpl( + batch[0].array, options.periods, output_type, &result))); + } + + out->value = std::move(result); + return Status::OK(); +} + +const FunctionDoc pairwise_diff_doc( + "Compute first order difference of an array", + ("This function computes the first order difference of an array, i.e. output[i]\n" + "= input[i] - input[i - p] if i >= p, otherwise output[i] = null, where p is \n" + "the period. The period can also be negative. It internally calls the scalar \n" + "function Subtract to compute the differences, so its behavior and supported \n" + "types are the same as Subtract.\n" + "\n" + "The period and handling of overflow can be specified in PairwiseDiffOptions."), + {"input"}, "PairwiseDiffOptions"); + +const PairwiseDiffOptions* GetDefaultPairwiseDiffOptions() { + static const auto kDefaultPairwiseDiffOptions = PairwiseDiffOptions::Defaults(); + return &kDefaultPairwiseDiffOptions; +} + +struct PairwiseKernelData { + InputType input; + OutputType output; + ArrayKernelExec exec; +}; + +void RegisterPairwiseDiffKernels(FunctionRegistry* registry) { + std::vector pairwise_diff_kernels = { + {int8(), int8(), PairwiseDiffKernel}, + {uint8(), uint8(), + PairwiseDiffKernel}, + {int16(), int16(), + PairwiseDiffKernel}, + {uint16(), uint16(), + PairwiseDiffKernel}, + {int32(), int32(), + PairwiseDiffKernel}, + {uint32(), uint32(), + PairwiseDiffKernel}, + {int64(), int64(), + PairwiseDiffKernel}, + {uint64(), uint64(), + PairwiseDiffKernel}, + {float32(), float32(), + PairwiseDiffKernel}, + {float64(), float64(), + PairwiseDiffKernel}, + + }; + + auto decimal_resolver = [](KernelContext*, + const std::vector& types) -> Result { + // Subtract increase decimal precision by one + DCHECK(is_decimal(types[0].id())); + auto decimal_type = checked_cast(types[0].type); + return decimal(decimal_type->precision() + 1, decimal_type->scale()); + }; + + pairwise_diff_kernels.emplace_back(PairwiseKernelData{ + Type::DECIMAL128, OutputType(decimal_resolver), + PairwiseDiffKernel}); + pairwise_diff_kernels.emplace_back(PairwiseKernelData{ + Type::DECIMAL256, OutputType(decimal_resolver), + PairwiseDiffKernel}); + + auto identity_resolver = + [](KernelContext*, const std::vector& types) -> Result { + return types[0]; + }; + + // timestamp -> duration + for (auto unit : TimeUnit::values()) { + InputType in_type(match::TimestampTypeUnit(unit)); + OutputType out_type(duration(unit)); + auto exec = + PairwiseDiffKernel; + pairwise_diff_kernels.emplace_back(PairwiseKernelData{in_type, out_type, exec}); + } + + // duration -> duration + for (auto unit : TimeUnit::values()) { + InputType in_type(match::DurationTypeUnit(unit)); + OutputType out_type(identity_resolver); + auto exec = PairwiseDiffKernel; + pairwise_diff_kernels.emplace_back( + PairwiseKernelData{in_type, out_type, std::move(exec)}); + } + + // time32 -> duration + for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI}) { + InputType in_type(match::Time32TypeUnit(unit)); + OutputType out_type(duration(unit)); + auto exec = PairwiseDiffKernel; + pairwise_diff_kernels.emplace_back( + PairwiseKernelData{in_type, out_type, std::move(exec)}); + } + + // time64 -> duration + for (auto unit : {TimeUnit::MICRO, TimeUnit::NANO}) { + InputType in_type(match::Time64TypeUnit(unit)); + OutputType out_type(duration(unit)); + auto exec = PairwiseDiffKernel; + pairwise_diff_kernels.emplace_back( + PairwiseKernelData{in_type, out_type, std::move(exec)}); + } + + // date32 -> duration(TimeUnit::SECOND) + { + InputType in_type(date32()); + OutputType out_type(duration(TimeUnit::SECOND)); + auto exec = PairwiseDiffKernel; + pairwise_diff_kernels.emplace_back( + PairwiseKernelData{in_type, out_type, std::move(exec)}); + } + + // date64 -> duration(TimeUnit::MILLI) + { + InputType in_type(date64()); + OutputType out_type(duration(TimeUnit::MILLI)); + auto exec = PairwiseDiffKernel; + pairwise_diff_kernels.emplace_back( + PairwiseKernelData{in_type, out_type, std::move(exec)}); + } + + VectorKernel base_kernel; + base_kernel.can_execute_chunkwise = false; + base_kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + base_kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + base_kernel.init = OptionsWrapper::Init; + auto func = + std::make_shared("pairwise_diff", Arity::Unary(), pairwise_diff_doc, + GetDefaultPairwiseDiffOptions()); + + for (const auto& kernel_data : pairwise_diff_kernels) { + base_kernel.signature = + KernelSignature::Make({kernel_data.input}, kernel_data.output); + base_kernel.exec = kernel_data.exec; + DCHECK_OK(func->AddKernel(base_kernel)); + } + + DCHECK_OK(registry->AddFunction(std::move(func))); +} + +void RegisterVectorPairwise(FunctionRegistry* registry) { + RegisterPairwiseDiffKernels(registry); +} + +} // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc b/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc new file mode 100644 index 0000000000000..b7f2fedcd9dfa --- /dev/null +++ b/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc @@ -0,0 +1,197 @@ +#include +#include +#include +#include "arrow/compute/api_vector.h" +#include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/registry.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/testing/util.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "gmock/gmock.h" + +namespace arrow::compute { + +Result> GetOutputType( + const std::shared_ptr input_type) { + switch (input_type->id()) { + case Type::TIMESTAMP: { + return duration(checked_cast(*input_type).unit()); + } + case Type::TIME32: { + return duration(checked_cast(*input_type).unit()); + } + case Type::TIME64: { + return duration(checked_cast(*input_type).unit()); + } + case Type::DATE32: { + return duration(TimeUnit::SECOND); + } + case Type::DATE64: { + return duration(TimeUnit::MILLI); + } + case Type::DECIMAL128: { + const auto& real_type = checked_cast(*input_type); + return Decimal128Type::Make(real_type.precision() + 1, real_type.scale()); + } + case Type::DECIMAL256: { + const auto& real_type = checked_cast(*input_type); + return Decimal256Type::Make(real_type.precision() + 1, real_type.scale()); + } + default: { + return input_type; + } + } +} + +class TestDiffKernel : public ::testing::Test { + public: + void SetUp() override { + test_numerical_types_ = NumericTypes(); + test_temporal_types_ = TemporalTypes(); + test_decimal_types_ = {decimal(4, 2), decimal(70, 10)}; + + test_input_types_.insert(test_input_types_.end(), test_numerical_types_.begin(), + test_numerical_types_.end()); + test_input_types_.insert(test_input_types_.end(), test_temporal_types_.begin(), + test_temporal_types_.end()); + test_input_types_.insert(test_input_types_.end(), test_decimal_types_.begin(), + test_decimal_types_.end()); + } + + protected: + std::vector> test_numerical_types_; + std::vector> test_temporal_types_; + std::vector> test_decimal_types_; + std::vector> test_input_types_; +}; + +TEST_F(TestDiffKernel, Empty) { + for (int64_t period = -2; period <= 2; ++period) { + PairwiseDiffOptions options(period); + for (auto input_type : test_input_types_) { + ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); + auto input = ArrayFromJSON(input_type, "[]"); + auto output = ArrayFromJSON(output_type, "[]"); + CheckVectorUnary("pairwise_diff", input, output, &options); + } + } +} + +TEST_F(TestDiffKernel, AllNull) { + for (int64_t period = -2; period <= 2; ++period) { + PairwiseDiffOptions options(period); + for (auto input_type : test_input_types_) { + ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); + auto input = ArrayFromJSON(input_type, "[null, null, null]"); + auto output = ArrayFromJSON(output_type, "[null, null, null]"); + CheckVectorUnary("pairwise_diff", input, output, &options); + } + } +} + +TEST_F(TestDiffKernel, Numeric) { + { + PairwiseDiffOptions options(1); + for (auto input_type : test_numerical_types_) { + ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); + auto input = ArrayFromJSON(input_type, "[null, 1, 2, null, 4, 5, 6]"); + auto output = ArrayFromJSON(output_type, "[null, null, 1, null, null, 1, 1]"); + CheckVectorUnary("pairwise_diff", input, output, &options); + } + } + + { + PairwiseDiffOptions options(2); + for (auto input_type : test_numerical_types_) { + ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); + auto input = ArrayFromJSON(input_type, "[null, 1, 2, null, 4, 5, 6]"); + auto output = ArrayFromJSON(output_type, "[null, null, null, null, 2, null, 2]"); + CheckVectorUnary("pairwise_diff", input, output, &options); + } + } + + { + PairwiseDiffOptions options(-1); + for (auto input_type : test_numerical_types_) { + ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); + auto input = ArrayFromJSON(input_type, "[6, 5, 4, null, 2, 1, null]"); + auto output = ArrayFromJSON(output_type, "[1, 1, null, null, 1, null, null]"); + CheckVectorUnary("pairwise_diff", input, output, &options); + } + } + + { + PairwiseDiffOptions options(-2); + for (auto input_type : test_numerical_types_) { + ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); + auto input = ArrayFromJSON(input_type, "[6, 5, 4, null, 2, 1, null]"); + auto output = ArrayFromJSON(output_type, "[2, null, 2, null, null, null, null]"); + CheckVectorUnary("pairwise_diff", input, output, &options); + } + } +} + +TEST_F(TestDiffKernel, Overflow) { + { + PairwiseDiffOptions options(1); + auto input = ArrayFromJSON(uint8(), "[3, 2, 1]"); + auto output = ArrayFromJSON(uint8(), "[null, 255, 255]"); + CheckVectorUnary("pairwise_diff", input, output, &options); + } + + { + PairwiseDiffOptions options(1, /*check_overflow=*/true); + auto input = ArrayFromJSON(uint8(), "[3, 2, 1]"); + auto output = ArrayFromJSON(uint8(), "[null, 255, 255]"); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("overflow"), + CallFunction("pairwise_diff", {input}, &options)); + } +} + +TEST_F(TestDiffKernel, Temporal) { + { + PairwiseDiffOptions options(1); + for (auto input_type : test_temporal_types_) { + ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); + auto input = ArrayFromJSON(input_type, "[null, 5, 1, null, 9, 6, 37]"); + auto output = ArrayFromJSON( + output_type, + input_type->id() != Type::DATE32 // Subtract date32 results in seconds + ? "[null, null, -4, null, null, -3, 31]" + : "[null, null, -345600, null, null, -259200, 2678400]"); + CheckVectorUnary("pairwise_diff", input, output, &options); + } + } +} + +TEST_F(TestDiffKernel, Decimal) { + { + PairwiseDiffOptions options(1); + auto input = ArrayFromJSON(decimal(4, 2), R"(["11.00", "22.11", "-10.25", "33.45"])"); + auto output = ArrayFromJSON(decimal(5, 2), R"([null, "11.11", "-32.36", "43.70"])"); + CheckVectorUnary("pairwise_diff", input, output, &options); + } + + { + PairwiseDiffOptions options(-1); + auto input = ArrayFromJSON( + decimal(40, 30), + R"(["1111111111.222222222222222222222222222222", "2222222222.333333333333333333333333333333"])"); + auto output = ArrayFromJSON( + decimal(41, 30), R"(["-1111111111.111111111111111111111111111111", null])"); + CheckVectorUnary("pairwise_diff", input, output, &options); + } + + { /// Out of range decimal precision + PairwiseDiffOptions options(1); + auto input = ArrayFromJSON(decimal(38, 0), R"(["1e38"])"); + + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + testing::HasSubstr("Decimal precision out of range"), + CallFunction("pairwise_diff", {input}, &options)); + } +} +} // namespace arrow::compute diff --git a/cpp/src/arrow/compute/registry.cc b/cpp/src/arrow/compute/registry.cc index fe8a83a3f6eae..a4b484a2069ea 100644 --- a/cpp/src/arrow/compute/registry.cc +++ b/cpp/src/arrow/compute/registry.cc @@ -310,6 +310,7 @@ static std::unique_ptr CreateBuiltInRegistry() { RegisterVectorSort(registry.get()); RegisterVectorRunEndEncode(registry.get()); RegisterVectorRunEndDecode(registry.get()); + RegisterVectorPairwise(registry.get()); // Aggregate functions RegisterHashAggregateBasic(registry.get()); diff --git a/cpp/src/arrow/compute/registry_internal.h b/cpp/src/arrow/compute/registry_internal.h index 6628d09716544..b4239701d9573 100644 --- a/cpp/src/arrow/compute/registry_internal.h +++ b/cpp/src/arrow/compute/registry_internal.h @@ -54,7 +54,7 @@ void RegisterVectorSelection(FunctionRegistry* registry); void RegisterVectorSort(FunctionRegistry* registry); void RegisterVectorRunEndEncode(FunctionRegistry* registry); void RegisterVectorRunEndDecode(FunctionRegistry* registry); - +void RegisterVectorPairwise(FunctionRegistry* registry); void RegisterVectorOptions(FunctionRegistry* registry); // Aggregate functions diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 70c17ae2b96ea..e76335f4f0ad2 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1847,3 +1847,20 @@ replaced, based on the remaining inputs. results in a corresponding null in the output. Also see: :ref:`if_else `. + +Pairwise functions +~~~~~~~~~~~~~~~~~~~~ +Pairwise functions are unary vector functions that performs a binary operation on +a pair of elements in the input array, typically on adjacent elements. + ++------------------------+-------+----------------------+----------------------+--------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++========================+=======+======================+======================+================================+=======+ +| pairwise_diff | Unary | Numeric/Temporal | Numeric/Temporal | :struct:`PairwiseDiffOptions` | \(1) | ++------------------------+-------+----------------------+----------------------+--------------------------------+-------+ +* \(1) Computes the first order difference of an array, i.e. output[i] = + input[i] - input[i-period]. The default period is 1. The period can also + be negative. It internally calls the scalar function ``Subtract`` to compute + the differences, so its behavior and supported types are the same as + ``Subtract``. The period and handling of overflow can be specified in + :struct:`PairwiseDiffOptions`. diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 43deedd653425..655a049ba4724 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -521,6 +521,14 @@ Structural Transforms replace_with_mask struct_field +Pairwise Functions +------------------ + +.. autosummary:: + :toctree: ../generated/ + + pairwise_diff + Compute Options --------------- @@ -547,6 +555,7 @@ Compute Options ModeOptions NullOptions PadOptions + PairwiseDiffOptions PartitionNthOptions QuantileOptions ReplaceSliceOptions diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index d1aded326d5c8..ec4b20f575117 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1968,6 +1968,25 @@ class CumulativeOptions(_CumulativeOptions): def __init__(self, start=None, *, skip_nulls=False): self._set_options(start, skip_nulls) +cdef class _PairwiseDiffOptions(FunctionOptions): + def _set_options(self, period, check_overflow): + self.wrapped.reset(new CPairwiseDiffOptions(period, check_overflow)) + + +class PairwiseDiffOptions(_PairwiseDiffOptions): + """ + Options for `pairwise_diff` function. + + Parameters + ---------- + period : int, default 1 + Period for computing difference. + check_overflow : bool, default False + When true, return error on overflow. When false, wrap around on overflow. + """ + + def __init__(self, period=1, check_overflow=False): + self._set_options(period, check_overflow) cdef class _ArraySortOptions(FunctionOptions): def _set_options(self, order, null_placement): diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 3d428758a497c..b7a8cf38095f4 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -50,6 +50,7 @@ ModeOptions, NullOptions, PadOptions, + PairwiseDiffOptions, PartitionNthOptions, QuantileOptions, RandomOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 37a261c833431..a45c9703ca671 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2407,6 +2407,12 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: optional[shared_ptr[CScalar]] start c_bool skip_nulls + cdef cppclass CPairwiseDiffOptions \ + "arrow::compute::PairwiseDiffOptions"(CFunctionOptions): + CPairwiseDiffOptions(int64_t period, c_bool skip_nulls) + int64_t period + c_bool skip_nulls + cdef cppclass CArraySortOptions \ "arrow::compute::ArraySortOptions"(CFunctionOptions): CArraySortOptions(CSortOrder, CNullPlacement) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 38bdeb126348b..da064abe4da57 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -155,6 +155,7 @@ def test_option_class_equality(): pc.ModeOptions(), pc.NullOptions(), pc.PadOptions(5), + pc.PairwiseDiffOptions(period=1, check_overflow=False), pc.PartitionNthOptions(1, null_placement="at_start"), pc.CumulativeOptions(start=None, skip_nulls=False), pc.QuantileOptions(), @@ -3481,3 +3482,33 @@ def test_run_end_encode(): check_run_end_encode_decode(pc.RunEndEncodeOptions(pa.int16())) check_run_end_encode_decode(pc.RunEndEncodeOptions('int32')) check_run_end_encode_decode(pc.RunEndEncodeOptions(pa.int64())) + + +def test_pairwise_diff(): + arr = pa.array([1, 2, 3, None, 4, 5]) + expected = pa.array([None, 1, 1, None, None, 1]) + result = pa.compute.pairwise_diff(arr, period=1) + assert result.equals(expected) + + arr = pa.array([1, 2, 3, None, 4, 5]) + expected = pa.array([None, None, 2, None, 1, None]) + result = pa.compute.pairwise_diff(arr, period=2) + assert result.equals(expected) + + # negative period + arr = pa.array([1, 2, 3, None, 4, 5], type=pa.int8()) + expected = pa.array([-1, -1, None, None, -1, None], type=pa.int8()) + result = pa.compute.pairwise_diff(arr, period=-1) + assert result.equals(expected) + + # wrap around overflow + arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) + expected = pa.array([255, 255, None, None, 255, None], type=pa.uint8()) + result = pa.compute.pairwise_diff(arr, period=-1) + assert result.equals(expected) + + # fail on overflow + arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) + with pytest.raises(pa.ArrowInvalid, + match="overflow"): + pa.compute.pairwise_diff(arr, period=-1, check_overflow=True) From 789bc9e36316256f83ae8f492373126302a52e62 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Fri, 26 May 2023 19:09:20 +0800 Subject: [PATCH 02/17] make linter happy --- .../arrow/compute/kernels/vector_pairwise.cc | 20 ++++++++++++++++++- .../compute/kernels/vector_pairwise_test.cc | 17 ++++++++++++++++ python/pyarrow/_compute.pyx | 2 ++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise.cc b/cpp/src/arrow/compute/kernels/vector_pairwise.cc index 97ea6110e757e..9269df6b74589 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise.cc @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Vector kernels for pairwise computation + #include #include "arrow/builder.h" #include "arrow/compute/api_vector.h" @@ -152,7 +171,6 @@ void RegisterPairwiseDiffKernels(FunctionRegistry* registry) { PairwiseDiffKernel}, {float64(), float64(), PairwiseDiffKernel}, - }; auto decimal_resolver = [](KernelContext*, diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc b/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc index b7f2fedcd9dfa..b20cff00f18bf 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + #include #include #include diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index ec4b20f575117..c3ba51081d147 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1968,6 +1968,7 @@ class CumulativeOptions(_CumulativeOptions): def __init__(self, start=None, *, skip_nulls=False): self._set_options(start, skip_nulls) + cdef class _PairwiseDiffOptions(FunctionOptions): def _set_options(self, period, check_overflow): self.wrapped.reset(new CPairwiseDiffOptions(period, check_overflow)) @@ -1988,6 +1989,7 @@ class PairwiseDiffOptions(_PairwiseDiffOptions): def __init__(self, period=1, check_overflow=False): self._set_options(period, check_overflow) + cdef class _ArraySortOptions(FunctionOptions): def _set_options(self, order, null_placement): self.wrapped.reset(new CArraySortOptions( From 885e78cc5a9b227fdf9a9a209de2dae47732c3d9 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Fri, 26 May 2023 20:48:50 +0800 Subject: [PATCH 03/17] fix doc --- cpp/src/arrow/compute/api_vector.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index ff8ae46b9cfeb..a618287529a80 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -672,7 +672,7 @@ Result CumulativeMin( /// 1, Diff([1, 4, 9, 10, 15]) = [null, 3, 5, 1, 5]. With p = 2, Diff([1, 4, 9, 10, 15]) = /// [null, null, 8, 6, 6] /// -/// \param[in] datum array input +/// \param[in] array array input /// \param[in] options options, specifying overflow behavior and period /// \param[in] ctx the function execution context, optional /// \return result as array From 1f8d8c5c7dcc26cc62631aa02e2e5a78040df137 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Fri, 26 May 2023 21:08:03 +0800 Subject: [PATCH 04/17] fix warning --- cpp/src/arrow/compute/api_vector.cc | 2 +- cpp/src/arrow/compute/api_vector.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 8abd2686c4adc..d5296a50286c5 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -221,7 +221,7 @@ RankOptions::RankOptions(std::vector sort_keys, NullPlacement null_plac tiebreaker(tiebreaker) {} constexpr char RankOptions::kTypeName[]; -PairwiseDiffOptions::PairwiseDiffOptions(double periods, bool check_overflow) +PairwiseDiffOptions::PairwiseDiffOptions(int64_t periods, bool check_overflow) : FunctionOptions(internal::kPairwiseDiffOptionsType), periods(periods), check_overflow(check_overflow) {} diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index a618287529a80..cf8bc9c52a869 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -237,7 +237,7 @@ using CumulativeSumOptions = CumulativeOptions; // For backward compatibility /// \brief Options for pairwise functions class ARROW_EXPORT PairwiseDiffOptions : public FunctionOptions { public: - explicit PairwiseDiffOptions(double periods = 1, bool check_overflow = false); + explicit PairwiseDiffOptions(int64_t periods = 1, bool check_overflow = false); static constexpr char const kTypeName[] = "PairwiseDiffOptions"; static PairwiseDiffOptions Defaults() { return PairwiseDiffOptions(); } From 4a61a8f294ffd3d43819053b9f471ff018290742 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Fri, 26 May 2023 22:07:20 +0800 Subject: [PATCH 05/17] refactor to two different functions for better extending to other ops --- cpp/src/arrow/compute/api_vector.cc | 22 ++-- cpp/src/arrow/compute/api_vector.h | 15 +-- .../arrow/compute/kernels/vector_pairwise.cc | 116 +++++++++--------- .../compute/kernels/vector_pairwise_test.cc | 29 ++--- docs/source/cpp/compute.rst | 31 +++-- docs/source/python/api/compute.rst | 2 +- python/pyarrow/_compute.pyx | 18 ++- python/pyarrow/compute.py | 2 +- python/pyarrow/includes/libarrow.pxd | 7 +- python/pyarrow/tests/test_compute.py | 4 +- 10 files changed, 121 insertions(+), 125 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index d5296a50286c5..67595c3308f9b 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -151,9 +151,8 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("sort_keys", &RankOptions::sort_keys), DataMember("null_placement", &RankOptions::null_placement), DataMember("tiebreaker", &RankOptions::tiebreaker)); -static auto kPairwiseDiffOptionsType = GetFunctionOptionsType( - DataMember("periods", &PairwiseDiffOptions::periods), - DataMember("check_overflow", &PairwiseDiffOptions::check_overflow)); +static auto kPairwiseOptionsType = GetFunctionOptionsType( + DataMember("periods", &PairwiseOptions::periods)); } // namespace } // namespace internal @@ -221,11 +220,9 @@ RankOptions::RankOptions(std::vector sort_keys, NullPlacement null_plac tiebreaker(tiebreaker) {} constexpr char RankOptions::kTypeName[]; -PairwiseDiffOptions::PairwiseDiffOptions(int64_t periods, bool check_overflow) - : FunctionOptions(internal::kPairwiseDiffOptionsType), - periods(periods), - check_overflow(check_overflow) {} -constexpr char PairwiseDiffOptions::kTypeName[]; +PairwiseOptions::PairwiseOptions(int64_t periods) + : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} +constexpr char PairwiseOptions::kTypeName[]; namespace internal { void RegisterVectorOptions(FunctionRegistry* registry) { @@ -239,7 +236,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kSelectKOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); - DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseDiffOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseOptionsType)); } } // namespace internal @@ -350,10 +347,11 @@ Result> ValueCounts(const Datum& value, ExecContext } Result> PairwiseDiff(const Array& array, - const PairwiseDiffOptions& options, - ExecContext* ctx) { + const PairwiseOptions& options, + bool check_overflow, ExecContext* ctx) { + auto func_name = check_overflow ? "pairwise_diff_checked" : "pairwise_diff"; ARROW_ASSIGN_OR_RAISE(Datum result, - CallFunction("pairwise_diff", {Datum(array)}, &options, ctx)); + CallFunction(func_name, {Datum(array)}, &options, ctx)); return result.make_array(); } diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index cf8bc9c52a869..566d3d8cf9bcd 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -235,18 +235,14 @@ class ARROW_EXPORT CumulativeOptions : public FunctionOptions { using CumulativeSumOptions = CumulativeOptions; // For backward compatibility /// \brief Options for pairwise functions -class ARROW_EXPORT PairwiseDiffOptions : public FunctionOptions { +class ARROW_EXPORT PairwiseOptions : public FunctionOptions { public: - explicit PairwiseDiffOptions(int64_t periods = 1, bool check_overflow = false); - static constexpr char const kTypeName[] = "PairwiseDiffOptions"; - static PairwiseDiffOptions Defaults() { return PairwiseDiffOptions(); } + explicit PairwiseOptions(int64_t periods = 1); + static constexpr char const kTypeName[] = "PairwiseOptions"; + static PairwiseOptions Defaults() { return PairwiseOptions(); } /// Periods to shift for applying the binary operation, accepts negative values. int64_t periods = 1; - - /// When true, returns an Invalid Status when overflow is detected, otherwise result is - /// wrapped around. - bool check_overflow = false; }; /// @} @@ -678,7 +674,8 @@ Result CumulativeMin( /// \return result as array ARROW_EXPORT Result> PairwiseDiff(const Array& array, - const PairwiseDiffOptions& options, + const PairwiseOptions& options, + bool check_overflow = false, ExecContext* ctx = NULLPTR); // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise.cc b/cpp/src/arrow/compute/kernels/vector_pairwise.cc index 9269df6b74589..2df3d6fd2cc89 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise.cc @@ -108,21 +108,15 @@ Status PairwiseKernelImpl(const ArraySpan& input, int64_t periods, return Status::OK(); } -template +template Status PairwiseDiffKernel(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const PairwiseDiffOptions& options = OptionsWrapper::Get(ctx); + const PairwiseOptions& options = OptionsWrapper::Get(ctx); std::shared_ptr result; auto input = batch[0].array; ARROW_ASSIGN_OR_RAISE(auto output_type, GetDiffOutputType(input.type->GetSharedPtr())); - if (options.check_overflow) { - RETURN_NOT_OK((PairwiseKernelImpl( - batch[0].array, options.periods, output_type, &result))); - } else { - RETURN_NOT_OK((PairwiseKernelImpl( - batch[0].array, options.periods, output_type, &result))); - } + RETURN_NOT_OK((PairwiseKernelImpl( + batch[0].array, options.periods, output_type, &result))); out->value = std::move(result); return Status::OK(); @@ -130,18 +124,29 @@ Status PairwiseDiffKernel(KernelContext* ctx, const ExecSpan& batch, ExecResult* const FunctionDoc pairwise_diff_doc( "Compute first order difference of an array", - ("This function computes the first order difference of an array, i.e. output[i]\n" - "= input[i] - input[i - p] if i >= p, otherwise output[i] = null, where p is \n" - "the period. The period can also be negative. It internally calls the scalar \n" - "function Subtract to compute the differences, so its behavior and supported \n" - "types are the same as Subtract.\n" + ("Computes the first order difference of an array, It internally calls \n" + "the scalar function \"subtract\" to compute \n differences, so its \n" + "behavior and supported types are the same as \n" + "\"subtract\". The period can be specified in :struct:`PairwiseOptions`.\n" + "\n" + "Results will wrap around on integer overflow. Use function \n" + "\"pairwise_diff_checked\" if you want overflow to return an error."), + {"input"}, "PairwiseOptions"); + +const FunctionDoc pairwise_diff_checked_doc( + "Compute first order difference of an array", + ("Computes the first order difference of an array, It internally calls \n" + "the scalar function \"subtract_checked\" (or the checked variant) to compute \n" + "differences, so its behavior and supported types are the same as \n" + "\"subtract_checked\". The period can be specified in :struct:`PairwiseOptions`.\n" "\n" - "The period and handling of overflow can be specified in PairwiseDiffOptions."), - {"input"}, "PairwiseDiffOptions"); + "This function returns an error on overflow. For a variant that doesn't \n" + "fail on overflow, use function \"pairwise_diff\"."), + {"input"}, "PairwiseOptions"); -const PairwiseDiffOptions* GetDefaultPairwiseDiffOptions() { - static const auto kDefaultPairwiseDiffOptions = PairwiseDiffOptions::Defaults(); - return &kDefaultPairwiseDiffOptions; +const PairwiseOptions* GetDefaultPairwiseOptions() { + static const auto kDefaultPairwiseOptions = PairwiseOptions::Defaults(); + return &kDefaultPairwiseOptions; } struct PairwiseKernelData { @@ -150,27 +155,20 @@ struct PairwiseKernelData { ArrayKernelExec exec; }; -void RegisterPairwiseDiffKernels(FunctionRegistry* registry) { +template +void RegisterPairwiseDiffKernels(std::string_view func_name, const FunctionDoc& doc, + FunctionRegistry* registry) { std::vector pairwise_diff_kernels = { - {int8(), int8(), PairwiseDiffKernel}, - {uint8(), uint8(), - PairwiseDiffKernel}, - {int16(), int16(), - PairwiseDiffKernel}, - {uint16(), uint16(), - PairwiseDiffKernel}, - {int32(), int32(), - PairwiseDiffKernel}, - {uint32(), uint32(), - PairwiseDiffKernel}, - {int64(), int64(), - PairwiseDiffKernel}, - {uint64(), uint64(), - PairwiseDiffKernel}, - {float32(), float32(), - PairwiseDiffKernel}, - {float64(), float64(), - PairwiseDiffKernel}, + {int8(), int8(), PairwiseDiffKernel}, + {uint8(), uint8(), PairwiseDiffKernel}, + {int16(), int16(), PairwiseDiffKernel}, + {uint16(), uint16(), PairwiseDiffKernel}, + {int32(), int32(), PairwiseDiffKernel}, + {uint32(), uint32(), PairwiseDiffKernel}, + {int64(), int64(), PairwiseDiffKernel}, + {uint64(), uint64(), PairwiseDiffKernel}, + {float32(), float32(), PairwiseDiffKernel}, + {float64(), float64(), PairwiseDiffKernel}, }; auto decimal_resolver = [](KernelContext*, @@ -181,12 +179,12 @@ void RegisterPairwiseDiffKernels(FunctionRegistry* registry) { return decimal(decimal_type->precision() + 1, decimal_type->scale()); }; - pairwise_diff_kernels.emplace_back(PairwiseKernelData{ - Type::DECIMAL128, OutputType(decimal_resolver), - PairwiseDiffKernel}); - pairwise_diff_kernels.emplace_back(PairwiseKernelData{ - Type::DECIMAL256, OutputType(decimal_resolver), - PairwiseDiffKernel}); + pairwise_diff_kernels.emplace_back( + PairwiseKernelData{Type::DECIMAL128, OutputType(decimal_resolver), + PairwiseDiffKernel}); + pairwise_diff_kernels.emplace_back( + PairwiseKernelData{Type::DECIMAL256, OutputType(decimal_resolver), + PairwiseDiffKernel}); auto identity_resolver = [](KernelContext*, const std::vector& types) -> Result { @@ -197,8 +195,7 @@ void RegisterPairwiseDiffKernels(FunctionRegistry* registry) { for (auto unit : TimeUnit::values()) { InputType in_type(match::TimestampTypeUnit(unit)); OutputType out_type(duration(unit)); - auto exec = - PairwiseDiffKernel; + auto exec = PairwiseDiffKernel; pairwise_diff_kernels.emplace_back(PairwiseKernelData{in_type, out_type, exec}); } @@ -206,7 +203,7 @@ void RegisterPairwiseDiffKernels(FunctionRegistry* registry) { for (auto unit : TimeUnit::values()) { InputType in_type(match::DurationTypeUnit(unit)); OutputType out_type(identity_resolver); - auto exec = PairwiseDiffKernel; + auto exec = PairwiseDiffKernel; pairwise_diff_kernels.emplace_back( PairwiseKernelData{in_type, out_type, std::move(exec)}); } @@ -215,7 +212,7 @@ void RegisterPairwiseDiffKernels(FunctionRegistry* registry) { for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI}) { InputType in_type(match::Time32TypeUnit(unit)); OutputType out_type(duration(unit)); - auto exec = PairwiseDiffKernel; + auto exec = PairwiseDiffKernel; pairwise_diff_kernels.emplace_back( PairwiseKernelData{in_type, out_type, std::move(exec)}); } @@ -224,7 +221,7 @@ void RegisterPairwiseDiffKernels(FunctionRegistry* registry) { for (auto unit : {TimeUnit::MICRO, TimeUnit::NANO}) { InputType in_type(match::Time64TypeUnit(unit)); OutputType out_type(duration(unit)); - auto exec = PairwiseDiffKernel; + auto exec = PairwiseDiffKernel; pairwise_diff_kernels.emplace_back( PairwiseKernelData{in_type, out_type, std::move(exec)}); } @@ -233,8 +230,7 @@ void RegisterPairwiseDiffKernels(FunctionRegistry* registry) { { InputType in_type(date32()); OutputType out_type(duration(TimeUnit::SECOND)); - auto exec = PairwiseDiffKernel; + auto exec = PairwiseDiffKernel; pairwise_diff_kernels.emplace_back( PairwiseKernelData{in_type, out_type, std::move(exec)}); } @@ -243,7 +239,7 @@ void RegisterPairwiseDiffKernels(FunctionRegistry* registry) { { InputType in_type(date64()); OutputType out_type(duration(TimeUnit::MILLI)); - auto exec = PairwiseDiffKernel; + auto exec = PairwiseDiffKernel; pairwise_diff_kernels.emplace_back( PairwiseKernelData{in_type, out_type, std::move(exec)}); } @@ -252,10 +248,9 @@ void RegisterPairwiseDiffKernels(FunctionRegistry* registry) { base_kernel.can_execute_chunkwise = false; base_kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; base_kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; - base_kernel.init = OptionsWrapper::Init; - auto func = - std::make_shared("pairwise_diff", Arity::Unary(), pairwise_diff_doc, - GetDefaultPairwiseDiffOptions()); + base_kernel.init = OptionsWrapper::Init; + auto func = std::make_shared(std::string(func_name), Arity::Unary(), + doc, GetDefaultPairwiseOptions()); for (const auto& kernel_data : pairwise_diff_kernels) { base_kernel.signature = @@ -268,7 +263,10 @@ void RegisterPairwiseDiffKernels(FunctionRegistry* registry) { } void RegisterVectorPairwise(FunctionRegistry* registry) { - RegisterPairwiseDiffKernels(registry); + RegisterPairwiseDiffKernels("pairwise_diff", + pairwise_diff_doc, registry); + RegisterPairwiseDiffKernels( + "pairwise_diff_checked", pairwise_diff_checked_doc, registry); } } // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc b/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc index b20cff00f18bf..be0f118184833 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc @@ -87,7 +87,7 @@ class TestDiffKernel : public ::testing::Test { TEST_F(TestDiffKernel, Empty) { for (int64_t period = -2; period <= 2; ++period) { - PairwiseDiffOptions options(period); + PairwiseOptions options(period); for (auto input_type : test_input_types_) { ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); auto input = ArrayFromJSON(input_type, "[]"); @@ -99,7 +99,7 @@ TEST_F(TestDiffKernel, Empty) { TEST_F(TestDiffKernel, AllNull) { for (int64_t period = -2; period <= 2; ++period) { - PairwiseDiffOptions options(period); + PairwiseOptions options(period); for (auto input_type : test_input_types_) { ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); auto input = ArrayFromJSON(input_type, "[null, null, null]"); @@ -111,7 +111,7 @@ TEST_F(TestDiffKernel, AllNull) { TEST_F(TestDiffKernel, Numeric) { { - PairwiseDiffOptions options(1); + PairwiseOptions options(1); for (auto input_type : test_numerical_types_) { ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); auto input = ArrayFromJSON(input_type, "[null, 1, 2, null, 4, 5, 6]"); @@ -121,7 +121,7 @@ TEST_F(TestDiffKernel, Numeric) { } { - PairwiseDiffOptions options(2); + PairwiseOptions options(2); for (auto input_type : test_numerical_types_) { ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); auto input = ArrayFromJSON(input_type, "[null, 1, 2, null, 4, 5, 6]"); @@ -131,7 +131,7 @@ TEST_F(TestDiffKernel, Numeric) { } { - PairwiseDiffOptions options(-1); + PairwiseOptions options(-1); for (auto input_type : test_numerical_types_) { ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); auto input = ArrayFromJSON(input_type, "[6, 5, 4, null, 2, 1, null]"); @@ -141,7 +141,7 @@ TEST_F(TestDiffKernel, Numeric) { } { - PairwiseDiffOptions options(-2); + PairwiseOptions options(-2); for (auto input_type : test_numerical_types_) { ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); auto input = ArrayFromJSON(input_type, "[6, 5, 4, null, 2, 1, null]"); @@ -153,24 +153,25 @@ TEST_F(TestDiffKernel, Numeric) { TEST_F(TestDiffKernel, Overflow) { { - PairwiseDiffOptions options(1); + PairwiseOptions options(1); auto input = ArrayFromJSON(uint8(), "[3, 2, 1]"); auto output = ArrayFromJSON(uint8(), "[null, 255, 255]"); CheckVectorUnary("pairwise_diff", input, output, &options); } { - PairwiseDiffOptions options(1, /*check_overflow=*/true); + PairwiseOptions options(1); auto input = ArrayFromJSON(uint8(), "[3, 2, 1]"); auto output = ArrayFromJSON(uint8(), "[null, 255, 255]"); - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("overflow"), - CallFunction("pairwise_diff", {input}, &options)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("overflow"), + CallFunction("pairwise_diff_checked", {input}, &options)); } } TEST_F(TestDiffKernel, Temporal) { { - PairwiseDiffOptions options(1); + PairwiseOptions options(1); for (auto input_type : test_temporal_types_) { ASSERT_OK_AND_ASSIGN(auto output_type, GetOutputType(input_type)); auto input = ArrayFromJSON(input_type, "[null, 5, 1, null, 9, 6, 37]"); @@ -186,14 +187,14 @@ TEST_F(TestDiffKernel, Temporal) { TEST_F(TestDiffKernel, Decimal) { { - PairwiseDiffOptions options(1); + PairwiseOptions options(1); auto input = ArrayFromJSON(decimal(4, 2), R"(["11.00", "22.11", "-10.25", "33.45"])"); auto output = ArrayFromJSON(decimal(5, 2), R"([null, "11.11", "-32.36", "43.70"])"); CheckVectorUnary("pairwise_diff", input, output, &options); } { - PairwiseDiffOptions options(-1); + PairwiseOptions options(-1); auto input = ArrayFromJSON( decimal(40, 30), R"(["1111111111.222222222222222222222222222222", "2222222222.333333333333333333333333333333"])"); @@ -203,7 +204,7 @@ TEST_F(TestDiffKernel, Decimal) { } { /// Out of range decimal precision - PairwiseDiffOptions options(1); + PairwiseOptions options(1); auto input = ArrayFromJSON(decimal(38, 0), R"(["1e38"])"); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index e76335f4f0ad2..63c1a4abc0a45 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1851,16 +1851,21 @@ replaced, based on the remaining inputs. Pairwise functions ~~~~~~~~~~~~~~~~~~~~ Pairwise functions are unary vector functions that performs a binary operation on -a pair of elements in the input array, typically on adjacent elements. - -+------------------------+-------+----------------------+----------------------+--------------------------------+-------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+========================+=======+======================+======================+================================+=======+ -| pairwise_diff | Unary | Numeric/Temporal | Numeric/Temporal | :struct:`PairwiseDiffOptions` | \(1) | -+------------------------+-------+----------------------+----------------------+--------------------------------+-------+ -* \(1) Computes the first order difference of an array, i.e. output[i] = - input[i] - input[i-period]. The default period is 1. The period can also - be negative. It internally calls the scalar function ``Subtract`` to compute - the differences, so its behavior and supported types are the same as - ``Subtract``. The period and handling of overflow can be specified in - :struct:`PairwiseDiffOptions`. +a pair of elements in the input array, typically on adjacent elements. The n-th +output is computed by applying the binary operation on the n-th and (n-p)-th, +where p is the period. The default period is 1. The period can also be negative. + ++------------------------+-------+----------------------+----------------------+--------------------------------+----------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++========================+=======+======================+======================+================================+==========+ +| pairwise_diff | Unary | Numeric/Temporal | Numeric/Temporal | :struct:`PairwiseOptions` | \(1)(2) | ++------------------------+-------+----------------------+----------------------+--------------------------------+----------+ +| pairwise_diff_checked | Unary | Numeric/Temporal | Numeric/Temporal | :struct:`PairwiseOptions` | \(1)(3) | ++------------------------+-------+----------------------+----------------------+--------------------------------+----------+ + +* \(1) Computes the first order difference of an array, It internally calls + the scalar function ``Subtract`` (or the checked variant) to compute + differences, so its behavior and supported types are the same as + ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. +* \(2) Wraps around the result when overflow is detected. +* \(3) Returns an ``Invalid`` :class:`Status` when overflow is detected. diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 655a049ba4724..f29d4db3941fd 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -555,7 +555,7 @@ Compute Options ModeOptions NullOptions PadOptions - PairwiseDiffOptions + PairwiseOptions PartitionNthOptions QuantileOptions ReplaceSliceOptions diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index c3ba51081d147..a33a09548d629 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1969,25 +1969,23 @@ class CumulativeOptions(_CumulativeOptions): self._set_options(start, skip_nulls) -cdef class _PairwiseDiffOptions(FunctionOptions): - def _set_options(self, period, check_overflow): - self.wrapped.reset(new CPairwiseDiffOptions(period, check_overflow)) +cdef class _PairwiseOptions(FunctionOptions): + def _set_options(self, period): + self.wrapped.reset(new CPairwiseOptions(period)) -class PairwiseDiffOptions(_PairwiseDiffOptions): +class PairwiseOptions(_PairwiseOptions): """ - Options for `pairwise_diff` function. + Options for `pairwise` functions. Parameters ---------- period : int, default 1 - Period for computing difference. - check_overflow : bool, default False - When true, return error on overflow. When false, wrap around on overflow. + Period for applying the period function. """ - def __init__(self, period=1, check_overflow=False): - self._set_options(period, check_overflow) + def __init__(self, period=1): + self._set_options(period) cdef class _ArraySortOptions(FunctionOptions): diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index b7a8cf38095f4..0fefa18dd1136 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -50,7 +50,7 @@ ModeOptions, NullOptions, PadOptions, - PairwiseDiffOptions, + PairwiseOptions, PartitionNthOptions, QuantileOptions, RandomOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index a45c9703ca671..da46cdcb750d5 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2407,11 +2407,10 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: optional[shared_ptr[CScalar]] start c_bool skip_nulls - cdef cppclass CPairwiseDiffOptions \ - "arrow::compute::PairwiseDiffOptions"(CFunctionOptions): - CPairwiseDiffOptions(int64_t period, c_bool skip_nulls) + cdef cppclass CPairwiseOptions \ + "arrow::compute::PairwiseOptions"(CFunctionOptions): + CPairwiseOptions(int64_t period) int64_t period - c_bool skip_nulls cdef cppclass CArraySortOptions \ "arrow::compute::ArraySortOptions"(CFunctionOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index da064abe4da57..d9209ada24a5c 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -155,7 +155,7 @@ def test_option_class_equality(): pc.ModeOptions(), pc.NullOptions(), pc.PadOptions(5), - pc.PairwiseDiffOptions(period=1, check_overflow=False), + pc.PairwiseOptions(period=1), pc.PartitionNthOptions(1, null_placement="at_start"), pc.CumulativeOptions(start=None, skip_nulls=False), pc.QuantileOptions(), @@ -3511,4 +3511,4 @@ def test_pairwise_diff(): arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) with pytest.raises(pa.ArrowInvalid, match="overflow"): - pa.compute.pairwise_diff(arr, period=-1, check_overflow=True) + pa.compute.pairwise_diff_checked(arr, period=-1) From 6741ba26a539863a1aaef4a563a6a6d8127e7fc0 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Fri, 26 May 2023 22:20:36 +0800 Subject: [PATCH 06/17] fix doc --- cpp/src/arrow/compute/api_vector.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 566d3d8cf9bcd..5b0dbbddb9d48 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -670,6 +670,7 @@ Result CumulativeMin( /// /// \param[in] array array input /// \param[in] options options, specifying overflow behavior and period +/// \param[in] check_overflow whether to return error on overflow /// \param[in] ctx the function execution context, optional /// \return result as array ARROW_EXPORT From fabd43a036b547da052937f16b7ce49ad0e68ea0 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Sat, 24 Jun 2023 20:54:06 +0800 Subject: [PATCH 07/17] use subtract kernel exec --- cpp/src/arrow/compute/exec.h | 3 + cpp/src/arrow/compute/kernel.h | 11 +- .../arrow/compute/kernels/vector_pairwise.cc | 256 ++++++------------ .../compute/kernels/vector_pairwise_test.cc | 14 +- 4 files changed, 103 insertions(+), 181 deletions(-) diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h index c2c514dbb9f2f..3fbefe4a1ab7b 100644 --- a/cpp/src/arrow/compute/exec.h +++ b/cpp/src/arrow/compute/exec.h @@ -356,6 +356,9 @@ struct ARROW_EXPORT ExecResult { const std::shared_ptr& array_data() const { return std::get>(this->value); } + ArrayData* array_data_mutable() { + return std::get>(this->value).get(); + } bool is_array_data() const { return this->value.index() == 1; } }; diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index a52636aeb6bd2..8ae5cc3538714 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -283,7 +283,12 @@ class ARROW_EXPORT OutputType { /// /// This function SHOULD _not_ be used to check for arity, that is to be /// performed one or more layers above. - using Resolver = Result (*)(KernelContext*, const std::vector&); + using Resolver = + std::function(KernelContext*, const std::vector&)>; + + // For backward compatibility + using ResolverFuncPtr = Result (*)(KernelContext*, + const std::vector&); /// \brief Output an exact type OutputType(std::shared_ptr type) // NOLINT implicit construction @@ -293,6 +298,10 @@ class ARROW_EXPORT OutputType { OutputType(Resolver resolver) // NOLINT implicit construction : kind_(COMPUTED), resolver_(std::move(resolver)) {} + /// \brief For backward compatibility + OutputType(ResolverFuncPtr resolver) // NOLINT implicit construction + : kind_(COMPUTED), resolver_(std::move(resolver)) {} + OutputType(const OutputType& other) { this->kind_ = other.kind_; this->type_ = other.type_; diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise.cc b/cpp/src/arrow/compute/kernels/vector_pairwise.cc index 2df3d6fd2cc89..8db96b288f2c7 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise.cc @@ -17,6 +17,7 @@ // Vector kernels for pairwise computation +#include #include #include "arrow/builder.h" #include "arrow/compute/api_vector.h" @@ -26,99 +27,75 @@ #include "arrow/compute/kernels/base_arithmetic_internal.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/registry.h" +#include "arrow/compute/util.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_fwd.h" #include "arrow/type_traits.h" +#include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/visit_type_inline.h" namespace arrow::compute::internal { -template -Result> GetDiffOutputType( - const std::shared_ptr& type) { - std::shared_ptr output_type; - if constexpr (is_timestamp_type::value) { // timestamp -> duration with same - // time unit - const auto* real_type = checked_cast(type.get()); - return std::make_shared(real_type->unit()); - } else if constexpr (is_time_type::value) { // time -> duration with same - // time unit - const auto* real_type = checked_cast(type.get()); - return std::make_shared(real_type->unit()); - } else if constexpr (is_date_type::value) { // date -> duration - if constexpr (InputType::type_id == Type::DATE32) { // date32 -> second - return duration(TimeUnit::SECOND); - } else { // date64 -> millisecond - return duration(TimeUnit::MILLI); - } - } else if constexpr (is_decimal_type::value) { // decimal -> decimal with - // precision + 1 - const auto* real_type = checked_cast(type.get()); - if constexpr (InputType::type_id == Type::DECIMAL128) { - return Decimal128Type::Make(real_type->precision() + 1, real_type->scale()); - } else { - return Decimal256Type::Make(real_type->precision() + 1, real_type->scale()); - } - } else { - return type; - } -} +// We reuse the kernel exec function of a scalar binary function to compute pairwise +// results. For example, for pairwise_diff, we reuse subtract's kernel exec. +struct PairwiseState : KernelState { + PairwiseState(const PairwiseOptions& options, const ArrayKernelExec& scalar_exec) + : periods(options.periods), scalar_exec(scalar_exec) {} -/// A generic pairwise implementation that can be reused by different Ops. -template -Status PairwiseKernelImpl(const ArraySpan& input, int64_t periods, - const std::shared_ptr& output_type, - std::shared_ptr* result) { - typename TypeTraits::BuilderType builder(output_type, - default_memory_pool()); - RETURN_NOT_OK(builder.Reserve(input.length)); + int64_t periods; + const ArrayKernelExec& scalar_exec; +}; - Status status; - auto valid_func = [&](typename GetViewType::T left, - typename GetViewType::T right) { - auto result = Op::template Call::T>( - nullptr, left, right, &status); - builder.UnsafeAppend(result); +KernelInit GeneratePairwiseInit(const ArrayKernelExec& scalar_exec) { + return [&scalar_exec](KernelContext* ctx, const KernelInitArgs& args) { + return std::make_unique( + checked_cast(*args.options), scalar_exec); }; - auto null_func = [&]() { builder.UnsafeAppendNull(); }; +} - if (periods > 0) { - periods = std::min(periods, input.length); - RETURN_NOT_OK(builder.AppendNulls(periods)); - ArraySpan left(input); - left.SetSlice(periods, input.length - periods); - ArraySpan right(input); - right.SetSlice(0, input.length - periods); - VisitTwoArrayValuesInline(left, right, valid_func, null_func); - RETURN_NOT_OK(status); - } else { - periods = std::max(periods, -input.length); - ArraySpan left(input); - left.SetSlice(0, input.length + periods); - ArraySpan right(input); - right.SetSlice(-periods, input.length + periods); - VisitTwoArrayValuesInline(left, right, valid_func, null_func); - RETURN_NOT_OK(status); - RETURN_NOT_OK(builder.AppendNulls(-periods)); +/// A generic pairwise implementation that can be reused by different ops. +Status PairwiseExecImpl(KernelContext* ctx, const ArraySpan& input, + const ArrayKernelExec& scalar_exec, int64_t periods, + ArrayData* result) { + auto offset = abs(periods); + offset = std::min(offset, input.length); + auto exec_length = input.length - offset; + // prepare bitmap + auto null_start = periods > 0 ? 0 : exec_length; + auto non_null_start = periods > 0 ? offset : 0; + bit_util::ClearBitmap(result->buffers[0]->mutable_data(), null_start, offset); + for (int64_t i = non_null_start; i < non_null_start + exec_length; i++) { + if (input.IsValid(i) && input.IsValid(i - periods)) { + bit_util::SetBit(result->buffers[0]->mutable_data(), i); + } else { + bit_util::ClearBit(result->buffers[0]->mutable_data(), i); + } } - RETURN_NOT_OK(builder.FinishInternal(result)); + // prepare input span + ArraySpan left(input); + left.SetSlice(periods > 0 ? offset : 0, exec_length); + ArraySpan right(input); + right.SetSlice(periods > 0 ? 0 : offset, exec_length); + // prepare output span + ArraySpan output_span; + output_span.SetMembers(*result); + output_span.offset = periods > 0 ? offset : 0; + output_span.length = exec_length; + ExecResult output{output_span}; + // execute scalar function + RETURN_NOT_OK(scalar_exec(ctx, ExecSpan({left, right}, exec_length), &output)); + return Status::OK(); } -template -Status PairwiseDiffKernel(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const PairwiseOptions& options = OptionsWrapper::Get(ctx); - std::shared_ptr result; +Status PairwiseExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const auto& state = checked_cast(*ctx->state()); auto input = batch[0].array; - ARROW_ASSIGN_OR_RAISE(auto output_type, - GetDiffOutputType(input.type->GetSharedPtr())); - RETURN_NOT_OK((PairwiseKernelImpl( - batch[0].array, options.periods, output_type, &result))); - - out->value = std::move(result); + RETURN_NOT_OK(PairwiseExecImpl(ctx, batch[0].array, state.scalar_exec, state.periods, + out->array_data_mutable())); return Status::OK(); } @@ -155,118 +132,51 @@ struct PairwiseKernelData { ArrayKernelExec exec; }; -template -void RegisterPairwiseDiffKernels(std::string_view func_name, const FunctionDoc& doc, +void RegisterPairwiseDiffKernels(std::string_view func_name, + std::string_view base_func_name, const FunctionDoc& doc, FunctionRegistry* registry) { - std::vector pairwise_diff_kernels = { - {int8(), int8(), PairwiseDiffKernel}, - {uint8(), uint8(), PairwiseDiffKernel}, - {int16(), int16(), PairwiseDiffKernel}, - {uint16(), uint16(), PairwiseDiffKernel}, - {int32(), int32(), PairwiseDiffKernel}, - {uint32(), uint32(), PairwiseDiffKernel}, - {int64(), int64(), PairwiseDiffKernel}, - {uint64(), uint64(), PairwiseDiffKernel}, - {float32(), float32(), PairwiseDiffKernel}, - {float64(), float64(), PairwiseDiffKernel}, - }; - - auto decimal_resolver = [](KernelContext*, - const std::vector& types) -> Result { - // Subtract increase decimal precision by one - DCHECK(is_decimal(types[0].id())); - auto decimal_type = checked_cast(types[0].type); - return decimal(decimal_type->precision() + 1, decimal_type->scale()); - }; - - pairwise_diff_kernels.emplace_back( - PairwiseKernelData{Type::DECIMAL128, OutputType(decimal_resolver), - PairwiseDiffKernel}); - pairwise_diff_kernels.emplace_back( - PairwiseKernelData{Type::DECIMAL256, OutputType(decimal_resolver), - PairwiseDiffKernel}); - - auto identity_resolver = - [](KernelContext*, const std::vector& types) -> Result { - return types[0]; - }; - - // timestamp -> duration - for (auto unit : TimeUnit::values()) { - InputType in_type(match::TimestampTypeUnit(unit)); - OutputType out_type(duration(unit)); - auto exec = PairwiseDiffKernel; - pairwise_diff_kernels.emplace_back(PairwiseKernelData{in_type, out_type, exec}); - } - - // duration -> duration - for (auto unit : TimeUnit::values()) { - InputType in_type(match::DurationTypeUnit(unit)); - OutputType out_type(identity_resolver); - auto exec = PairwiseDiffKernel; - pairwise_diff_kernels.emplace_back( - PairwiseKernelData{in_type, out_type, std::move(exec)}); - } - - // time32 -> duration - for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI}) { - InputType in_type(match::Time32TypeUnit(unit)); - OutputType out_type(duration(unit)); - auto exec = PairwiseDiffKernel; - pairwise_diff_kernels.emplace_back( - PairwiseKernelData{in_type, out_type, std::move(exec)}); - } - - // time64 -> duration - for (auto unit : {TimeUnit::MICRO, TimeUnit::NANO}) { - InputType in_type(match::Time64TypeUnit(unit)); - OutputType out_type(duration(unit)); - auto exec = PairwiseDiffKernel; - pairwise_diff_kernels.emplace_back( - PairwiseKernelData{in_type, out_type, std::move(exec)}); - } - - // date32 -> duration(TimeUnit::SECOND) - { - InputType in_type(date32()); - OutputType out_type(duration(TimeUnit::SECOND)); - auto exec = PairwiseDiffKernel; - pairwise_diff_kernels.emplace_back( - PairwiseKernelData{in_type, out_type, std::move(exec)}); - } - - // date64 -> duration(TimeUnit::MILLI) - { - InputType in_type(date64()); - OutputType out_type(duration(TimeUnit::MILLI)); - auto exec = PairwiseDiffKernel; - pairwise_diff_kernels.emplace_back( - PairwiseKernelData{in_type, out_type, std::move(exec)}); - } - VectorKernel base_kernel; base_kernel.can_execute_chunkwise = false; - base_kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - base_kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + base_kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE; + base_kernel.mem_allocation = MemAllocation::PREALLOCATE; base_kernel.init = OptionsWrapper::Init; auto func = std::make_shared(std::string(func_name), Arity::Unary(), doc, GetDefaultPairwiseOptions()); - for (const auto& kernel_data : pairwise_diff_kernels) { - base_kernel.signature = - KernelSignature::Make({kernel_data.input}, kernel_data.output); - base_kernel.exec = kernel_data.exec; - DCHECK_OK(func->AddKernel(base_kernel)); + auto base_func_result = registry->GetFunction(std::string(base_func_name)); + DCHECK_OK(base_func_result.status()); + const auto& base_func = checked_cast(**base_func_result); + DCHECK(base_func.arity().num_args == 2); + + for (const auto& base_func_kernel : base_func.kernels()) { + const auto& base_func_kernel_sig = base_func_kernel->signature; + if (base_func_kernel_sig->in_types()[0].Equals(base_func_kernel_sig->in_types()[1])) { + OutputType out_type(base_func_kernel_sig->out_type()); + // Need to wrap base output resolver + if (out_type.kind() == OutputType::COMPUTED) { + const auto& base_resolver = base_func_kernel_sig->out_type().resolver(); + auto resolver = [&base_resolver](KernelContext* ctx, + const std::vector& input_types) { + return base_resolver(ctx, {input_types[0], input_types[0]}); + }; + out_type = OutputType(resolver); + } + + base_kernel.signature = + KernelSignature::Make({base_func_kernel_sig->in_types()[0]}, out_type); + base_kernel.exec = PairwiseExec; + base_kernel.init = GeneratePairwiseInit(base_func_kernel->exec); + DCHECK_OK(func->AddKernel(base_kernel)); + } } DCHECK_OK(registry->AddFunction(std::move(func))); } void RegisterVectorPairwise(FunctionRegistry* registry) { - RegisterPairwiseDiffKernels("pairwise_diff", - pairwise_diff_doc, registry); - RegisterPairwiseDiffKernels( - "pairwise_diff_checked", pairwise_diff_checked_doc, registry); + RegisterPairwiseDiffKernels("pairwise_diff", "subtract", pairwise_diff_doc, registry); + RegisterPairwiseDiffKernels("pairwise_diff_checked", "subtract_checked", + pairwise_diff_checked_doc, registry); } } // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc b/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc index be0f118184833..c77f8ecc1a403 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise_test.cc @@ -63,7 +63,7 @@ Result> GetOutputType( } } -class TestDiffKernel : public ::testing::Test { +class TestPairwiseDiff : public ::testing::Test { public: void SetUp() override { test_numerical_types_ = NumericTypes(); @@ -85,7 +85,7 @@ class TestDiffKernel : public ::testing::Test { std::vector> test_input_types_; }; -TEST_F(TestDiffKernel, Empty) { +TEST_F(TestPairwiseDiff, Empty) { for (int64_t period = -2; period <= 2; ++period) { PairwiseOptions options(period); for (auto input_type : test_input_types_) { @@ -97,7 +97,7 @@ TEST_F(TestDiffKernel, Empty) { } } -TEST_F(TestDiffKernel, AllNull) { +TEST_F(TestPairwiseDiff, AllNull) { for (int64_t period = -2; period <= 2; ++period) { PairwiseOptions options(period); for (auto input_type : test_input_types_) { @@ -109,7 +109,7 @@ TEST_F(TestDiffKernel, AllNull) { } } -TEST_F(TestDiffKernel, Numeric) { +TEST_F(TestPairwiseDiff, Numeric) { { PairwiseOptions options(1); for (auto input_type : test_numerical_types_) { @@ -151,7 +151,7 @@ TEST_F(TestDiffKernel, Numeric) { } } -TEST_F(TestDiffKernel, Overflow) { +TEST_F(TestPairwiseDiff, Overflow) { { PairwiseOptions options(1); auto input = ArrayFromJSON(uint8(), "[3, 2, 1]"); @@ -169,7 +169,7 @@ TEST_F(TestDiffKernel, Overflow) { } } -TEST_F(TestDiffKernel, Temporal) { +TEST_F(TestPairwiseDiff, Temporal) { { PairwiseOptions options(1); for (auto input_type : test_temporal_types_) { @@ -185,7 +185,7 @@ TEST_F(TestDiffKernel, Temporal) { } } -TEST_F(TestDiffKernel, Decimal) { +TEST_F(TestPairwiseDiff, Decimal) { { PairwiseOptions options(1); auto input = ArrayFromJSON(decimal(4, 2), R"(["11.00", "22.11", "-10.25", "33.45"])"); From a6ad857b780c6f902035706f7de7b5162d11d39e Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Wed, 28 Jun 2023 22:48:01 +0800 Subject: [PATCH 08/17] Update docs/source/cpp/compute.rst Co-authored-by: Benjamin Kietzman --- docs/source/cpp/compute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 63c1a4abc0a45..34699eb715336 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1850,7 +1850,7 @@ replaced, based on the remaining inputs. Pairwise functions ~~~~~~~~~~~~~~~~~~~~ -Pairwise functions are unary vector functions that performs a binary operation on +Pairwise functions are unary vector functions that perform a binary operation on a pair of elements in the input array, typically on adjacent elements. The n-th output is computed by applying the binary operation on the n-th and (n-p)-th, where p is the period. The default period is 1. The period can also be negative. From fe641f04bf25e9d3331ac98b066401be2cfd96a7 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Wed, 28 Jun 2023 22:48:12 +0800 Subject: [PATCH 09/17] Update cpp/src/arrow/compute/api_vector.h Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/compute/api_vector.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 5b0dbbddb9d48..a4e7bf819e66a 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -663,10 +663,13 @@ Result CumulativeMin( /// \brief Return the first order difference of an array. /// -/// Computes the first order difference of an array, i.e. output[i] = input[i] - input[i - -/// p] if i >= p, otherwise output[i] = null, where p is the period. For example, with p = -/// 1, Diff([1, 4, 9, 10, 15]) = [null, 3, 5, 1, 5]. With p = 2, Diff([1, 4, 9, 10, 15]) = -/// [null, null, 8, 6, 6] +/// Computes the first order difference of an array, i.e. +/// output[i] = input[i] - input[i - p] if i >= p +/// output[i] = null otherwise +/// where p is the period. For example, with p = 1, +/// Diff([1, 4, 9, 10, 15]) = [null, 3, 5, 1, 5]. +/// With p = 2, +/// Diff([1, 4, 9, 10, 15]) = [null, null, 8, 6, 6] /// /// \param[in] array array input /// \param[in] options options, specifying overflow behavior and period From 17464af70086b9182d952798a89af06d9ceac733 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Sat, 24 Jun 2023 20:58:36 +0800 Subject: [PATCH 10/17] update doc --- cpp/src/arrow/compute/api_vector.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index a4e7bf819e66a..c85db1aa3ba88 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -670,7 +670,8 @@ Result CumulativeMin( /// Diff([1, 4, 9, 10, 15]) = [null, 3, 5, 1, 5]. /// With p = 2, /// Diff([1, 4, 9, 10, 15]) = [null, null, 8, 6, 6] -/// +/// p can also be negative, in which case the diff is computed in +/// the opposite direction. /// \param[in] array array input /// \param[in] options options, specifying overflow behavior and period /// \param[in] check_overflow whether to return error on overflow From c7577484aee3563d8dca1e0ac914d0714b7587ca Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Sat, 24 Jun 2023 21:09:18 +0800 Subject: [PATCH 11/17] lint --- cpp/src/arrow/compute/kernels/vector_pairwise.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise.cc b/cpp/src/arrow/compute/kernels/vector_pairwise.cc index 8db96b288f2c7..d662d7d6f0660 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise.cc @@ -146,7 +146,7 @@ void RegisterPairwiseDiffKernels(std::string_view func_name, auto base_func_result = registry->GetFunction(std::string(base_func_name)); DCHECK_OK(base_func_result.status()); const auto& base_func = checked_cast(**base_func_result); - DCHECK(base_func.arity().num_args == 2); + DCHECK_EQ(base_func.arity().num_args, 2); for (const auto& base_func_kernel : base_func.kernels()) { const auto& base_func_kernel_sig = base_func_kernel->signature; From dadd72092a911a2710e8a9e919a7c72541585877 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Sat, 24 Jun 2023 21:22:34 +0800 Subject: [PATCH 12/17] use template for resolver --- cpp/src/arrow/compute/kernel.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 8ae5cc3538714..5b5b5718e19dc 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -286,20 +286,13 @@ class ARROW_EXPORT OutputType { using Resolver = std::function(KernelContext*, const std::vector&)>; - // For backward compatibility - using ResolverFuncPtr = Result (*)(KernelContext*, - const std::vector&); - /// \brief Output an exact type OutputType(std::shared_ptr type) // NOLINT implicit construction : kind_(FIXED), type_(std::move(type)) {} /// \brief Output a computed type depending on actual input types - OutputType(Resolver resolver) // NOLINT implicit construction - : kind_(COMPUTED), resolver_(std::move(resolver)) {} - - /// \brief For backward compatibility - OutputType(ResolverFuncPtr resolver) // NOLINT implicit construction + template + OutputType(Fn resolver) // NOLINT implicit construction : kind_(COMPUTED), resolver_(std::move(resolver)) {} OutputType(const OutputType& other) { From 3d8179adb9346dc459c1470649505fbf1f3d3278 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Sat, 24 Jun 2023 22:28:28 +0800 Subject: [PATCH 13/17] pr feedback --- .../arrow/compute/kernels/vector_pairwise.cc | 84 ++++++++++--------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise.cc b/cpp/src/arrow/compute/kernels/vector_pairwise.cc index d662d7d6f0660..8917db3fd754a 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise.cc @@ -49,25 +49,21 @@ struct PairwiseState : KernelState { const ArrayKernelExec& scalar_exec; }; -KernelInit GeneratePairwiseInit(const ArrayKernelExec& scalar_exec) { - return [&scalar_exec](KernelContext* ctx, const KernelInitArgs& args) { - return std::make_unique( - checked_cast(*args.options), scalar_exec); - }; -} - /// A generic pairwise implementation that can be reused by different ops. Status PairwiseExecImpl(KernelContext* ctx, const ArraySpan& input, const ArrayKernelExec& scalar_exec, int64_t periods, ArrayData* result) { - auto offset = abs(periods); - offset = std::min(offset, input.length); - auto exec_length = input.length - offset; + // We only compute values in the region where the input-with-offset overlaps + // the original input. The margin where these do not overlap gets filled with null. + auto margin_length = std::min(abs(periods), input.length); + auto computed_length = input.length - margin_length; + auto margin_start = periods > 0 ? 0 : computed_length; + auto computed_start = periods > 0 ? margin_length : 0; + auto left_start = periods > 0 ? margin_length : 0; + auto right_start = periods > 0 ? 0 : margin_length; // prepare bitmap - auto null_start = periods > 0 ? 0 : exec_length; - auto non_null_start = periods > 0 ? offset : 0; - bit_util::ClearBitmap(result->buffers[0]->mutable_data(), null_start, offset); - for (int64_t i = non_null_start; i < non_null_start + exec_length; i++) { + bit_util::ClearBitmap(result->buffers[0]->mutable_data(), margin_start, margin_length); + for (int64_t i = computed_start; i < computed_start + computed_length; i++) { if (input.IsValid(i) && input.IsValid(i - periods)) { bit_util::SetBit(result->buffers[0]->mutable_data(), i); } else { @@ -76,17 +72,17 @@ Status PairwiseExecImpl(KernelContext* ctx, const ArraySpan& input, } // prepare input span ArraySpan left(input); - left.SetSlice(periods > 0 ? offset : 0, exec_length); + left.SetSlice(left_start, computed_length); ArraySpan right(input); - right.SetSlice(periods > 0 ? 0 : offset, exec_length); + right.SetSlice(right_start, computed_length); // prepare output span ArraySpan output_span; output_span.SetMembers(*result); - output_span.offset = periods > 0 ? offset : 0; - output_span.length = exec_length; + output_span.offset = computed_start; + output_span.length = computed_length; ExecResult output{output_span}; // execute scalar function - RETURN_NOT_OK(scalar_exec(ctx, ExecSpan({left, right}, exec_length), &output)); + RETURN_NOT_OK(scalar_exec(ctx, ExecSpan({left, right}, computed_length), &output)); return Status::OK(); } @@ -135,11 +131,11 @@ struct PairwiseKernelData { void RegisterPairwiseDiffKernels(std::string_view func_name, std::string_view base_func_name, const FunctionDoc& doc, FunctionRegistry* registry) { - VectorKernel base_kernel; - base_kernel.can_execute_chunkwise = false; - base_kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE; - base_kernel.mem_allocation = MemAllocation::PREALLOCATE; - base_kernel.init = OptionsWrapper::Init; + VectorKernel kernel; + kernel.can_execute_chunkwise = false; + kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE; + kernel.mem_allocation = MemAllocation::PREALLOCATE; + kernel.init = OptionsWrapper::Init; auto func = std::make_shared(std::string(func_name), Arity::Unary(), doc, GetDefaultPairwiseOptions()); @@ -150,24 +146,30 @@ void RegisterPairwiseDiffKernels(std::string_view func_name, for (const auto& base_func_kernel : base_func.kernels()) { const auto& base_func_kernel_sig = base_func_kernel->signature; - if (base_func_kernel_sig->in_types()[0].Equals(base_func_kernel_sig->in_types()[1])) { - OutputType out_type(base_func_kernel_sig->out_type()); - // Need to wrap base output resolver - if (out_type.kind() == OutputType::COMPUTED) { - const auto& base_resolver = base_func_kernel_sig->out_type().resolver(); - auto resolver = [&base_resolver](KernelContext* ctx, - const std::vector& input_types) { - return base_resolver(ctx, {input_types[0], input_types[0]}); - }; - out_type = OutputType(resolver); - } - - base_kernel.signature = - KernelSignature::Make({base_func_kernel_sig->in_types()[0]}, out_type); - base_kernel.exec = PairwiseExec; - base_kernel.init = GeneratePairwiseInit(base_func_kernel->exec); - DCHECK_OK(func->AddKernel(base_kernel)); + if (!base_func_kernel_sig->in_types()[0].Equals( + base_func_kernel_sig->in_types()[1])) { + continue; } + OutputType out_type(base_func_kernel_sig->out_type()); + // Need to wrap base output resolver + if (out_type.kind() == OutputType::COMPUTED) { + out_type = + OutputType([base_resolver = base_func_kernel_sig->out_type().resolver()]( + KernelContext* ctx, const std::vector& input_types) { + return base_resolver(ctx, {input_types[0], input_types[0]}); + }); + } + + kernel.signature = + KernelSignature::Make({base_func_kernel_sig->in_types()[0]}, out_type); + kernel.exec = PairwiseExec; + kernel.init = [scalar_exec = base_func_kernel->exec](KernelContext* ctx, + const KernelInitArgs& args) { + return std::make_unique( + checked_cast(*args.options), scalar_exec); + }; + ; + DCHECK_OK(func->AddKernel(kernel)); } DCHECK_OK(registry->AddFunction(std::move(func))); From 6da2dc24365cf18af84acbf835d0aed81a57d820 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Sat, 24 Jun 2023 22:31:25 +0800 Subject: [PATCH 14/17] save exec by value --- cpp/src/arrow/compute/kernels/vector_pairwise.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise.cc b/cpp/src/arrow/compute/kernels/vector_pairwise.cc index 8917db3fd754a..259cf66ba2d5c 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise.cc @@ -42,11 +42,11 @@ namespace arrow::compute::internal { // We reuse the kernel exec function of a scalar binary function to compute pairwise // results. For example, for pairwise_diff, we reuse subtract's kernel exec. struct PairwiseState : KernelState { - PairwiseState(const PairwiseOptions& options, const ArrayKernelExec& scalar_exec) + PairwiseState(const PairwiseOptions& options, ArrayKernelExec scalar_exec) : periods(options.periods), scalar_exec(scalar_exec) {} int64_t periods; - const ArrayKernelExec& scalar_exec; + ArrayKernelExec scalar_exec; }; /// A generic pairwise implementation that can be reused by different ops. From d16d034fafc1cbfafd4638765d0812cbf6a29034 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Sat, 24 Jun 2023 22:34:51 +0800 Subject: [PATCH 15/17] update var name --- cpp/src/arrow/compute/kernels/vector_pairwise.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise.cc b/cpp/src/arrow/compute/kernels/vector_pairwise.cc index 259cf66ba2d5c..0aae5c8810a4e 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise.cc @@ -59,8 +59,8 @@ Status PairwiseExecImpl(KernelContext* ctx, const ArraySpan& input, auto computed_length = input.length - margin_length; auto margin_start = periods > 0 ? 0 : computed_length; auto computed_start = periods > 0 ? margin_length : 0; - auto left_start = periods > 0 ? margin_length : 0; - auto right_start = periods > 0 ? 0 : margin_length; + auto left_start = computed_start; + auto right_start = margin_length - computed_start; // prepare bitmap bit_util::ClearBitmap(result->buffers[0]->mutable_data(), margin_start, margin_length); for (int64_t i = computed_start; i < computed_start + computed_length; i++) { From d335071954ce2fc9c93de449c6edf1e7514462ec Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Thu, 29 Jun 2023 11:19:13 +0800 Subject: [PATCH 16/17] Update docs/source/cpp/compute.rst Co-authored-by: Benjamin Kietzman --- docs/source/cpp/compute.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 34699eb715336..55e29588129b8 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1852,8 +1852,11 @@ Pairwise functions ~~~~~~~~~~~~~~~~~~~~ Pairwise functions are unary vector functions that perform a binary operation on a pair of elements in the input array, typically on adjacent elements. The n-th -output is computed by applying the binary operation on the n-th and (n-p)-th, -where p is the period. The default period is 1. The period can also be negative. +output is computed by applying the binary operation to the n-th and (n-p)-th inputs, +where p is the period. The default period is 1, in which case the binary +operation is applied to adjacent pairs of inputs. The period can also be +negative, in which case the n-th output is computed by applying the binary +operation to the n-th and (n+abs(p))-th inputs. +------------------------+-------+----------------------+----------------------+--------------------------------+----------+ | Function name | Arity | Input types | Output type | Options class | Notes | From c51f65973fb08bc0f48f844a2eab69ab588cfe43 Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Sun, 25 Jun 2023 00:23:34 +0800 Subject: [PATCH 17/17] lint --- cpp/src/arrow/compute/kernels/vector_pairwise.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/vector_pairwise.cc b/cpp/src/arrow/compute/kernels/vector_pairwise.cc index 0aae5c8810a4e..440b1393a3ab2 100644 --- a/cpp/src/arrow/compute/kernels/vector_pairwise.cc +++ b/cpp/src/arrow/compute/kernels/vector_pairwise.cc @@ -168,7 +168,6 @@ void RegisterPairwiseDiffKernels(std::string_view func_name, return std::make_unique( checked_cast(*args.options), scalar_exec); }; - ; DCHECK_OK(func->AddKernel(kernel)); }