From 7635a71d27a3f95f1db6940d1cd2227889529a3d Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Wed, 13 Apr 2022 15:33:16 -0700 Subject: [PATCH 001/198] Fix typo in vectorCompare benchmark. Summary: title. Reviewed By: mbasmanova Differential Revision: D35623222 fbshipit-source-id: e26b5ac0a04c57c898db5afa1adb74e9bcb7cd0b --- velox/benchmarks/basic/VectorCompare.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/velox/benchmarks/basic/VectorCompare.cpp b/velox/benchmarks/basic/VectorCompare.cpp index 6af4470aedbb..6d79d87940dc 100644 --- a/velox/benchmarks/basic/VectorCompare.cpp +++ b/velox/benchmarks/basic/VectorCompare.cpp @@ -91,7 +91,7 @@ BENCHMARK_MULTI(compareSimilarSimpleFlat) { return benchmark->run(benchmark->flatVector_); } -BENCHMARK_MULTI(compareSimilarSSimpleFlatNoDispatch) { +BENCHMARK_MULTI(compareSimilarSimpleFlatNoDispatch) { return benchmark->runFastFlat(); } From 16e1f2ebfeacb3ddc130bbede6302a5d0eca26fc Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Wed, 13 Apr 2022 16:03:11 -0700 Subject: [PATCH 002/198] Fix March update feature of the month (#1395) Summary: `Feature Of The Month` was incorrectly tagged as a chapter and hence it appeared in the main `Monthly Updates` list. It is now tagged as a section. Pull Request resolved: https://github.com/facebookincubator/velox/pull/1395 Reviewed By: laithsakka Differential Revision: D35627299 Pulled By: mbasmanova fbshipit-source-id: 73f2d96fa0f4a925807f1ececca7096bfe2b4b16 --- velox/docs/monthly-updates/march-2022.rst | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/velox/docs/monthly-updates/march-2022.rst b/velox/docs/monthly-updates/march-2022.rst index 9c4d8e72dbf3..d19b8a369d92 100644 --- a/velox/docs/monthly-updates/march-2022.rst +++ b/velox/docs/monthly-updates/march-2022.rst @@ -3,12 +3,12 @@ March 2022 Update ******************** Documentation -------------- +============= * Document :doc:`printPlanWithStats <../develop/debugging/print-plan-with-stats>` debugging tool. Core Library ------------- +============ * Add support for UNNEST WITH ORDINALITY for single array or map. * Add support for filter to INNER and LEFT merge joins. @@ -29,7 +29,7 @@ Core Library * Fix crashes and correctness bugs in joins and aggregations. Presto Functions ----------------- +================ * Add support for CAST(x as JSON). * Add :func:`arrays_overlap` function. @@ -38,7 +38,7 @@ Presto Functions * Fix :func:`approx_distinct` to return 0 instead of null with all inputs are null. Performance and Correctness Testing ------------------------------------ +=================================== * Add linux-benchmarks-basic CircleCI job to run micro benchmarks on each PR. Initial set of benchmarks covers SelectivityVector, DecodedVector, simple comparisons and conjuncts. * Add TPC-H benchmark with support for q1, q6 and q18. @@ -46,7 +46,7 @@ Performance and Correctness Testing * Add support for ORDER BY SQL clauses to PlanBuilder::localMerge() and PlanBuilder::orderBy(). Debugging Experience --------------------- +==================== * Improve error messages in expression evaluation by including the expression being evaluated. :pr:`1138` * Improve error messages in CAST expression by including the to and from types. :pr:`1150` @@ -56,7 +56,7 @@ Debugging Experience * Add runtime statistic "loadedToValueHook" to track number of values processed via aggregation pushdown into scan. Credits -------- +======= Aditi Pandit, Amit Dutta, Amlan Nayak, Chad Austin, Chao Chen, David Greenberg, Deepak Majeti, Dimitri Bouche, Gilson Takaasi Gil, Hanqi Wu, Huameng Jiang, @@ -68,10 +68,8 @@ Sridhar Anumandla, Victor Zverovich, Wei He, Wenlei Xie, Yoav Helfman, Yuan Chao Chou, Zhenyuan Zhao, tanjialiang -******************** Feature Of The Month -******************** - +==================== Using vector readers/writers to simplify dealing with Velox vectors. -------------------------------------------------------------------- From 525acae8235c8a9c74ea098e3f3577c0be3d2de2 Mon Sep 17 00:00:00 2001 From: Sergey Pershin Date: Wed, 13 Apr 2022 16:08:37 -0700 Subject: [PATCH 003/198] Support non-Flat vector in CountAggregate::addSingleGroupIntermediateResults() (#1419) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1419 Some queries might have local exchanges before INTERMEDIATE aggregation. The local exchanges produce dictionaries, so we need to expect these in CountAggregate::addSingleGroupIntermediateResults() Example query: https://www.internalfb.com/intern/presto/query/?query_id=20220411_214537_01209_dehmi#sql The diff: * Adds support for any encoded vector in CountAggregate::addSingleGroupIntermediateResults(). * Adds support for RoundRobin Local Partition Node in PlanBuilder. * Removes the check Partial Aggregation before the Final Aggregation. Reviewed By: mbasmanova Differential Revision: D35603243 fbshipit-source-id: 273932df0c6faabf2e8135d9d4a33216bc9255cf --- velox/exec/tests/utils/PlanBuilder.cpp | 42 ++++++++++++++----- velox/exec/tests/utils/PlanBuilder.h | 4 ++ .../prestosql/aggregates/CountAggregate.cpp | 14 +++---- .../aggregates/tests/CountAggregationTest.cpp | 26 ++++++++++++ 4 files changed, 69 insertions(+), 17 deletions(-) diff --git a/velox/exec/tests/utils/PlanBuilder.cpp b/velox/exec/tests/utils/PlanBuilder.cpp index ae39111800f9..73fc87a33b51 100644 --- a/velox/exec/tests/utils/PlanBuilder.cpp +++ b/velox/exec/tests/utils/PlanBuilder.cpp @@ -15,11 +15,12 @@ */ #include "velox/exec/tests/utils/PlanBuilder.h" -#include -#include #include "velox/common/memory/Memory.h" #include "velox/connectors/hive/HiveConnector.h" #include "velox/duckdb/conversion/DuckParser.h" +#include "velox/exec/Aggregate.h" +#include "velox/exec/HashPartitionFunction.h" +#include "velox/exec/RoundRobinPartitionFunction.h" #include "velox/expression/SignatureBinder.h" #include "velox/parse/Expressions.h" #include "velox/type/tests/FilterBuilder.h" @@ -647,14 +648,15 @@ PlanBuilder& PlanBuilder::finalAggregation() { const auto* aggNode = findPartialAggregation(planNode_.get()); if (!exec::isRawInput(aggNode->step())) { - // Check the source node. - aggNode = - dynamic_cast(aggNode->sources()[0].get()); - VELOX_CHECK_NOT_NULL( - aggNode, - "Plan node before current plan node must be a partial aggregation."); - VELOX_CHECK(exec::isRawInput(aggNode->step())); - VELOX_CHECK(exec::isPartialOutput(aggNode->step())); + // If aggregation node is not the partial aggregation, keep looking again. + aggNode = findPartialAggregation(aggNode->sources()[0].get()); + if (!exec::isRawInput(aggNode->step())) { + VELOX_CHECK_NOT_NULL( + aggNode, + "Plan node before current plan node must be a partial aggregation."); + VELOX_CHECK(exec::isRawInput(aggNode->step())); + VELOX_CHECK(exec::isPartialOutput(aggNode->step())); + } } auto step = core::AggregationNode::Step::kFinal; @@ -931,6 +933,26 @@ PlanBuilder& PlanBuilder::localPartition( return *this; } +PlanBuilder& PlanBuilder::localPartitionRoundRobin( + const std::vector>& sources, + const std::vector& outputLayout) { + VELOX_CHECK_NULL(planNode_, "localPartition() must be the first call"); + auto inputType = sources[0]->outputType(); + auto outputType = outputLayout.empty() ? sources[0]->outputType() + : extract(inputType, outputLayout); + auto partitionFunctionFactory = [](auto numPartitions) { + return std::make_unique( + numPartitions); + }; + planNode_ = std::make_shared( + nextPlanNodeId(), + core::LocalPartitionNode::Type::kRepartition, + partitionFunctionFactory, + outputType, + sources); + return *this; +} + PlanBuilder& PlanBuilder::hashJoin( const std::vector& leftKeys, const std::vector& rightKeys, diff --git a/velox/exec/tests/utils/PlanBuilder.h b/velox/exec/tests/utils/PlanBuilder.h index 3d58f2bc1928..921552e022dd 100644 --- a/velox/exec/tests/utils/PlanBuilder.h +++ b/velox/exec/tests/utils/PlanBuilder.h @@ -334,6 +334,10 @@ class PlanBuilder { const std::vector>& sources, const std::vector& outputLayout = {}); + PlanBuilder& localPartitionRoundRobin( + const std::vector>& sources, + const std::vector& outputLayout = {}); + // 'leftKeys' and 'rightKeys' are column names of the output of the // previous PlanNode and 'build', respectively. 'output' is a subset of // column names from the left and right sides of the join to project out. diff --git a/velox/functions/prestosql/aggregates/CountAggregate.cpp b/velox/functions/prestosql/aggregates/CountAggregate.cpp index b0f681141327..6a73566f97ef 100644 --- a/velox/functions/prestosql/aggregates/CountAggregate.cpp +++ b/velox/functions/prestosql/aggregates/CountAggregate.cpp @@ -120,19 +120,19 @@ class CountAggregate : public SimpleNumericAggregate { const SelectivityVector& rows, const std::vector& args, bool /*mayPushdown*/) override { - VELOX_CHECK_EQ(args[0]->encoding(), VectorEncoding::Simple::FLAT); + decodedIntermediate_.decode(*args[0], rows); - auto vector = args[0]->asUnchecked>(); - auto rawValues = vector->rawValues(); int64_t count = 0; - if (vector->mayHaveNulls()) { + if (decodedIntermediate_.mayHaveNulls()) { rows.applyToSelected([&](vector_size_t i) { - if (!vector->isNullAt(i)) { - count += rawValues[i]; + if (!decodedIntermediate_.isNullAt(i)) { + count += decodedIntermediate_.valueAt(i); } }); } else { - rows.applyToSelected([&](vector_size_t i) { count += rawValues[i]; }); + rows.applyToSelected([&](vector_size_t i) { + count += decodedIntermediate_.valueAt(i); + }); } addToGroup(group, count); diff --git a/velox/functions/prestosql/aggregates/tests/CountAggregationTest.cpp b/velox/functions/prestosql/aggregates/tests/CountAggregationTest.cpp index 82451316c743..edb9d244753c 100644 --- a/velox/functions/prestosql/aggregates/tests/CountAggregationTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/CountAggregationTest.cpp @@ -117,6 +117,32 @@ TEST_F(CountAggregation, count) { assertQuery(params, "SELECT c0 % 10, count(1) FROM tmp GROUP BY 1"); } + + // Add local exchange before intermediate aggregation. Single group. + { + auto planNodeIdGenerator = std::make_shared(); + + CursorParameters params; + params.planNode = + PlanBuilder(planNodeIdGenerator) + .localPartition( + {}, + {PlanBuilder(planNodeIdGenerator) + .localPartitionRoundRobin( + {PlanBuilder(planNodeIdGenerator) + .values(vectors) + .project({"c0"}) + .partialAggregation({}, {"count(1)"}) + .planNode()}) + .intermediateAggregation() + .planNode()}) + .finalAggregation() + .planNode(); + + params.maxDrivers = 2; + + assertQuery(params, "SELECT count(1) FROM tmp"); + } } } // namespace From 6f2ee9e340a2c4f0b044274f2bf889686c8df038 Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Wed, 13 Apr 2022 16:11:22 -0700 Subject: [PATCH 004/198] Use FunctionSignaturePtr to simplify (#1413) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1413 title Reviewed By: kevinwilfong Differential Revision: D35479812 fbshipit-source-id: 1de41765ac69939d43e408b8a9c4b22e73672ece --- velox/docs/develop/scalar-functions.rst | 30 ++++++++++++------------- velox/expression/FunctionSignature.cpp | 2 +- velox/expression/FunctionSignature.h | 4 +++- velox/expression/VectorFunction.cpp | 8 +++---- velox/expression/VectorFunction.h | 10 ++++----- 5 files changed, 28 insertions(+), 26 deletions(-) diff --git a/velox/docs/develop/scalar-functions.rst b/velox/docs/develop/scalar-functions.rst index 323beab16a1f..0a53b84330d2 100644 --- a/velox/docs/develop/scalar-functions.rst +++ b/velox/docs/develop/scalar-functions.rst @@ -142,7 +142,7 @@ an array: template struct ArrayMinFunction { VELOX_DEFINE_FUNCTION_TYPES(TExecParams); - + template FOLLY_ALWAYS_INLINE bool callNullFree( TInput& out, @@ -182,7 +182,7 @@ An example of such function is rand(): template struct RandFunction { static constexpr bool is_deterministic = false; - + FOLLY_ALWAYS_INLINE bool call(double& result) { result = folly::Random::randDouble01(); return true; @@ -214,10 +214,10 @@ Here is an example of a trim function: template struct TrimFunction { VELOX_DEFINE_FUNCTION_TYPES(TExecParams); - + // ASCII input always produces ASCII result. static constexpr bool is_default_ascii_behavior = true; - + // Properly handles multi-byte characters. FOLLY_ALWAYS_INLINE bool call( out_type& result, @@ -225,7 +225,7 @@ Here is an example of a trim function: stringImpl::trimUnicodeWhiteSpace(result, input); return true; } - + // Assumes input is all ASCII. FOLLY_ALWAYS_INLINE bool callAscii( out_type& result, @@ -276,15 +276,15 @@ properties and using it when processing inputs. template struct HourFunction { VELOX_DEFINE_FUNCTION_TYPES(TExecParams); - + const date::time_zone* timeZone_ = nullptr; - + FOLLY_ALWAYS_INLINE void initialize( const core::QueryConfig& config, const arg_type* /*timestamp*/) { timeZone_ = getTimeZoneFromConfig(config); } - + FOLLY_ALWAYS_INLINE bool call( int64_t& result, const arg_type& timestamp) { @@ -305,10 +305,10 @@ individual rows. template struct DateTruncFunction { VELOX_DEFINE_FUNCTION_TYPES(TExecParams); - + const date::time_zone* timeZone_ = nullptr; std::optional unit_; - + FOLLY_ALWAYS_INLINE void initialize( const core::QueryConfig& config, const arg_type* unitString, @@ -318,7 +318,7 @@ individual rows. unit_ = fromDateTimeUnitString(*unitString); } } - + FOLLY_ALWAYS_INLINE bool call( out_type& result, const arg_type& unitString, @@ -371,7 +371,7 @@ Here is an example with ceil function. .. code-block:: c++ #include "velox/functions/prestosql/ArithmeticImpl.h" - + template struct CeilFunction { template @@ -504,7 +504,7 @@ ArrayView supports the following: template struct ArraySum { VELOX_DEFINE_FUNCTION_TYPES(T); - + bool call(const int64_t& output, const arg_type>& array) { output = 0; for (const auto& value : array.skipNulls()) { @@ -1023,7 +1023,7 @@ Use f4d::exec::registerVectorFunction to register a stateless vector function. bool registerVectorFunction( const std::string& name, - std::vector> signatures, + std::vector signatures, std::unique_ptr func, bool overwrite = true) @@ -1039,7 +1039,7 @@ function. bool registerStatefulVectorFunction( const std::string& name, - std::vector> signatures, + std::vector signatures, VectorFunctionFactory factory, bool overwrite = true) diff --git a/velox/expression/FunctionSignature.cpp b/velox/expression/FunctionSignature.cpp index 7030a807bd44..e6348524d154 100644 --- a/velox/expression/FunctionSignature.cpp +++ b/velox/expression/FunctionSignature.cpp @@ -174,7 +174,7 @@ FunctionSignature::FunctionSignature( validate(typeVariableConstants_, returnType_, argumentTypes_); } -std::shared_ptr FunctionSignatureBuilder::build() { +FunctionSignaturePtr FunctionSignatureBuilder::build() { VELOX_CHECK(returnType_.has_value()); return std::make_shared( std::move(typeVariableConstants_), diff --git a/velox/expression/FunctionSignature.h b/velox/expression/FunctionSignature.h index e5b278d256fa..cb551ceb5e5d 100644 --- a/velox/expression/FunctionSignature.h +++ b/velox/expression/FunctionSignature.h @@ -120,6 +120,8 @@ class FunctionSignature { const bool variableArity_; }; +using FunctionSignaturePtr = std::shared_ptr; + class AggregateFunctionSignature : public FunctionSignature { public: AggregateFunctionSignature( @@ -194,7 +196,7 @@ class FunctionSignatureBuilder { return *this; } - std::shared_ptr build(); + FunctionSignaturePtr build(); private: std::vector typeVariableConstants_; diff --git a/velox/expression/VectorFunction.cpp b/velox/expression/VectorFunction.cpp index eee914539da8..3a53fa442471 100644 --- a/velox/expression/VectorFunction.cpp +++ b/velox/expression/VectorFunction.cpp @@ -25,8 +25,8 @@ VectorFunctionMap& vectorFunctionFactories() { return factories; } -std::optional>> -getVectorFunctionSignatures(const std::string& name) { +std::optional> getVectorFunctionSignatures( + const std::string& name) { return vectorFunctionFactories().withRLock([&name](auto& functions) -> auto { auto it = functions.find(name); return it == functions.end() ? std::nullopt @@ -68,7 +68,7 @@ std::shared_ptr getVectorFunction( /// Returns true iff an insertion actually happened bool registerStatefulVectorFunction( const std::string& name, - std::vector> signatures, + std::vector signatures, VectorFunctionFactory factory, bool overwrite) { if (overwrite) { @@ -89,7 +89,7 @@ bool registerStatefulVectorFunction( // Returns true iff an insertion actually happened bool registerVectorFunction( const std::string& name, - std::vector> signatures, + std::vector signatures, std::unique_ptr func, bool overwrite) { std::shared_ptr sharedFunc = std::move(func); diff --git a/velox/expression/VectorFunction.h b/velox/expression/VectorFunction.h index a15ec82bb4a8..420f23b573d7 100644 --- a/velox/expression/VectorFunction.h +++ b/velox/expression/VectorFunction.h @@ -131,8 +131,8 @@ class SimpleFunctionAdapterFactory { /// Returns a list of signatures supported by VectorFunction with the specified /// name. Returns std::nullopt if there is no function with the specified name. -std::optional>> -getVectorFunctionSignatures(const std::string& name); +std::optional> getVectorFunctionSignatures( + const std::string& name); /// Returns an instance of VectorFunction for the given name, input types and /// optionally constant input values. @@ -150,7 +150,7 @@ std::shared_ptr getVectorFunction( /// Returns true iff an new function is inserted bool registerVectorFunction( const std::string& name, - std::vector> signatures, + std::vector signatures, std::unique_ptr func, bool overwrite = true); @@ -166,7 +166,7 @@ using VectorFunctionFactory = std::function( const std::vector& inputArgs)>; struct VectorFunctionEntry { - std::vector> signatures; + std::vector signatures; VectorFunctionFactory factory; }; @@ -198,7 +198,7 @@ VectorFunctionFactory makeVectorFunctionFactory() { // Returns true iff the function was inserted bool registerStatefulVectorFunction( const std::string& name, - std::vector> signatures, + std::vector signatures, VectorFunctionFactory factory, bool overwrite = true); From 39ed3fedce537c8a4d4b08239c8117f41d03332b Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Wed, 13 Apr 2022 16:52:55 -0700 Subject: [PATCH 005/198] Add materialize for StringView (#1418) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1418 to be consistent with ArrayView and MapView, materialize is added as well to StringView. Reviewed By: kevinwilfong Differential Revision: D35623645 fbshipit-source-id: 900d6f16f2da7060206730faa2ee25ebd56ba235 --- velox/type/StringView.h | 20 ++++++++++++-------- velox/type/tests/StringViewTest.cpp | 1 + 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/velox/type/StringView.h b/velox/type/StringView.h index 44855319b98d..0927da62276a 100644 --- a/velox/type/StringView.h +++ b/velox/type/StringView.h @@ -111,10 +111,6 @@ struct StringView { return size_; } - std::string getString() const { - return std::string(data(), size()); - } - friend std::ostream& operator<<( std::ostream& os, const StringView& stringView) { @@ -191,6 +187,18 @@ struct StringView { return std::string(data(), size()); } + std::string str() const { + return *this; + } + + std::string getString() const { + return *this; + } + + std::string materialize() const { + return *this; + } + operator folly::dynamic() const { return folly::dynamic(folly::StringPiece(data(), size())); } @@ -199,10 +207,6 @@ struct StringView { return std::string_view(data(), size()); } - std::string str() const { - return std::string(data(), size()); - } - const char* begin() const { return data(); } diff --git a/velox/type/tests/StringViewTest.cpp b/velox/type/tests/StringViewTest.cpp index 3a639270cc0a..657df2229a00 100644 --- a/velox/type/tests/StringViewTest.cpp +++ b/velox/type/tests/StringViewTest.cpp @@ -34,6 +34,7 @@ TEST(StringView, basic) { } else { EXPECT_EQ(view.data(), subText.data()); } + EXPECT_EQ(view.materialize(), subText); EXPECT_EQ(view.getString(), subText); EXPECT_EQ(view, StringView(subText.data(), subText.size())); From b4823acb7b91f5e1cb1d9e2f2892121f87b9d787 Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Wed, 13 Apr 2022 18:11:27 -0700 Subject: [PATCH 006/198] Add runtime stats to expressions (#1412) Summary: Start tracking CPU and wall time spent evaluating each expression. Also, track total number of rows processed. Pull Request resolved: https://github.com/facebookincubator/velox/pull/1412 Reviewed By: pedroerp Differential Revision: D35622286 Pulled By: mbasmanova fbshipit-source-id: 3c2a9cbc377757cf08fb086ce0c3941cdcfa924c --- velox/expression/Expr.cpp | 3 +++ velox/expression/Expr.h | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/velox/expression/Expr.cpp b/velox/expression/Expr.cpp index d0a922ba8093..3564c37b705e 100644 --- a/velox/expression/Expr.cpp +++ b/velox/expression/Expr.cpp @@ -1070,6 +1070,9 @@ void Expr::applyFunction( const SelectivityVector& rows, EvalCtx* context, VectorPtr* result) { + stats_.numProcessedRows += rows.countSelected(); + CpuWallTimer timer(stats_.timing); + computeIsAsciiForInputs(vectorFunction_.get(), inputValues_, rows); auto isAscii = type()->isVarchar() ? computeIsAsciiForResult(vectorFunction_.get(), inputValues_, rows) diff --git a/velox/expression/Expr.h b/velox/expression/Expr.h index 149e781f85a1..fb6ceea7f70a 100644 --- a/velox/expression/Expr.h +++ b/velox/expression/Expr.h @@ -20,6 +20,7 @@ #include +#include "velox/common/time/CpuWallTimer.h" #include "velox/core/Expressions.h" #include "velox/expression/DecodedArgs.h" #include "velox/expression/EvalCtx.h" @@ -31,6 +32,11 @@ class ExprSet; class FieldReference; class VectorFunction; +struct ExprStats { + CpuWallTiming timing; + uint64_t numProcessedRows{0}; +}; + // An executable expression. class Expr { public: @@ -318,6 +324,9 @@ class Expr { // Count of times the cacheable vector is seen for a non-first time. int32_t numCacheableRepeats_{0}; + + /// Runtime statistics. CPU time, wall time and number of processed rows. + ExprStats stats_; }; using ExprPtr = std::shared_ptr; From e6800370ec3891b42e2d7a3eabfd02f4c9bea93f Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Wed, 13 Apr 2022 18:11:32 -0700 Subject: [PATCH 007/198] Fix global array_agg running on empty dataset (#1420) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1420 Reviewed By: spershin Differential Revision: D35631589 Pulled By: mbasmanova fbshipit-source-id: 0fb04f11ca44a22b92c72464be68fcd56fdbd752 --- .../prestosql/aggregates/ArrayAggAggregate.cpp | 14 +++++++++----- .../prestosql/aggregates/tests/ArrayAggTest.cpp | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/velox/functions/prestosql/aggregates/ArrayAggAggregate.cpp b/velox/functions/prestosql/aggregates/ArrayAggAggregate.cpp index 7b53d467a337..4c21f9d3a6ad 100644 --- a/velox/functions/prestosql/aggregates/ArrayAggAggregate.cpp +++ b/velox/functions/prestosql/aggregates/ArrayAggAggregate.cpp @@ -68,12 +68,16 @@ class ArrayAggAggregate : public exec::Aggregate { auto& values = value(groups[i])->elements; auto arraySize = values.size(); - ValueListReader reader(values); - for (auto index = 0; index < arraySize; ++index) { - reader.next(*elements, offset + index); + if (arraySize) { + ValueListReader reader(values); + for (auto index = 0; index < arraySize; ++index) { + reader.next(*elements, offset + index); + } + vector->setOffsetAndSize(i, offset, arraySize); + offset += arraySize; + } else { + vector->setOffsetAndSize(i, offset, 0); } - vector->setOffsetAndSize(i, offset, arraySize); - offset += arraySize; } } diff --git a/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp b/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp index 4063f1ad3793..45f1c471ea16 100644 --- a/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp @@ -121,5 +121,19 @@ TEST_F(ArrayAggTest, global) { ASSERT_EQ(velox::variant::array(expected), value); } +TEST_F(ArrayAggTest, globalNoData) { + std::vector vectors = { + vectorMaker_.rowVector(ROW({"c0"}, {INTEGER()}), 0)}; + + auto op = PlanBuilder() + .values(vectors) + .partialAggregation({}, {"array_agg(c0)"}) + .finalAggregation() + .planNode(); + + auto value = readSingleValue(op); + ASSERT_EQ(velox::variant::array({}), value); +} + } // namespace } // namespace facebook::velox::aggregate::test From fae085c4303e06632d62c58344611d13b2a1801b Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Wed, 13 Apr 2022 19:56:22 -0700 Subject: [PATCH 008/198] Fix global map_agg running on empty dataset (#1421) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1421 Reviewed By: spershin Differential Revision: D35632552 Pulled By: mbasmanova fbshipit-source-id: de9545cb6330caeecab3e62d67a252f058d70e69 --- .../prestosql/aggregates/MapAggAggregate.cpp | 18 +++++++++++------- .../prestosql/aggregates/tests/MapAggTest.cpp | 18 ++++++++++++++++++ 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/velox/functions/prestosql/aggregates/MapAggAggregate.cpp b/velox/functions/prestosql/aggregates/MapAggAggregate.cpp index d4e5e1f847da..6813f0d90d54 100644 --- a/velox/functions/prestosql/aggregates/MapAggAggregate.cpp +++ b/velox/functions/prestosql/aggregates/MapAggAggregate.cpp @@ -76,14 +76,18 @@ class MapAggAggregate : public exec::Aggregate { auto accumulator = value(group); auto mapSize = accumulator->keys.size(); - ValueListReader keysReader(accumulator->keys); - ValueListReader valuesReader(accumulator->values); - for (auto index = 0; index < mapSize; ++index) { - keysReader.next(*mapKeys, offset + index); - valuesReader.next(*mapValues, offset + index); + if (mapSize) { + ValueListReader keysReader(accumulator->keys); + ValueListReader valuesReader(accumulator->values); + for (auto index = 0; index < mapSize; ++index) { + keysReader.next(*mapKeys, offset + index); + valuesReader.next(*mapValues, offset + index); + } + mapVector->setOffsetAndSize(i, offset, mapSize); + offset += mapSize; + } else { + mapVector->setOffsetAndSize(i, offset, 0); } - mapVector->setOffsetAndSize(i, offset, mapSize); - offset += mapSize; } // canonicalize requires a singly referenced MapVector. std::move diff --git a/velox/functions/prestosql/aggregates/tests/MapAggTest.cpp b/velox/functions/prestosql/aggregates/tests/MapAggTest.cpp index 860b91343c92..1ff550fbd5a7 100644 --- a/velox/functions/prestosql/aggregates/tests/MapAggTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/MapAggTest.cpp @@ -131,6 +131,24 @@ TEST_F(MapAggTest, global) { ASSERT_EQ(mapExpected, readSingleValue(op)); } +TEST_F(MapAggTest, globalNoData) { + std::vector vectors = { + vectorMaker_.rowVector(ROW({"c0", "c1"}, {INTEGER(), DOUBLE()}), 0)}; + + auto op = PlanBuilder() + .values(vectors) + .partialAggregation({}, {"map_agg(c0, c1)"}) + .planNode(); + ASSERT_EQ(velox::variant::map({}), readSingleValue(op)); + + op = PlanBuilder() + .values(vectors) + .partialAggregation({}, {"map_agg(c0, c1)"}) + .finalAggregation() + .planNode(); + ASSERT_EQ(velox::variant::map({}), readSingleValue(op)); +} + TEST_F(MapAggTest, globalDuplicateKeys) { vector_size_t size = 10; From 9d2ae599ad56d2837089b99cd6406185c4bd487b Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Wed, 13 Apr 2022 20:28:54 -0700 Subject: [PATCH 009/198] Use ContinuePromise (#1410) Summary: Use ContinuePromise instead of VeloxPromise throughout the code base for consistency (with ContinueFuture) and readability. Pull Request resolved: https://github.com/facebookincubator/velox/pull/1410 Reviewed By: kagamiori Differential Revision: D35617219 Pulled By: mbasmanova fbshipit-source-id: 5bce0aad8f7bb9f97d94bc688751c5594aa23783 --- velox/common/future/VeloxPromise.h | 3 +++ velox/exec/CrossJoinBuild.cpp | 4 ++-- velox/exec/Exchange.h | 2 +- velox/exec/HashBuild.cpp | 6 +++--- velox/exec/HashProbe.cpp | 2 +- velox/exec/JoinBridge.cpp | 4 ++-- velox/exec/JoinBridge.h | 4 ++-- velox/exec/LocalPartition.cpp | 20 +++++++++---------- velox/exec/LocalPartition.h | 6 +++--- velox/exec/MergeSource.cpp | 10 +++++----- velox/exec/MergeSource.h | 4 ++-- velox/exec/PartitionedOutputBufferManager.cpp | 12 +++++------ velox/exec/PartitionedOutputBufferManager.h | 4 ++-- velox/exec/Task.cpp | 6 +++--- velox/exec/Task.h | 8 +++----- velox/exec/TaskStructs.h | 4 ++-- velox/exec/tests/utils/Cursor.cpp | 4 ++-- velox/exec/tests/utils/Cursor.h | 4 ++-- 18 files changed, 54 insertions(+), 53 deletions(-) diff --git a/velox/common/future/VeloxPromise.h b/velox/common/future/VeloxPromise.h index 18cd3ff9f539..45ca6373afba 100644 --- a/velox/common/future/VeloxPromise.h +++ b/velox/common/future/VeloxPromise.h @@ -68,4 +68,7 @@ std::pair, folly::SemiFuture> makeVeloxPromiseContract( auto f = p.getSemiFuture(); return std::make_pair(std::move(p), std::move(f)); } + +using ContinuePromise = VeloxPromise; + } // namespace facebook::velox diff --git a/velox/exec/CrossJoinBuild.cpp b/velox/exec/CrossJoinBuild.cpp index a0d7ccd6dcf2..c26f1299b3db 100644 --- a/velox/exec/CrossJoinBuild.cpp +++ b/velox/exec/CrossJoinBuild.cpp @@ -19,7 +19,7 @@ namespace facebook::velox::exec { void CrossJoinBridge::setData(std::vector data) { - std::vector> promises; + std::vector promises; { std::lock_guard l(mutex_); VELOX_CHECK(!data_.has_value(), "setData may be called only once"); @@ -72,7 +72,7 @@ BlockingReason CrossJoinBuild::isBlocked(ContinueFuture* future) { void CrossJoinBuild::noMoreInput() { Operator::noMoreInput(); - std::vector> promises; + std::vector promises; std::vector> peers; // The last Driver to hit CrossJoinBuild::finish gathers the data from // all build Drivers and hands it over to the probe side. At this diff --git a/velox/exec/Exchange.h b/velox/exec/Exchange.h index afd003192947..67a85d038f9f 100644 --- a/velox/exec/Exchange.h +++ b/velox/exec/Exchange.h @@ -177,7 +177,7 @@ class ExchangeQueue { bool atEnd_ = false; std::mutex mutex_; std::deque> queue_; - std::vector> promises_; + std::vector promises_; // When set, all promises will be realized and the next dequeue will // throw an exception with this message. std::string error_; diff --git a/velox/exec/HashBuild.cpp b/velox/exec/HashBuild.cpp index c300681dfcde..81c9956bdb61 100644 --- a/velox/exec/HashBuild.cpp +++ b/velox/exec/HashBuild.cpp @@ -23,7 +23,7 @@ namespace facebook::velox::exec { void HashJoinBridge::setHashTable(std::unique_ptr table) { VELOX_CHECK(table, "setHashTable called with null table"); - std::vector> promises; + std::vector promises; { std::lock_guard l(mutex_); VELOX_CHECK(!table_, "setHashTable may be called only once"); @@ -35,7 +35,7 @@ void HashJoinBridge::setHashTable(std::unique_ptr table) { } void HashJoinBridge::setAntiJoinHasNullKeys() { - std::vector> promises; + std::vector promises; { std::lock_guard l(mutex_); VELOX_CHECK( @@ -191,7 +191,7 @@ void HashBuild::noMoreInput() { } Operator::noMoreInput(); - std::vector> promises; + std::vector promises; std::vector> peers; // The last Driver to hit HashBuild::finish gathers the data from // all build Drivers and hands it over to the probe side. At this diff --git a/velox/exec/HashProbe.cpp b/velox/exec/HashProbe.cpp index b2dd59738f03..a1476a9361ac 100644 --- a/velox/exec/HashProbe.cpp +++ b/velox/exec/HashProbe.cpp @@ -576,7 +576,7 @@ void HashProbe::ensureLoadedIfNotAtEnd(ChannelIndex channel) { void HashProbe::noMoreInput() { Operator::noMoreInput(); if (isRightJoin(joinType_) || isFullJoin(joinType_)) { - std::vector> promises; + std::vector promises; std::vector> peers; // The last Driver to hit HashProbe::finish is responsible for producing // non-matching build-side rows for the right join. diff --git a/velox/exec/JoinBridge.cpp b/velox/exec/JoinBridge.cpp index 0ac21ee0d738..95f9af9c25f4 100644 --- a/velox/exec/JoinBridge.cpp +++ b/velox/exec/JoinBridge.cpp @@ -18,14 +18,14 @@ namespace facebook::velox::exec { // static -void JoinBridge::notify(std::vector> promises) { +void JoinBridge::notify(std::vector promises) { for (auto& promise : promises) { promise.setValue(true); } } void JoinBridge::cancel() { - std::vector> promises; + std::vector promises; { std::lock_guard l(mutex_); cancelled_ = true; diff --git a/velox/exec/JoinBridge.h b/velox/exec/JoinBridge.h index f1823b8b5071..3cf50773bc13 100644 --- a/velox/exec/JoinBridge.h +++ b/velox/exec/JoinBridge.h @@ -28,10 +28,10 @@ class JoinBridge { void cancel(); protected: - static void notify(std::vector> promises); + static void notify(std::vector promises); std::mutex mutex_; - std::vector> promises_; + std::vector promises_; bool cancelled_{false}; }; } // namespace facebook::velox::exec diff --git a/velox/exec/LocalPartition.cpp b/velox/exec/LocalPartition.cpp index 19979f1c3b94..28b8cff42019 100644 --- a/velox/exec/LocalPartition.cpp +++ b/velox/exec/LocalPartition.cpp @@ -19,7 +19,7 @@ namespace facebook::velox::exec { namespace { -void notify(std::vector>& promises) { +void notify(std::vector& promises) { for (auto& promise : promises) { promise.setValue(true); } @@ -42,7 +42,7 @@ bool LocalExchangeMemoryManager::increaseMemoryUsage( } void LocalExchangeMemoryManager::decreaseMemoryUsage(int64_t removed) { - std::vector> promises; + std::vector promises; { std::lock_guard l(mutex_); bufferedBytes_ -= removed; @@ -64,8 +64,8 @@ void LocalExchangeQueue::addProducer() { } void LocalExchangeQueue::noMoreProducers() { - std::vector> consumerPromises; - std::vector> producerPromises; + std::vector consumerPromises; + std::vector producerPromises; queue_.withWLock([&](auto& queue) { VELOX_CHECK(!noMoreProducers_, "noMoreProducers can be called only once"); noMoreProducers_ = true; @@ -88,7 +88,7 @@ BlockingReason LocalExchangeQueue::enqueue( ContinueFuture* future) { auto inputBytes = input->retainedSize(); - std::vector> consumerPromises; + std::vector consumerPromises; bool isClosed = queue_.withWLock([&](auto& queue) { if (closed_) { return true; @@ -112,8 +112,8 @@ BlockingReason LocalExchangeQueue::enqueue( } void LocalExchangeQueue::noMoreData() { - std::vector> consumerPromises; - std::vector> producerPromises; + std::vector consumerPromises; + std::vector producerPromises; queue_.withWLock([&](auto queue) { VELOX_CHECK_GT(pendingProducers_, 0); --pendingProducers_; @@ -132,7 +132,7 @@ BlockingReason LocalExchangeQueue::next( ContinueFuture* future, memory::MemoryPool* pool, RowVectorPtr* data) { - std::vector> producerPromises; + std::vector producerPromises; auto blockingReason = queue_.withWLock([&](auto& queue) { *data = nullptr; if (queue.empty()) { @@ -192,8 +192,8 @@ bool LocalExchangeQueue::isFinished() { } void LocalExchangeQueue::close() { - std::vector> producerPromises; - std::vector> consumerPromises; + std::vector producerPromises; + std::vector consumerPromises; queue_.withWLock([&](auto& queue) { uint64_t freedBytes = 0; while (!queue.empty()) { diff --git a/velox/exec/LocalPartition.h b/velox/exec/LocalPartition.h index 9d6e5c0748d0..e030bbc60e10 100644 --- a/velox/exec/LocalPartition.h +++ b/velox/exec/LocalPartition.h @@ -37,7 +37,7 @@ class LocalExchangeMemoryManager { const int64_t maxBufferSize_; std::mutex mutex_; int64_t bufferedBytes_{0}; - std::vector> promises_; + std::vector promises_; }; /// Buffers data for a single partition produced by local exchange. Allows @@ -102,11 +102,11 @@ class LocalExchangeQueue { // Satisfied when data becomes available or all producers report that they // finished producing, e.g. queue_ is not empty or noMoreProducers_ is true // and pendingProducers_ is zero. - std::vector> consumerPromises_; + std::vector consumerPromises_; // Satisfied when all data has been fetched and no more data will be produced, // e.g. queue_ is empty, noMoreProducers_ is true and pendingProducers_ is // zero. - std::vector> producerPromises_; + std::vector producerPromises_; int pendingProducers_{0}; bool noMoreProducers_{false}; bool closed_{false}; diff --git a/velox/exec/MergeSource.cpp b/velox/exec/MergeSource.cpp index bdfc14e4bb1f..2872ca1e20e3 100644 --- a/velox/exec/MergeSource.cpp +++ b/velox/exec/MergeSource.cpp @@ -104,8 +104,8 @@ class LocalMergeSource : public MergeSource { bool atEnd_{false}; boost::circular_buffer data_; - std::vector> consumerPromises_; - std::vector> producerPromises_; + std::vector consumerPromises_; + std::vector producerPromises_; }; folly::Synchronized queue_; @@ -192,7 +192,7 @@ std::shared_ptr MergeSource::createMergeExchangeSource( } namespace { -void notify(std::optional>& promise) { +void notify(std::optional& promise) { if (promise) { promise->setValue(true); promise.reset(); @@ -215,7 +215,7 @@ BlockingReason MergeJoinSource::next( return BlockingReason::kNotBlocked; } - consumerPromise_ = VeloxPromise("MergeJoinSource::next"); + consumerPromise_ = ContinuePromise("MergeJoinSource::next"); *future = consumerPromise_->getSemiFuture(); return BlockingReason::kWaitForExchange; }); @@ -242,7 +242,7 @@ BlockingReason MergeJoinSource::enqueue( state.data = std::move(data); notify(consumerPromise_); - producerPromise_ = VeloxPromise("MergeJoinSource::enqueue"); + producerPromise_ = ContinuePromise("MergeJoinSource::enqueue"); *future = producerPromise_->getSemiFuture(); return BlockingReason::kWaitForConsumer; }); diff --git a/velox/exec/MergeSource.h b/velox/exec/MergeSource.h index 77b2e10ac716..1661fa256b51 100644 --- a/velox/exec/MergeSource.h +++ b/velox/exec/MergeSource.h @@ -65,11 +65,11 @@ class MergeJoinSource { // Satisfied when data becomes available or the producer reports that it // finished producing, e.g. state_.data is not nullptr or state_.atEnd is // true. - std::optional> consumerPromise_; + std::optional consumerPromise_; // Satisfied when previously enqueued data has been consumed, e.g. state_.data // is nullptr. - std::optional> producerPromise_; + std::optional producerPromise_; }; } // namespace facebook::velox::exec diff --git a/velox/exec/PartitionedOutputBufferManager.cpp b/velox/exec/PartitionedOutputBufferManager.cpp index 1bbad17001d0..627fd4067b14 100644 --- a/velox/exec/PartitionedOutputBufferManager.cpp +++ b/velox/exec/PartitionedOutputBufferManager.cpp @@ -138,7 +138,7 @@ namespace { // producers which will allocate more memory. void releaseAfterAcknowledge( std::vector>& freed, - std::vector>& promises) { + std::vector& promises) { freed.clear(); for (auto& promise : promises) { promise.setValue(true); @@ -167,7 +167,7 @@ void PartitionedOutputBuffer::updateBroadcastOutputBuffers( bool noMoreBuffers) { VELOX_CHECK(broadcast_); - std::vector> promises; + std::vector promises; { std::lock_guard l(mutex_); @@ -316,7 +316,7 @@ bool PartitionedOutputBuffer::isFinishedLocked() { void PartitionedOutputBuffer::acknowledge(int destination, int64_t sequence) { std::vector> freed; - std::vector> promises; + std::vector promises; { std::lock_guard l(mutex_); VELOX_CHECK_LT(destination, buffers_.size()); @@ -334,7 +334,7 @@ void PartitionedOutputBuffer::acknowledge(int destination, int64_t sequence) { void PartitionedOutputBuffer::updateAfterAcknowledgeLocked( const std::vector>& freed, - std::vector>& promises) { + std::vector& promises) { uint64_t totalFreed = 0; for (const auto& free : freed) { if (free.unique()) { @@ -359,7 +359,7 @@ void PartitionedOutputBuffer::updateAfterAcknowledgeLocked( bool PartitionedOutputBuffer::deleteResults(int destination) { std::vector> freed; - std::vector> promises; + std::vector promises; bool isFinished; { std::lock_guard l(mutex_); @@ -393,7 +393,7 @@ void PartitionedOutputBuffer::getData( DataAvailableCallback notify) { std::vector> data; std::vector> freed; - std::vector> promises; + std::vector promises; { std::lock_guard l(mutex_); diff --git a/velox/exec/PartitionedOutputBufferManager.h b/velox/exec/PartitionedOutputBufferManager.h index dac60b31fd04..a3984ea811b0 100644 --- a/velox/exec/PartitionedOutputBufferManager.h +++ b/velox/exec/PartitionedOutputBufferManager.h @@ -153,7 +153,7 @@ class PartitionedOutputBuffer { // 'promises'. void updateAfterAcknowledgeLocked( const std::vector>& freed, - std::vector>& promises); + std::vector& promises); /// Given an updated total number of broadcast buffers, add any missing ones /// and enqueue data that has been produced so far (e.g. dataToBroadcast_). @@ -181,7 +181,7 @@ class PartitionedOutputBuffer { std::mutex mutex_; // Actual data size in 'buffers_'. uint64_t totalSize_ = 0; - std::vector> promises_; + std::vector promises_; // One buffer per destination std::vector> buffers_; uint32_t numFinished_{0}; diff --git a/velox/exec/Task.cpp b/velox/exec/Task.cpp index ca3501b5f353..debd0d305084 100644 --- a/velox/exec/Task.cpp +++ b/velox/exec/Task.cpp @@ -891,7 +891,7 @@ bool Task::allPeersFinished( const core::PlanNodeId& planNodeId, Driver* caller, ContinueFuture* future, - std::vector>& promises, + std::vector& promises, std::vector>& peers) { std::lock_guard l(mutex_); if (exception_) { @@ -993,8 +993,8 @@ std::string Task::shortId(const std::string& id) { /// Moves split promises from one vector to another. static void movePromisesOut( - std::vector>& from, - std::vector>& to) { + std::vector& from, + std::vector& to) { for (auto& promise : from) { to.push_back(std::move(promise)); } diff --git a/velox/exec/Task.h b/velox/exec/Task.h index 68901a029729..521add4305fb 100644 --- a/velox/exec/Task.h +++ b/velox/exec/Task.h @@ -31,8 +31,6 @@ class PartitionedOutputBufferManager; class HashJoinBridge; class CrossJoinBridge; -using ContinuePromise = VeloxPromise; - class Task : public std::enable_shared_from_this { public: /// Creates a task to execute a plan fragment, but doesn't start execution @@ -333,7 +331,7 @@ class Task : public std::enable_shared_from_this { const core::PlanNodeId& planNodeId, Driver* FOLLY_NONNULL caller, ContinueFuture* FOLLY_NONNULL future, - std::vector>& promises, + std::vector& promises, std::vector>& peers); // Adds HashJoinBridge's for all the specified plan node IDs. @@ -654,7 +652,7 @@ class Task : public std::enable_shared_from_this { /// Stores separate splits state for each plan node. std::unordered_map splitsStates_; - std::vector> stateChangePromises_; + std::vector stateChangePromises_; TaskStats taskStats_; std::unique_ptr pool_; @@ -686,7 +684,7 @@ class Task : public std::enable_shared_from_this { // Promises for the futures returned to callers of requestPause() or // terminate(). They are fulfilled when the last thread stops // running for 'this'. - std::vector> threadFinishPromises_; + std::vector threadFinishPromises_; }; /// Listener invoked on task completion. diff --git a/velox/exec/TaskStructs.h b/velox/exec/TaskStructs.h index f9d586ec0b54..f2890dedeea2 100644 --- a/velox/exec/TaskStructs.h +++ b/velox/exec/TaskStructs.h @@ -34,7 +34,7 @@ enum TaskState { kRunning, kFinished, kCanceled, kAborted, kFailed }; struct BarrierState { int32_t numRequested; std::vector> drivers; - std::vector> promises; + std::vector promises; }; /// Structure to accumulate splits for distribution. @@ -44,7 +44,7 @@ struct SplitsStore { /// Signal, that no more splits will arrive. bool noMoreSplits{false}; /// Blocking promises given out when out of splits to distribute. - std::vector> splitPromises; + std::vector splitPromises; }; /// Structure contains the current info on splits for a particular plan node. diff --git a/velox/exec/tests/utils/Cursor.cpp b/velox/exec/tests/utils/Cursor.cpp index 6f1fbd3ba718..1d85aa6f05f3 100644 --- a/velox/exec/tests/utils/Cursor.cpp +++ b/velox/exec/tests/utils/Cursor.cpp @@ -57,7 +57,7 @@ exec::BlockingReason TaskQueue::enqueue( RowVectorPtr TaskQueue::dequeue() { for (;;) { RowVectorPtr vector; - std::vector> mayContinue; + std::vector mayContinue; { std::lock_guard l(mutex_); if (!queue_.empty()) { @@ -74,7 +74,7 @@ RowVectorPtr TaskQueue::dequeue() { } if (!vector) { consumerBlocked_ = true; - consumerPromise_ = VeloxPromise(); + consumerPromise_ = ContinuePromise(); consumerFuture_ = consumerPromise_.getFuture(); } } diff --git a/velox/exec/tests/utils/Cursor.h b/velox/exec/tests/utils/Cursor.h index c4eb00274654..5920b1de9910 100644 --- a/velox/exec/tests/utils/Cursor.h +++ b/velox/exec/tests/utils/Cursor.h @@ -92,9 +92,9 @@ class TaskQueue { // adding the result. uint64_t maxBytes_; std::mutex mutex_; - std::vector> producerUnblockPromises_; + std::vector producerUnblockPromises_; bool consumerBlocked_ = false; - VeloxPromise consumerPromise_; + ContinuePromise consumerPromise_; folly::Future consumerFuture_; bool closed_ = false; }; From 8b84b39d66677691deed3be439db3eceac947a53 Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Thu, 14 Apr 2022 03:14:05 -0700 Subject: [PATCH 010/198] Fix Parquet reader (#1423) Summary: Parquet reader didn't use ScanSpec to figure out the output layout. In case of q18 this resulted in the reader returning columns in the wrong order, i.e. (custkey, orderkey) instead of (orderkey, custkey). Also, added support for kBigintValuesUsingHashTable filter, a contribution from Ge (gggrace14). Pull Request resolved: https://github.com/facebookincubator/velox/pull/1423 Reviewed By: gggrace14 Differential Revision: D35635866 Pulled By: mbasmanova fbshipit-source-id: ba6306a7539b2d5bbaa4b197f6f1b893ed8a0d0f --- velox/dwio/parquet/reader/ParquetReader.cpp | 40 +++++++++++++--- velox/dwio/parquet/reader/ParquetReader.h | 1 + .../dwio/parquet/tests/ParquetReaderTest.cpp | 46 +++++++++++++++---- velox/dwio/parquet/tests/ParquetTpchTest.cpp | 5 +- velox/type/Filter.h | 4 ++ 5 files changed, 75 insertions(+), 21 deletions(-) diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index eed32dd716ad..8b43a96eb076 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -140,14 +140,29 @@ void toDuckDbFilter( } break; } + case common::FilterKind::kBigintValuesUsingHashTable: { + auto valuesFilter = + static_cast(filter); + const auto& values = valuesFilter->values(); + if (values.size() == 1) { + filters.PushFilter( + colIdx, constantEqualFilter(makeValue(type, *values.begin()))); + } else { + auto duckFilter = std::make_unique<::duckdb::ConjunctionOrFilter>(); + for (const auto& value : values) { + duckFilter->child_filters.push_back( + constantEqualFilter(makeValue(type, value))); + } + filters.PushFilter(colIdx, std::move(duckFilter)); + } + break; + } case common::FilterKind::kAlwaysFalse: case common::FilterKind::kAlwaysTrue: case common::FilterKind::kIsNull: case common::FilterKind::kIsNotNull: case common::FilterKind::kBoolValue: - case common::FilterKind::kBigintValuesUsingHashTable: - case common::FilterKind::kBigintValuesUsingBitmask: case common::FilterKind::kFloatRange: case common::FilterKind::kBigintMultiRange: case common::FilterKind::kMultiRange: @@ -180,7 +195,9 @@ ParquetRowReader::ParquetRowReader( std::shared_ptr<::duckdb::ParquetReader> reader, const dwio::common::RowReaderOptions& options, memory::MemoryPool& pool) - : reader_(std::move(reader)), pool_(pool) { + : reader_(std::move(reader)), + pool_(pool), + scanSpec_{options.getScanSpec()} { auto& selector = *options.getSelector(); rowType_ = selector.buildSelectedReordered(); duckdbRowType_.reserve(rowType_->size()); @@ -233,10 +250,19 @@ uint64_t ParquetRowReader::next(uint64_t /*size*/, velox::VectorPtr& result) { if (output.size() > 0) { std::vector columns; - columns.reserve(output.data.size()); - for (int i = 0; i < output.data.size(); i++) { - columns.emplace_back(duckdb::toVeloxVector( - output.size(), output.data[i], rowType_->childAt(i), &pool_)); + columns.resize(output.data.size()); + for (auto& spec : scanSpec_->children()) { + if (spec->isConstant()) { + columns[spec->channel()] = + BaseVector::wrapInConstant(output.size(), 0, spec->constantValue()); + } else if (spec->projectOut()) { + auto index = rowType_->getChildIdx(spec->fieldName()); + columns[spec->channel()] = duckdb::toVeloxVector( + output.size(), + output.data[index], + rowType_->childAt(index), + &pool_); + } } result = std::make_shared( diff --git a/velox/dwio/parquet/reader/ParquetReader.h b/velox/dwio/parquet/reader/ParquetReader.h index 68556f79bd19..872ecc05d51a 100644 --- a/velox/dwio/parquet/reader/ParquetReader.h +++ b/velox/dwio/parquet/reader/ParquetReader.h @@ -51,6 +51,7 @@ class ParquetRowReader : public dwio::common::RowReader { memory::MemoryPool& pool_; RowTypePtr rowType_; std::vector<::duckdb::LogicalType> duckdbRowType_; + velox::common::ScanSpec* scanSpec_; }; class ParquetReader : public dwio::common::Reader { diff --git a/velox/dwio/parquet/tests/ParquetReaderTest.cpp b/velox/dwio/parquet/tests/ParquetReaderTest.cpp index b6f2b4c55802..4e823b8f77d3 100644 --- a/velox/dwio/parquet/tests/ParquetReaderTest.cpp +++ b/velox/dwio/parquet/tests/ParquetReaderTest.cpp @@ -41,6 +41,7 @@ class ParquetReaderTest : public testing::Test { RowReaderOptions rowReaderOpts; rowReaderOpts.select( std::make_shared(rowType, rowType->names())); + return rowReaderOpts; } @@ -98,6 +99,19 @@ class ParquetReaderTest : public testing::Test { EXPECT_EQ(reader.next(1000, result), 0); } + std::unique_ptr makeScanSpec(const RowTypePtr& rowType) { + auto scanSpec = std::make_unique(""); + + for (auto i = 0; i < rowType->size(); ++i) { + auto child = + scanSpec->getOrCreateChild(common::Subfield(rowType->nameOf(i))); + child->setProjectOut(true); + child->setChannel(i); + } + + return scanSpec; + } + using FilterMap = std::unordered_map>; @@ -112,14 +126,14 @@ class ParquetReaderTest : public testing::Test { ParquetReader reader( std::make_unique(filePath), readerOptions); - common::ScanSpec scanSpec(""); + auto scanSpec = makeScanSpec(fileSchema); for (auto&& [column, filter] : filters) { - scanSpec.getOrCreateChild(common::Subfield(column)) + scanSpec->getOrCreateChild(common::Subfield(column)) ->setFilter(std::move(filter)); } - RowReaderOptions rowReaderOpts = getReaderOpts(fileSchema); - rowReaderOpts.setScanSpec(&scanSpec); + auto rowReaderOpts = getReaderOpts(fileSchema); + rowReaderOpts.setScanSpec(scanSpec.get()); auto rowReader = reader.createRowReader(rowReaderOpts); assertReadExpected(*rowReader, expected); } @@ -159,7 +173,9 @@ TEST_F(ParquetReaderTest, readSampleFull) { EXPECT_EQ(type->childByName("a"), col0); EXPECT_EQ(type->childByName("b"), col1); - RowReaderOptions rowReaderOpts = getReaderOpts(sampleSchema()); + auto rowReaderOpts = getReaderOpts(sampleSchema()); + auto scanSpec = makeScanSpec(sampleSchema()); + rowReaderOpts.setScanSpec(scanSpec.get()); auto rowReader = reader.createRowReader(rowReaderOpts); auto expected = vectorMaker_->rowVector( {rangeVector(20, 1), rangeVector(20, 1)}); @@ -173,7 +189,9 @@ TEST_F(ParquetReaderTest, readSampleRange1) { ParquetReader reader( std::make_unique(sample), readerOptions); - RowReaderOptions rowReaderOpts = getReaderOpts(sampleSchema()); + auto rowReaderOpts = getReaderOpts(sampleSchema()); + auto scanSpec = makeScanSpec(sampleSchema()); + rowReaderOpts.setScanSpec(scanSpec.get()); rowReaderOpts.range(0, 200); auto rowReader = reader.createRowReader(rowReaderOpts); auto expected = vectorMaker_->rowVector( @@ -188,7 +206,9 @@ TEST_F(ParquetReaderTest, readSampleRange2) { ParquetReader reader( std::make_unique(sample), readerOptions); - RowReaderOptions rowReaderOpts = getReaderOpts(sampleSchema()); + auto rowReaderOpts = getReaderOpts(sampleSchema()); + auto scanSpec = makeScanSpec(sampleSchema()); + rowReaderOpts.setScanSpec(scanSpec.get()); rowReaderOpts.range(200, 500); auto rowReader = reader.createRowReader(rowReaderOpts); auto expected = vectorMaker_->rowVector( @@ -203,7 +223,9 @@ TEST_F(ParquetReaderTest, readSampleEmptyRange) { ParquetReader reader( std::make_unique(sample), readerOptions); - RowReaderOptions rowReaderOpts = getReaderOpts(sampleSchema()); + auto rowReaderOpts = getReaderOpts(sampleSchema()); + auto scanSpec = makeScanSpec(sampleSchema()); + rowReaderOpts.setScanSpec(scanSpec.get()); rowReaderOpts.range(300, 10); auto rowReader = reader.createRowReader(rowReaderOpts); @@ -253,7 +275,9 @@ TEST_F(ParquetReaderTest, dateRead) { auto col0 = type->childAt(0); EXPECT_EQ(col0->type->kind(), TypeKind::DATE); - RowReaderOptions rowReaderOpts = getReaderOpts(dateSchema()); + auto rowReaderOpts = getReaderOpts(dateSchema()); + auto scanSpec = makeScanSpec(dateSchema()); + rowReaderOpts.setScanSpec(scanSpec.get()); auto rowReader = reader.createRowReader(rowReaderOpts); auto expected = vectorMaker_->rowVector({rangeVector(25, -5)}); @@ -292,7 +316,9 @@ TEST_F(ParquetReaderTest, intRead) { auto col1 = type->childAt(1); EXPECT_EQ(col1->type->kind(), TypeKind::BIGINT); - RowReaderOptions rowReaderOpts = getReaderOpts(intSchema()); + auto rowReaderOpts = getReaderOpts(intSchema()); + auto scanSpec = makeScanSpec(intSchema()); + rowReaderOpts.setScanSpec(scanSpec.get()); auto rowReader = reader.createRowReader(rowReaderOpts); auto expected = vectorMaker_->rowVector( diff --git a/velox/dwio/parquet/tests/ParquetTpchTest.cpp b/velox/dwio/parquet/tests/ParquetTpchTest.cpp index a63b2c7166da..8ae993ef1198 100644 --- a/velox/dwio/parquet/tests/ParquetTpchTest.cpp +++ b/velox/dwio/parquet/tests/ParquetTpchTest.cpp @@ -161,10 +161,7 @@ TEST_F(ParquetTpchTest, Q6) { assertQuery(6, 2, 10); } -// This test started to fail after dynamic filters -// are always pushed that was introduced in -// https://github.com/facebookincubator/velox/pull/1314 -TEST_F(ParquetTpchTest, DISABLED_Q18) { +TEST_F(ParquetTpchTest, Q18) { assertQuery(18, 5, 30); } diff --git a/velox/type/Filter.h b/velox/type/Filter.h index febafb61c007..a174f2b3ad70 100644 --- a/velox/type/Filter.h +++ b/velox/type/Filter.h @@ -683,6 +683,10 @@ class BigintValuesUsingHashTable final : public Filter { return max_; } + const std::vector& values() const { + return values_; + } + std::string toString() const final { return fmt::format( "BigintValuesUsingHashTable: [{}, {}] {}", From ae8a8d33a0a52122556204f4584e6d68b5cff706 Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Thu, 14 Apr 2022 09:03:14 -0700 Subject: [PATCH 011/198] Improve PlanBuilder::tableScan (#1414) Summary: Add a version of PlanBuilder::tableScan() that allows to specify table name and column aliases in addition to SQL expressions for range and remaining filters. This allows to write TPC-H query plans more easily. Here is an example from q6: ``` .tableScan( kLineitem, selectedRowType, fileColumnNames, {"l_discount between 0.05 and 0.07", "l_quantity < 24.0"}) ``` Pull Request resolved: https://github.com/facebookincubator/velox/pull/1414 Reviewed By: kgpai Differential Revision: D35628758 Pulled By: mbasmanova fbshipit-source-id: 183fcbd836dbd505d5aee74b14c9c0a695adcefc --- velox/exec/tests/TableScanTest.cpp | 19 +-- velox/exec/tests/utils/PlanBuilder.cpp | 42 ++++++- velox/exec/tests/utils/PlanBuilder.h | 22 ++++ velox/exec/tests/utils/TpchQueryBuilder.cpp | 124 +++++--------------- 4 files changed, 97 insertions(+), 110 deletions(-) diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index 2ab0461c6ac1..d0b1a9c65f14 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -220,30 +220,21 @@ TEST_P(TableScanTest, columnAliases) { writeToFile(filePath->path, vectors); createDuckDbTable(vectors); - ColumnHandleMap assignments = {{"a", regularColumn("c0", BIGINT())}}; + std::string tableName = "t"; + std::unordered_map aliases = {{"a", "c0"}}; auto outputType = ROW({"a"}, {BIGINT()}); - auto op = PlanBuilder() - .tableScan( - outputType, makeTableHandle(SubfieldFilters{}), assignments) - .planNode(); + auto op = PlanBuilder().tableScan(tableName, outputType, aliases).planNode(); assertQuery(op, {filePath}, "SELECT c0 FROM tmp"); // Use aliased column in a range filter. - auto filters = singleSubfieldFilter("c0", lessThanOrEqual(10)); - op = PlanBuilder() - .tableScan( - outputType, makeTableHandle(std::move(filters)), assignments) + .tableScan(tableName, outputType, aliases, {"a < 10"}) .planNode(); assertQuery(op, {filePath}, "SELECT c0 FROM tmp WHERE c0 <= 10"); // Use aliased column in remaining filter. op = PlanBuilder() - .tableScan( - outputType, - makeTableHandle( - SubfieldFilters{}, parseExpr("c0 % 2 = 1", rowType_)), - assignments) + .tableScan(tableName, outputType, aliases, {}, "a % 2 = 1") .planNode(); assertQuery(op, {filePath}, "SELECT c0 FROM tmp WHERE c0 % 2 = 1"); } diff --git a/velox/exec/tests/utils/PlanBuilder.cpp b/velox/exec/tests/utils/PlanBuilder.cpp index 73fc87a33b51..2dde679c8b27 100644 --- a/velox/exec/tests/utils/PlanBuilder.cpp +++ b/velox/exec/tests/utils/PlanBuilder.cpp @@ -143,6 +143,8 @@ std::unique_ptr makeLessThanOrEqualFilter( return common::test::lessThanOrEqualFloat(singleValue(upper)); case TypeKind::VARCHAR: return common::test::lessThanOrEqual(singleValue(upper)); + case TypeKind::DATE: + return common::test::lessThanOrEqual(singleValue(upper).days()); default: VELOX_NYI( "Unsupported value for less than or equals filter: {} <= {}", @@ -166,6 +168,8 @@ std::unique_ptr makeLessThanFilter( return common::test::lessThanFloat(singleValue(upper)); case TypeKind::VARCHAR: return common::test::lessThan(singleValue(upper)); + case TypeKind::DATE: + return common::test::lessThan(singleValue(upper).days()); default: VELOX_NYI( "Unsupported value for less than filter: {} <= {}", @@ -189,6 +193,8 @@ std::unique_ptr makeGreaterThanOrEqualFilter( return common::test::greaterThanOrEqualFloat(singleValue(lower)); case TypeKind::VARCHAR: return common::test::greaterThanOrEqual(singleValue(lower)); + case TypeKind::DATE: + return common::test::greaterThanOrEqual(singleValue(lower).days()); default: VELOX_NYI( "Unsupported value for greater than or equals filter: {} >= {}", @@ -212,6 +218,8 @@ std::unique_ptr makeGreaterThanFilter( return common::test::greaterThanFloat(singleValue(lower)); case TypeKind::VARCHAR: return common::test::greaterThan(singleValue(lower)); + case TypeKind::DATE: + return common::test::greaterThan(singleValue(lower).days()); default: VELOX_NYI( "Unsupported value for greater than filter: {} > {}", @@ -233,6 +241,8 @@ std::unique_ptr makeEqualFilter( return common::test::equal(singleValue(value)); case TypeKind::VARCHAR: return common::test::equal(singleValue(value)); + case TypeKind::DATE: + return common::test::equal(singleValue(value).days()); default: VELOX_NYI( "Unsupported value for equals filter: {} = {}", @@ -257,6 +267,9 @@ std::unique_ptr makeBetweenFilter( case TypeKind::REAL: return common::test::betweenFloat( singleValue(lower), singleValue(upper)); + case TypeKind::DATE: + return common::test::between( + singleValue(lower).days(), singleValue(upper).days()); default: VELOX_NYI( "Unsupported value for 'between' filter: {} BETWEEN {} AND {}", @@ -332,15 +345,32 @@ PlanBuilder& PlanBuilder::tableScan( const RowTypePtr& outputType, const std::vector& subfieldFilters, const std::string& remainingFilter) { + return tableScan( + "hive_table", outputType, {}, subfieldFilters, remainingFilter); +} + +PlanBuilder& PlanBuilder::tableScan( + const std::string& tableName, + const RowTypePtr& outputType, + const std::unordered_map& columnAliases, + const std::vector& subfieldFilters, + const std::string& remainingFilter) { std::unordered_map> assignments; for (uint32_t i = 0; i < outputType->size(); ++i) { const auto& name = outputType->nameOf(i); const auto& type = outputType->childAt(i); + + std::string hiveColumnName = name; + auto it = columnAliases.find(name); + if (it != columnAliases.end()) { + hiveColumnName = it->second; + } + assignments.insert( {name, std::make_shared( - name, HiveColumnHandle::ColumnType::kRegular, type)}); + hiveColumnName, HiveColumnHandle::ColumnType::kRegular, type)}); } SubfieldFilters filters; @@ -349,16 +379,22 @@ PlanBuilder& PlanBuilder::tableScan( auto filterExpr = parseExpr(filter, outputType, pool_); auto [subfield, subfieldFilter] = toSubfieldFilter(filterExpr); + auto it = columnAliases.find(subfield.toString()); + if (it != columnAliases.end()) { + subfield = common::Subfield(it->second); + } + filters[std::move(subfield)] = std::move(subfieldFilter); } std::shared_ptr remainingFilterExpr; if (!remainingFilter.empty()) { - remainingFilterExpr = parseExpr(remainingFilter, outputType, pool_); + remainingFilterExpr = parseExpr(remainingFilter, outputType, pool_) + ->rewriteInputNames(columnAliases); } auto tableHandle = std::make_shared( - "hive_table", true, std::move(filters), remainingFilterExpr); + tableName, true, std::move(filters), remainingFilterExpr); return tableScan(outputType, tableHandle, assignments); } diff --git a/velox/exec/tests/utils/PlanBuilder.h b/velox/exec/tests/utils/PlanBuilder.h index 921552e022dd..a7344f3386ae 100644 --- a/velox/exec/tests/utils/PlanBuilder.h +++ b/velox/exec/tests/utils/PlanBuilder.h @@ -69,6 +69,28 @@ class PlanBuilder { const std::vector& subfieldFilters = {}, const std::string& remainingFilter = ""); + /// Add a TableScanNode to scan a Hive table. + /// + /// @param tableName The name of the table to scan. + /// @param outputType List of column names and types to read from the table. + /// @param columnAliases Optional aliases for the column names. The key is the + /// alias (name in 'outputType'), value is the name in the files. + /// @param subfieldFilters A list of SQL expressions for the range filters to + /// apply to individual columns. Should use column name aliases, not column + /// names in the files. Supported filters are: column <= value, column < + /// value, column >= value, column > value, column = value, column IN (v1, + /// v2,.. vN), column < v1 OR column >= v2. + /// @param remainingFilter SQL expression for the additional conjunct. May + /// include multiple columns and SQL functions. Should use column name + /// aliases, not column names in the files. The remainingFilter is AND'ed + /// with all the subfieldFilters. + PlanBuilder& tableScan( + const std::string& tableName, + const RowTypePtr& outputType, + const std::unordered_map& columnAliases = {}, + const std::vector& subfieldFilters = {}, + const std::string& remainingFilter = ""); + PlanBuilder& tableScan( const RowTypePtr& outputType, const std::shared_ptr& tableHandle, diff --git a/velox/exec/tests/utils/TpchQueryBuilder.cpp b/velox/exec/tests/utils/TpchQueryBuilder.cpp index 8fba3ee1d7e2..169674432028 100644 --- a/velox/exec/tests/utils/TpchQueryBuilder.cpp +++ b/velox/exec/tests/utils/TpchQueryBuilder.cpp @@ -16,8 +16,7 @@ #include "velox/exec/tests/utils/TpchQueryBuilder.h" #include "velox/common/base/tests/Fs.h" #include "velox/connectors/hive/HiveConnector.h" -#include "velox/type/tests/FilterBuilder.h" -#include "velox/type/tests/SubfieldFiltersBuilder.h" + namespace facebook::velox::exec::test { static int64_t toDate(std::string_view stringDate) { @@ -88,39 +87,6 @@ TpchPlan TpchQueryBuilder::getQueryPlan(int queryId) const { } } -namespace { -using ColumnHandleMap = - std::unordered_map>; - -std::shared_ptr makeTableHandle( - const std::string& tableName, - common::test::SubfieldFilters subfieldFilters, - const std::shared_ptr& remainingFilter = nullptr) { - return std::make_shared( - tableName, true, std::move(subfieldFilters), remainingFilter); -} - -std::shared_ptr regularColumn( - const std::string& name, - const TypePtr& type) { - return std::make_shared( - name, connector::hive::HiveColumnHandle::ColumnType::kRegular, type); -} - -ColumnHandleMap allRegularColumns( - const std::shared_ptr& rowType, - const std::unordered_map& fileColumnNames) { - ColumnHandleMap assignments; - assignments.reserve(rowType->size()); - for (uint32_t i = 0; i < rowType->size(); ++i) { - const auto& name = rowType->nameOf(i); - assignments[name] = - regularColumn(fileColumnNames.at(name), rowType->childAt(i)); - } - return assignments; -} -} // namespace - TpchPlan TpchQueryBuilder::getQ1Plan() const { static const std::string kLineitem = "lineitem"; std::vector selectedColumns = { @@ -137,27 +103,20 @@ TpchPlan TpchQueryBuilder::getQ1Plan() const { // shipdate <= '1998-09-02' const auto shipDate = "l_shipdate"; - common::test::SubfieldFiltersBuilder filtersBuilder; + std::string filter; // DWRF does not support Date type. Use Varchar instead. if (selectedRowType->findChild(shipDate)->isVarchar()) { - filtersBuilder.add( - fileColumnNames.at(shipDate), - common::test::lessThanOrEqual("1998-09-02")); + filter = "l_shipdate <= '1998-09-02'"; } else { - filtersBuilder.add( - fileColumnNames.at(shipDate), - common::test::lessThanOrEqual(toDate("1998-09-02"))); + filter = "l_shipdate <= '1998-09-02'::DATE"; } - auto filters = filtersBuilder.build(); + auto planNodeIdGenerator = std::make_shared(); core::PlanNodeId lineitemPlanNodeId; const auto partialAggStage = PlanBuilder(planNodeIdGenerator) - .tableScan( - selectedRowType, - makeTableHandle(kLineitem, std::move(filters)), - allRegularColumns(selectedRowType, fileColumnNames)) + .tableScan(kLineitem, selectedRowType, fileColumnNames, {filter}) .capturePlanNodeId(lineitemPlanNodeId) .project( {"l_returnflag", @@ -218,43 +177,34 @@ TpchPlan TpchQueryBuilder::getQ6Plan() const { const auto& fileColumnNames = getFileColumnNames(kLineitem); const auto shipDate = "l_shipdate"; - common::test::SubfieldFiltersBuilder filtersBuilder; + std::string shipDateFilter; // DWRF does not support Date type. Use Varchar instead. if (selectedRowType->findChild(shipDate)->isVarchar()) { - filtersBuilder.add( - fileColumnNames.at(shipDate), - common::test::between("1994-01-01", "1994-12-31")); + shipDateFilter = "l_shipdate between '1994-01-01' and '1994-12-31'"; } else { - filtersBuilder.add( - fileColumnNames.at(shipDate), - common::test::between(toDate("1994-01-01"), toDate("1994-12-31"))); + shipDateFilter = + "l_shipdate between '1994-01-01'::DATE and '1994-12-31'::DATE"; } - auto filters = filtersBuilder - .add( - fileColumnNames.at("l_discount"), - common::test::betweenDouble(0.05, 0.07)) - .add( - fileColumnNames.at("l_quantity"), - common::test::lessThanDouble(24.0)) - .build(); auto planNodeIdGenerator = std::make_shared(); core::PlanNodeId lineitemPlanNodeId; - auto plan = - PlanBuilder(planNodeIdGenerator) - .localPartition( - {}, - {PlanBuilder(planNodeIdGenerator) - .tableScan( - selectedRowType, - makeTableHandle(kLineitem, std::move(filters)), - allRegularColumns(selectedRowType, fileColumnNames)) - .capturePlanNodeId(lineitemPlanNodeId) - .project({"l_extendedprice * l_discount"}) - .partialAggregation({}, {"sum(p0)"}) - .planNode()}) - .finalAggregation({}, {"sum(a0)"}, {DOUBLE()}) - .planNode(); + auto plan = PlanBuilder(planNodeIdGenerator) + .localPartition( + {}, + {PlanBuilder(planNodeIdGenerator) + .tableScan( + kLineitem, + selectedRowType, + fileColumnNames, + {shipDateFilter, + "l_discount between 0.05 and 0.07", + "l_quantity < 24.0"}) + .capturePlanNodeId(lineitemPlanNodeId) + .project({"l_extendedprice * l_discount"}) + .partialAggregation({}, {"sum(p0)"}) + .planNode()}) + .finalAggregation({}, {"sum(a0)"}, {DOUBLE()}) + .planNode(); TpchPlan context; context.plan = std::move(plan); context.dataFiles[lineitemPlanNodeId] = getTableFilePaths(kLineitem); @@ -291,11 +241,7 @@ TpchPlan TpchQueryBuilder::getQ18Plan() const { {"l_orderkey"}, {PlanBuilder(planNodeIdGenerator) .tableScan( - lineitemSelectedRowType, - makeTableHandle( - kLineitem, common::test::SubfieldFilters{}), - allRegularColumns( - lineitemSelectedRowType, lineitemFileColumns)) + kLineitem, lineitemSelectedRowType, lineitemFileColumns) .capturePlanNodeId(lineitemScanNodeId) .partialAggregation({0}, {"sum(l_quantity) AS partial_sum"}) .planNode()}) @@ -308,12 +254,7 @@ TpchPlan TpchQueryBuilder::getQ18Plan() const { .localPartition( {}, {PlanBuilder(planNodeIdGenerator) - .tableScan( - ordersSelectedRowType, - makeTableHandle( - kOrders, common::test::SubfieldFilters{}), - allRegularColumns( - ordersSelectedRowType, ordersFileColumns)) + .tableScan(kOrders, ordersSelectedRowType, ordersFileColumns) .capturePlanNodeId(ordersScanNodeId) .hashJoin( {"o_orderkey"}, @@ -331,12 +272,9 @@ TpchPlan TpchQueryBuilder::getQ18Plan() const { {"c_custkey"}, PlanBuilder(planNodeIdGenerator) .tableScan( + kCustomer, customerSelectedRowType, - makeTableHandle( - kCustomer, common::test::SubfieldFilters{}), - allRegularColumns( - customerSelectedRowType, - customerFileColumns)) + customerFileColumns) .capturePlanNodeId(customerScanNodeId) .planNode(), "", From 69bdb8094c17d42bfc9edfc511f58e9ab298379d Mon Sep 17 00:00:00 2001 From: Sergey Pershin Date: Thu, 14 Apr 2022 09:24:48 -0700 Subject: [PATCH 012/198] Support non-Flat vector for intermediate array_agg, map_agg, checksum (#1424) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1424 Some queries might have local exchanges before INTERMEDIATE aggregation. The local exchanges produce dictionaries, so we need to expect these in addSingleGroupIntermediateResults() of the array_agg, map_agg and checksum aggregations. Reviewed By: mbasmanova Differential Revision: D35636100 fbshipit-source-id: 2c18bf01faba407e4070feecaf7acd0c904c286a --- velox/exec/tests/utils/QueryAssertions.cpp | 4 +- velox/exec/tests/utils/QueryAssertions.h | 3 +- .../aggregates/ArrayAggAggregate.cpp | 12 +- .../aggregates/ChecksumAggregate.cpp | 12 +- .../prestosql/aggregates/MapAggAggregate.cpp | 11 +- .../aggregates/tests/ArrayAggTest.cpp | 19 ++ .../tests/ChecksumAggregateTest.cpp | 126 +++++++------ .../aggregates/tests/CountAggregationTest.cpp | 166 ++++++++---------- .../prestosql/aggregates/tests/MapAggTest.cpp | 18 ++ 9 files changed, 198 insertions(+), 173 deletions(-) diff --git a/velox/exec/tests/utils/QueryAssertions.cpp b/velox/exec/tests/utils/QueryAssertions.cpp index e0e2072c776e..db3b7b6a8530 100644 --- a/velox/exec/tests/utils/QueryAssertions.cpp +++ b/velox/exec/tests/utils/QueryAssertions.cpp @@ -787,9 +787,11 @@ std::shared_ptr assertQuery( } velox::variant readSingleValue( - const std::shared_ptr& plan) { + const std::shared_ptr& plan, + int32_t maxDrivers) { CursorParameters params; params.planNode = plan; + params.maxDrivers = maxDrivers; auto result = readCursor(params, [](Task*) {}); EXPECT_EQ(1, result.second.size()); diff --git a/velox/exec/tests/utils/QueryAssertions.h b/velox/exec/tests/utils/QueryAssertions.h index d729fb21cbe2..84dc72b76aa9 100644 --- a/velox/exec/tests/utils/QueryAssertions.h +++ b/velox/exec/tests/utils/QueryAssertions.h @@ -164,7 +164,8 @@ std::shared_ptr assertQuery( const std::vector& expectedResults); velox::variant readSingleValue( - const std::shared_ptr& plan); + const std::shared_ptr& plan, + int32_t maxDrivers = 1); void assertEqualResults( const std::vector& expected, diff --git a/velox/functions/prestosql/aggregates/ArrayAggAggregate.cpp b/velox/functions/prestosql/aggregates/ArrayAggAggregate.cpp index 4c21f9d3a6ad..c0406079afab 100644 --- a/velox/functions/prestosql/aggregates/ArrayAggAggregate.cpp +++ b/velox/functions/prestosql/aggregates/ArrayAggAggregate.cpp @@ -140,17 +140,17 @@ class ArrayAggAggregate : public exec::Aggregate { const SelectivityVector& rows, const std::vector& args, bool /* mayPushdown */) override { - auto& values = value(group)->elements; + decodedIntermediate_.decode(*args[0], rows); + auto arrayVector = decodedIntermediate_.base()->as(); - VELOX_CHECK_EQ(args[0]->encoding(), VectorEncoding::Simple::ARRAY); - auto arrayVector = args[0]->as(); + auto& values = value(group)->elements; auto elements = arrayVector->elements(); - auto tracker = trackRowSize(group); rows.applyToSelected([&](vector_size_t row) { + auto decodedRow = decodedIntermediate_.index(row); values.appendRange( elements, - arrayVector->offsetAt(row), - arrayVector->sizeAt(row), + arrayVector->offsetAt(decodedRow), + arrayVector->sizeAt(decodedRow), allocator_); }); } diff --git a/velox/functions/prestosql/aggregates/ChecksumAggregate.cpp b/velox/functions/prestosql/aggregates/ChecksumAggregate.cpp index 6b2b63fba273..8a8c323ea34e 100644 --- a/velox/functions/prestosql/aggregates/ChecksumAggregate.cpp +++ b/velox/functions/prestosql/aggregates/ChecksumAggregate.cpp @@ -159,15 +159,13 @@ class ChecksumAggregate : public exec::Aggregate { const SelectivityVector& rows, const std::vector& args, bool /*mayPushDown*/) override { - const auto& arg = args[0]; - auto vector = arg->asUnchecked>(); - VELOX_CHECK(vector); - auto rawValues = vector->rawValues(); + decodedIntermediate_.decode(*args[0], rows); + int64_t result = 0; bool clearGroupNull = false; - rows.applyToSelected([&](vector_size_t i) { - if (!vector->isNullAt(i)) { - safeAdd(result, rawValues[i]); + rows.applyToSelected([&](vector_size_t row) { + if (!decodedIntermediate_.isNullAt(row)) { + safeAdd(result, decodedIntermediate_.valueAt(row)); clearGroupNull = true; } }); diff --git a/velox/functions/prestosql/aggregates/MapAggAggregate.cpp b/velox/functions/prestosql/aggregates/MapAggAggregate.cpp index 6813f0d90d54..c259186c6a83 100644 --- a/velox/functions/prestosql/aggregates/MapAggAggregate.cpp +++ b/velox/functions/prestosql/aggregates/MapAggAggregate.cpp @@ -172,18 +172,19 @@ class MapAggAggregate : public exec::Aggregate { const SelectivityVector& rows, const std::vector& args, bool /* mayPushdown */) override { + decodedIntermediate_.decode(*args[0], rows); + auto accumulator = value(group); + auto mapVector = decodedIntermediate_.base()->as(); auto& keys = accumulator->keys; auto& values = accumulator->values; - VELOX_CHECK_EQ(args[0]->encoding(), VectorEncoding::Simple::MAP); - auto mapVector = args[0]->as(); auto& mapKeys = mapVector->mapKeys(); auto& mapValues = mapVector->mapValues(); - auto tracker = trackRowSize(group); rows.applyToSelected([&](vector_size_t row) { - auto offset = mapVector->offsetAt(row); - auto size = mapVector->sizeAt(row); + auto decodedRow = decodedIntermediate_.index(row); + auto offset = mapVector->offsetAt(decodedRow); + auto size = mapVector->sizeAt(decodedRow); keys.appendRange(mapKeys, offset, size, allocator_); values.appendRange(mapValues, offset, size, allocator_); }); diff --git a/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp b/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp index 45f1c471ea16..043e50022943 100644 --- a/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/ArrayAggTest.cpp @@ -119,6 +119,25 @@ TEST_F(ArrayAggTest, global) { } } ASSERT_EQ(velox::variant::array(expected), value); + + // Add local exchange before intermediate aggregation. Expect the same result. + auto planNodeIdGenerator = std::make_shared(); + op = PlanBuilder(planNodeIdGenerator) + .localPartition( + {}, + {PlanBuilder(planNodeIdGenerator) + .localPartitionRoundRobin( + {PlanBuilder(planNodeIdGenerator) + .values(vectors) + .partialAggregation({}, {"array_agg(c0)"}) + .planNode()}) + .intermediateAggregation() + .planNode()}) + .finalAggregation() + .planNode(); + + value = readSingleValue(op, 2); + ASSERT_EQ(velox::variant::array(expected), value); } TEST_F(ArrayAggTest, globalNoData) { diff --git a/velox/functions/prestosql/aggregates/tests/ChecksumAggregateTest.cpp b/velox/functions/prestosql/aggregates/tests/ChecksumAggregateTest.cpp index 5bf0b565e9f7..1278f8f85228 100644 --- a/velox/functions/prestosql/aggregates/tests/ChecksumAggregateTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/ChecksumAggregateTest.cpp @@ -43,44 +43,57 @@ class ChecksumAggregateTest : public AggregationTestBase { VectorPtr inputVector, const std::string& expectedChecksum) { auto rowVectors = std::vector{makeRowVector({inputVector})}; + const auto expectedDuckDbSql = + fmt::format("VALUES (CAST(\'{}\' AS VARCHAR))", expectedChecksum); // DuckDB doesnt have checksum aggregation, so we will just pass in // expected values to compare. - { - auto agg = PlanBuilder() - .values(rowVectors) - .partialAggregation({}, {"checksum(c0)"}) - .finalAggregation() - .project({"to_base64(a0) as c0"}) - .planNode(); - assertQuery( - agg, - fmt::format("VALUES (CAST(\'{}\' AS VARCHAR))", expectedChecksum)); - } - - { - auto agg = PlanBuilder() - .values(rowVectors) - .singleAggregation({}, {"checksum(c0)"}) - .project({"to_base64(a0) as c0"}) - .planNode(); - assertQuery( - agg, - fmt::format("VALUES (CAST(\'{}\' AS VARCHAR))", expectedChecksum)); - } + auto agg = PlanBuilder() + .values(rowVectors) + .partialAggregation({}, {"checksum(c0)"}) + .finalAggregation() + .project({"to_base64(a0) as c0"}) + .planNode(); + assertQuery(agg, expectedDuckDbSql); + + agg = PlanBuilder() + .values(rowVectors) + .singleAggregation({}, {"checksum(c0)"}) + .project({"to_base64(a0) as c0"}) + .planNode(); + assertQuery(agg, expectedDuckDbSql); + + agg = PlanBuilder() + .values(rowVectors) + .partialAggregation({}, {"checksum(c0)"}) + .intermediateAggregation() + .finalAggregation() + .project({"to_base64(a0) as c0"}) + .planNode(); + assertQuery(agg, expectedDuckDbSql); - { - auto agg = PlanBuilder() - .values(rowVectors) - .partialAggregation({}, {"checksum(c0)"}) + // Add local exchange before intermediate aggregation. + auto planNodeIdGenerator = std::make_shared(); + CursorParameters params; + params.planNode = + PlanBuilder(planNodeIdGenerator) + .localPartition( + {}, + {PlanBuilder(planNodeIdGenerator) + .localPartitionRoundRobin( + {PlanBuilder(planNodeIdGenerator) + .values(rowVectors) + .partialAggregation({}, {"checksum(c0)"}) + .planNode()}) .intermediateAggregation() - .finalAggregation() - .project({"to_base64(a0) as c0"}) - .planNode(); - assertQuery( - agg, - fmt::format("VALUES (CAST(\'{}\' AS VARCHAR))", expectedChecksum)); - } + .planNode()}) + .finalAggregation() + .project({"to_base64(a0) as c0"}) + .planNode(); + + params.maxDrivers = 2; + + assertQuery(params, expectedDuckDbSql); } template @@ -107,32 +120,29 @@ class ChecksumAggregateTest : public AggregationTestBase { assertQuery(agg, "VALUES " + boost::algorithm::join(expectedResults, ",")); // Add local exchange before intermediate aggregation. - { - std::vector expectedInts; - expectedInts.reserve(expectedChecksums.size()); - for (const auto& checksum : expectedChecksums) { - expectedInts.push_back( - fmt::format("({}::bigint)", decodeChecksum(checksum))); - } - - auto planNodeIdGenerator = std::make_shared(); - - CursorParameters params; - params.planNode = PlanBuilder(planNodeIdGenerator) - .localPartition( - {"c0"}, - {PlanBuilder(planNodeIdGenerator) - .values(rowVectors) - .partialAggregation({0}, {"checksum(c1)"}) - .planNode()}) - .intermediateAggregation() - .project({"a0"}) - .planNode(); - params.maxDrivers = 2; - - assertQuery( - params, "VALUES " + boost::algorithm::join(expectedInts, ",")); + std::vector expectedInts; + expectedInts.reserve(expectedChecksums.size()); + for (const auto& checksum : expectedChecksums) { + expectedInts.push_back( + fmt::format("({}::bigint)", decodeChecksum(checksum))); } + + auto planNodeIdGenerator = std::make_shared(); + + CursorParameters params; + params.planNode = PlanBuilder(planNodeIdGenerator) + .localPartition( + {"c0"}, + {PlanBuilder(planNodeIdGenerator) + .values(rowVectors) + .partialAggregation({0}, {"checksum(c1)"}) + .planNode()}) + .intermediateAggregation() + .project({"a0"}) + .planNode(); + params.maxDrivers = 2; + + assertQuery(params, "VALUES " + boost::algorithm::join(expectedInts, ",")); } template diff --git a/velox/functions/prestosql/aggregates/tests/CountAggregationTest.cpp b/velox/functions/prestosql/aggregates/tests/CountAggregationTest.cpp index edb9d244753c..7a5687ccb527 100644 --- a/velox/functions/prestosql/aggregates/tests/CountAggregationTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/CountAggregationTest.cpp @@ -40,109 +40,85 @@ TEST_F(CountAggregation, count) { auto vectors = makeVectors(rowType_, 10, 100); createDuckDbTable(vectors); - { - auto agg = PlanBuilder() - .values(vectors) - .partialAggregation({}, {"count(1)"}) - .planNode(); - assertQuery(agg, "SELECT count(1) FROM tmp"); - } - - { - // count over column with nulls; only non-null rows should be counted - auto agg = PlanBuilder() - .values(vectors) - .partialAggregation({}, {"count(c1)"}) - .finalAggregation() - .planNode(); - assertQuery(agg, "SELECT count(c1) FROM tmp"); - } + auto agg = PlanBuilder() + .values(vectors) + .partialAggregation({}, {"count(1)"}) + .planNode(); + assertQuery(agg, "SELECT count(1) FROM tmp"); + + // count over column with nulls; only non-null rows should be counted + agg = PlanBuilder() + .values(vectors) + .partialAggregation({}, {"count(c1)"}) + .finalAggregation() + .planNode(); + assertQuery(agg, "SELECT count(c1) FROM tmp"); - { - // count over zero rows; the result should be 0, not null - auto agg = PlanBuilder() - .values(vectors) - .filter("c0 % 3 > 5") - .partialAggregation({}, {"count(c1)"}) - .finalAggregation() - .planNode(); - assertQuery(agg, "SELECT count(c1) FROM tmp WHERE c0 % 3 > 5"); - } + // count over zero rows; the result should be 0, not null + agg = PlanBuilder() + .values(vectors) + .filter("c0 % 3 > 5") + .partialAggregation({}, {"count(c1)"}) + .finalAggregation() + .planNode(); + assertQuery(agg, "SELECT count(c1) FROM tmp WHERE c0 % 3 > 5"); - { - // final count aggregation is a sum of partial counts - auto agg = PlanBuilder() - .values(vectors) - .project({"cast(c1 as bigint) AS c1_bigint"}) - .finalAggregation({}, {"count(c1_bigint)"}, {BIGINT()}) - .planNode(); - assertQuery(agg, "SELECT sum(c1) FROM tmp"); - } + // final count aggregation is a sum of partial counts + agg = PlanBuilder() + .values(vectors) + .project({"cast(c1 as bigint) AS c1_bigint"}) + .finalAggregation({}, {"count(c1_bigint)"}, {BIGINT()}) + .planNode(); + assertQuery(agg, "SELECT sum(c1) FROM tmp"); - { - auto agg = PlanBuilder() - .values(vectors) - .project({"c0 % 10", "c1"}) - .partialAggregation({0}, {"count(1)"}) - .finalAggregation() - .planNode(); - assertQuery(agg, "SELECT c0 % 10, count(1) FROM tmp GROUP BY 1"); - } + agg = PlanBuilder() + .values(vectors) + .project({"c0 % 10", "c1"}) + .partialAggregation({0}, {"count(1)"}) + .finalAggregation() + .planNode(); + assertQuery(agg, "SELECT c0 % 10, count(1) FROM tmp GROUP BY 1"); - { - auto agg = PlanBuilder() - .values(vectors) - .project({"c0 % 10 AS c0_mod_10", "c7"}) - .partialAggregation({0}, {"count(c7)"}) - .planNode(); - assertQuery(agg, "SELECT c0 % 10, count(c7) FROM tmp GROUP BY 1"); - } + agg = PlanBuilder() + .values(vectors) + .project({"c0 % 10 AS c0_mod_10", "c7"}) + .partialAggregation({0}, {"count(c7)"}) + .planNode(); + assertQuery(agg, "SELECT c0 % 10, count(c7) FROM tmp GROUP BY 1"); // Add local exchange before intermediate aggregation. - { - auto planNodeIdGenerator = std::make_shared(); - - CursorParameters params; - params.planNode = PlanBuilder(planNodeIdGenerator) - .localPartition( - {"p0"}, - {PlanBuilder(planNodeIdGenerator) - .values(vectors) - .project({"c0 % 10", "c1"}) - .partialAggregation({0}, {"count(1)"}) - .planNode()}) - .intermediateAggregation() - .planNode(); - params.maxDrivers = 2; - - assertQuery(params, "SELECT c0 % 10, count(1) FROM tmp GROUP BY 1"); - } + auto planNodeIdGenerator = std::make_shared(); + + CursorParameters params; + params.planNode = PlanBuilder(planNodeIdGenerator) + .localPartition( + {"p0"}, + {PlanBuilder(planNodeIdGenerator) + .values(vectors) + .project({"c0 % 10", "c1"}) + .partialAggregation({0}, {"count(1)"}) + .planNode()}) + .intermediateAggregation() + .planNode(); + params.maxDrivers = 2; + assertQuery(params, "SELECT c0 % 10, count(1) FROM tmp GROUP BY 1"); // Add local exchange before intermediate aggregation. Single group. - { - auto planNodeIdGenerator = std::make_shared(); - - CursorParameters params; - params.planNode = - PlanBuilder(planNodeIdGenerator) - .localPartition( - {}, - {PlanBuilder(planNodeIdGenerator) - .localPartitionRoundRobin( - {PlanBuilder(planNodeIdGenerator) - .values(vectors) - .project({"c0"}) - .partialAggregation({}, {"count(1)"}) - .planNode()}) - .intermediateAggregation() - .planNode()}) - .finalAggregation() - .planNode(); - - params.maxDrivers = 2; - - assertQuery(params, "SELECT count(1) FROM tmp"); - } + params.planNode = PlanBuilder(planNodeIdGenerator) + .localPartition( + {}, + {PlanBuilder(planNodeIdGenerator) + .localPartitionRoundRobin( + {PlanBuilder(planNodeIdGenerator) + .values(vectors) + .project({"c0"}) + .partialAggregation({}, {"count(1)"}) + .planNode()}) + .intermediateAggregation() + .planNode()}) + .finalAggregation() + .planNode(); + assertQuery(params, "SELECT count(1) FROM tmp"); } } // namespace diff --git a/velox/functions/prestosql/aggregates/tests/MapAggTest.cpp b/velox/functions/prestosql/aggregates/tests/MapAggTest.cpp index 1ff550fbd5a7..c6bd76a022b3 100644 --- a/velox/functions/prestosql/aggregates/tests/MapAggTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/MapAggTest.cpp @@ -129,6 +129,24 @@ TEST_F(MapAggTest, global) { .finalAggregation() .planNode(); ASSERT_EQ(mapExpected, readSingleValue(op)); + + // Add local exchange before intermediate aggregation. Expect the same result. + auto planNodeIdGenerator = std::make_shared(); + op = PlanBuilder(planNodeIdGenerator) + .localPartition( + {}, + {PlanBuilder(planNodeIdGenerator) + .localPartitionRoundRobin( + {PlanBuilder(planNodeIdGenerator) + .values(vectors) + .partialAggregation({}, {"map_agg(c0, c1)"}) + .planNode()}) + .intermediateAggregation() + .planNode()}) + .finalAggregation() + .planNode(); + + ASSERT_EQ(mapExpected, readSingleValue(op, 2)); } TEST_F(MapAggTest, globalNoData) { From d634ca8e6136ea978d8f4da92d89987301e79a24 Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Thu, 14 Apr 2022 12:57:34 -0700 Subject: [PATCH 013/198] Add printExprWithStats helper function (#1425) Summary: Add helper function to print expression tree annotation with runtime stats such as cpu time and number of processed rows. Also, add cpu time tracking for Cast expression. Tracking for other built-in expressions such as AND, OR, SWITCH will come in follow-up PRs. Here are some examples: 2 expressions evaluated on flat vectors of size 1024: `(c0 + 3) * c1`, `(c0 + c1) % 2 = 0` ``` multiply [cpu time: 53.70us, rows: 1024] -> BIGINT plus [cpu time: 70.69us, rows: 1024] -> BIGINT cast(c0 as BIGINT) [cpu time: 132.21us, rows: 1024] -> BIGINT c0 [cpu time: 0ns, rows: 0] -> INTEGER 3:BIGINT [cpu time: 0ns, rows: 0] -> BIGINT cast(c1 as BIGINT) [cpu time: 67.17us, rows: 1024] -> BIGINT c1 [cpu time: 0ns, rows: 0] -> INTEGER eq [cpu time: 87.21us, rows: 1024] -> BOOLEAN mod [cpu time: 55.06us, rows: 1024] -> BIGINT cast(plus as BIGINT) [cpu time: 64.71us, rows: 1024] -> BIGINT plus [cpu time: 60.85us, rows: 1024] -> INTEGER c0 [cpu time: 0ns, rows: 0] -> INTEGER c1 [cpu time: 0ns, rows: 0] -> INTEGER 2:BIGINT [cpu time: 0ns, rows: 0] -> BIGINT 0:BIGINT [cpu time: 0ns, rows: 0] -> BIGINT ``` 2 expressions evaluated on dictionary-encoded vectors of size 1024 where each row is repeated 5 times: `(c0 + 3) * c1`, `(c0 + c1) % 2 = 0`. Notice that evaluation happened only on 1/5 of the rows. ``` multiply [cpu time: 35.52us, rows: 205] -> BIGINT plus [cpu time: 46.66us, rows: 205] -> BIGINT cast(c0 as BIGINT) [cpu time: 80.15us, rows: 205] -> BIGINT c0 [cpu time: 0ns, rows: 0] -> INTEGER 3:BIGINT [cpu time: 0ns, rows: 0] -> BIGINT cast(c1 as BIGINT) [cpu time: 34.44us, rows: 205] -> BIGINT c1 [cpu time: 0ns, rows: 0] -> INTEGER eq [cpu time: 43.46us, rows: 205] -> BOOLEAN mod [cpu time: 35.93us, rows: 205] -> BIGINT cast(plus as BIGINT) [cpu time: 34.83us, rows: 205] -> BIGINT plus [cpu time: 34.43us, rows: 205] -> INTEGER c0 [cpu time: 0ns, rows: 0] -> INTEGER c1 [cpu time: 0ns, rows: 0] -> INTEGER 2:BIGINT [cpu time: 0ns, rows: 0] -> BIGINT 0:BIGINT [cpu time: 0ns, rows: 0] -> BIGINT ``` Pull Request resolved: https://github.com/facebookincubator/velox/pull/1425 Reviewed By: pedroerp Differential Revision: D35643641 Pulled By: mbasmanova fbshipit-source-id: f0f563776bb0249f29d30dde5c1e8efa4710cd24 --- velox/expression/CastExpr.cpp | 11 +- velox/expression/CastExpr.h | 2 +- velox/expression/ControlExpr.cpp | 2 +- velox/expression/ControlExpr.h | 2 +- velox/expression/Expr.cpp | 44 +++++- velox/expression/Expr.h | 14 +- velox/expression/tests/CMakeLists.txt | 2 + velox/expression/tests/ExprStatsTest.cpp | 164 +++++++++++++++++++++++ 8 files changed, 230 insertions(+), 11 deletions(-) create mode 100644 velox/expression/tests/ExprStatsTest.cpp diff --git a/velox/expression/CastExpr.cpp b/velox/expression/CastExpr.cpp index c736bdb127f3..b969091f71d3 100644 --- a/velox/expression/CastExpr.cpp +++ b/velox/expression/CastExpr.cpp @@ -632,13 +632,20 @@ void CastExpr::evalSpecialForm( inputs_[0]->eval(rows, context, &input); auto fromType = inputs_[0]->type(); auto toType = std::const_pointer_cast(type_); + + stats_.numProcessedRows += rows.countSelected(); + CpuWallTimer timer(stats_.timing); apply(rows, input, context, fromType, toType, result); } -std::string CastExpr::toString() const { +std::string CastExpr::toString(bool recursive) const { std::stringstream out; out << "cast("; - appendInputs(out); + if (recursive) { + appendInputs(out); + } else { + out << inputs_[0]->toString(false); + } out << " as " << type_->toString() << ")"; return out.str(); } diff --git a/velox/expression/CastExpr.h b/velox/expression/CastExpr.h index 56c1397125a5..03a3e9a0c005 100644 --- a/velox/expression/CastExpr.h +++ b/velox/expression/CastExpr.h @@ -40,7 +40,7 @@ class CastExpr : public SpecialForm { EvalCtx* context, VectorPtr* result) override; - std::string toString() const override; + std::string toString(bool recursive = true) const override; private: /// @tparam To The cast target type diff --git a/velox/expression/ControlExpr.cpp b/velox/expression/ControlExpr.cpp index 752a15c589c8..73230153224d 100644 --- a/velox/expression/ControlExpr.cpp +++ b/velox/expression/ControlExpr.cpp @@ -66,7 +66,7 @@ void ConstantExpr::evalSpecialFormSimplified( } } -std::string ConstantExpr::toString() const { +std::string ConstantExpr::toString(bool /*recursive*/) const { if (sharedSubexprValues_ == nullptr) { return fmt::format("{}:{}", value_.toJson(), type()->toString()); } else { diff --git a/velox/expression/ControlExpr.h b/velox/expression/ControlExpr.h index 76485bd8e301..477c1f027141 100644 --- a/velox/expression/ControlExpr.h +++ b/velox/expression/ControlExpr.h @@ -87,7 +87,7 @@ class ConstantExpr : public SpecialForm { return sharedSubexprValues_; } - std::string toString() const override; + std::string toString(bool recursive = true) const override; private: const variant value_; diff --git a/velox/expression/Expr.cpp b/velox/expression/Expr.cpp index 3564c37b705e..ea15a22f712e 100644 --- a/velox/expression/Expr.cpp +++ b/velox/expression/Expr.cpp @@ -15,6 +15,7 @@ */ #include "velox/expression/Expr.h" +#include "velox/common/base/SuccinctPrinter.h" #include "velox/core/Expressions.h" #include "velox/expression/ControlExpr.h" #include "velox/expression/ExprCompiler.h" @@ -1140,11 +1141,15 @@ void Expr::applySingleConstArgVectorFunction( } } -std::string Expr::toString() const { - std::stringstream out; - out << name_; - appendInputs(out); - return out.str(); +std::string Expr::toString(bool recursive) const { + if (recursive) { + std::stringstream out; + out << name_; + appendInputs(out); + return out.str(); + } + + return name_; } void Expr::appendInputs(std::stringstream& stream) const { @@ -1224,4 +1229,33 @@ std::unique_ptr makeExprSetFromFlag( return std::make_unique(std::move(source), execCtx); } +namespace { +void printExprWithStats( + const exec::Expr& expr, + const std::string& indent, + std::stringstream& out) { + const auto& stats = expr.stats(); + out << indent << expr.toString(false) + << " [cpu time: " << succinctNanos(stats.timing.cpuNanos) + << ", rows: " << stats.numProcessedRows << "] -> " + << expr.type()->toString() << std::endl; + + auto newIndent = indent + " "; + for (const auto& input : expr.inputs()) { + printExprWithStats(*input, newIndent, out); + } +} +} // namespace + +std::string printExprWithStats(const exec::ExprSet& exprSet) { + const auto& exprs = exprSet.exprs(); + std::stringstream out; + for (auto i = 0; i < exprs.size(); ++i) { + if (i > 0) { + out << std::endl; + } + printExprWithStats(*exprs[i], "", out); + } + return out.str(); +} } // namespace facebook::velox::exec diff --git a/velox/expression/Expr.h b/velox/expression/Expr.h index fb6ceea7f70a..65571a058c02 100644 --- a/velox/expression/Expr.h +++ b/velox/expression/Expr.h @@ -157,7 +157,13 @@ class Expr { return inputs_; } - virtual std::string toString() const; + /// @param recursive If true, the output includes input expressions and all + /// their inputs recursively. + virtual std::string toString(bool recursive = true) const; + + const ExprStats& stats() const { + return stats_; + } private: void setAllNulls( @@ -432,4 +438,10 @@ std::unique_ptr makeExprSetFromFlag( std::vector>&& source, core::ExecCtx* execCtx); +/// Returns a string representation of the expression trees annotated with +/// runtime statistics. Expected to be called after calling ExprSet::eval one or +/// more times. If called before ExprSet::eval runtime statistics will be all +/// zeros. +std::string printExprWithStats(const ExprSet& exprSet); + } // namespace facebook::velox::exec diff --git a/velox/expression/tests/CMakeLists.txt b/velox/expression/tests/CMakeLists.txt index bb2e74bde787..9d94b5496781 100644 --- a/velox/expression/tests/CMakeLists.txt +++ b/velox/expression/tests/CMakeLists.txt @@ -15,6 +15,7 @@ add_executable( velox_expression_test ExprTest.cpp + ExprStatsTest.cpp CastExprTest.cpp MapWriterTest.cpp ArrayWriterTest.cpp @@ -59,6 +60,7 @@ target_link_libraries( velox_vector_fuzzer gtest gtest_main + gmock ${FOLLY_WITH_DEPENDENCIES} ${gflags_LIBRARIES} glog::glog diff --git a/velox/expression/tests/ExprStatsTest.cpp b/velox/expression/tests/ExprStatsTest.cpp new file mode 100644 index 000000000000..eb6e6971a3af --- /dev/null +++ b/velox/expression/tests/ExprStatsTest.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "velox/core/Expressions.h" +#include "velox/expression/Expr.h" +#include "velox/functions/prestosql/registration/RegistrationFunctions.h" +#include "velox/parse/Expressions.h" +#include "velox/parse/ExpressionsParser.h" +#include "velox/parse/TypeResolver.h" +#include "velox/vector/tests/VectorTestBase.h" + +using namespace facebook::velox; +using namespace facebook::velox::test; + +class ExprStatsTest : public testing::Test, public VectorTestBase { + protected: + void SetUp() override { + functions::prestosql::registerAllScalarFunctions(); + parse::registerTypeResolver(); + } + + static RowTypePtr asRowType(const TypePtr& type) { + return std::dynamic_pointer_cast(type); + } + + std::shared_ptr parseExpression( + const std::string& text, + const RowTypePtr& rowType) { + auto untyped = parse::parseExpr(text); + return core::Expressions::inferTypes(untyped, rowType, execCtx_->pool()); + } + + std::unique_ptr compileExpressions( + const std::vector& exprs, + const RowTypePtr& rowType) { + std::vector> expressions; + expressions.reserve(exprs.size()); + for (const auto& expr : exprs) { + expressions.emplace_back(parseExpression(expr, rowType)); + } + return std::make_unique( + std::move(expressions), execCtx_.get()); + } + + VectorPtr evaluate(exec::ExprSet& exprSet, const RowVectorPtr& input) { + exec::EvalCtx context(execCtx_.get(), &exprSet, input.get()); + + SelectivityVector rows(input->size()); + std::vector result(1); + exprSet.eval(rows, &context, &result); + return result[0]; + } + + std::shared_ptr queryCtx_{core::QueryCtx::createForTest()}; + std::unique_ptr pool_{ + memory::getDefaultScopedMemoryPool()}; + std::unique_ptr execCtx_{ + std::make_unique(pool_.get(), queryCtx_.get())}; +}; + +TEST_F(ExprStatsTest, printWithStats) { + vector_size_t size = 1'024; + + auto data = makeRowVector({ + makeFlatVector(size, [](auto row) { return row; }), + makeFlatVector(size, [](auto row) { return row % 7; }), + }); + + auto rowType = asRowType(data->type()); + { + auto exprSet = + compileExpressions({"(c0 + 3) * c1", "(c0 + c1) % 2 = 0"}, rowType); + + // Check stats before evaluation. + ASSERT_EQ( + exec::printExprWithStats(*exprSet), + "multiply [cpu time: 0ns, rows: 0] -> BIGINT\n" + " plus [cpu time: 0ns, rows: 0] -> BIGINT\n" + " cast(c0 as BIGINT) [cpu time: 0ns, rows: 0] -> BIGINT\n" + " c0 [cpu time: 0ns, rows: 0] -> INTEGER\n" + " 3:BIGINT [cpu time: 0ns, rows: 0] -> BIGINT\n" + " cast(c1 as BIGINT) [cpu time: 0ns, rows: 0] -> BIGINT\n" + " c1 [cpu time: 0ns, rows: 0] -> INTEGER\n" + "\n" + "eq [cpu time: 0ns, rows: 0] -> BOOLEAN\n" + " mod [cpu time: 0ns, rows: 0] -> BIGINT\n" + " cast(plus as BIGINT) [cpu time: 0ns, rows: 0] -> BIGINT\n" + " plus [cpu time: 0ns, rows: 0] -> INTEGER\n" + " c0 [cpu time: 0ns, rows: 0] -> INTEGER\n" + " c1 [cpu time: 0ns, rows: 0] -> INTEGER\n" + " 2:BIGINT [cpu time: 0ns, rows: 0] -> BIGINT\n" + " 0:BIGINT [cpu time: 0ns, rows: 0] -> BIGINT\n"); + + evaluate(*exprSet, data); + + // Check stats after evaluation. + ASSERT_THAT( + exec::printExprWithStats(*exprSet), + ::testing::MatchesRegex( + "multiply .cpu time: .+, rows: 1024. -> BIGINT\n" + " plus .cpu time: .+, rows: 1024. -> BIGINT\n" + " cast.c0 as BIGINT. .cpu time: .+, rows: 1024. -> BIGINT\n" + " c0 .cpu time: 0ns, rows: 0. -> INTEGER\n" + " 3:BIGINT .cpu time: 0ns, rows: 0. -> BIGINT\n" + " cast.c1 as BIGINT. .cpu time: .+, rows: 1024. -> BIGINT\n" + " c1 .cpu time: 0ns, rows: 0. -> INTEGER\n" + "\n" + "eq .cpu time: .+, rows: 1024. -> BOOLEAN\n" + " mod .cpu time: .+, rows: 1024. -> BIGINT\n" + " cast.plus as BIGINT. .cpu time: .+, rows: 1024. -> BIGINT\n" + " plus .cpu time: .+, rows: 1024. -> INTEGER\n" + " c0 .cpu time: 0ns, rows: 0. -> INTEGER\n" + " c1 .cpu time: 0ns, rows: 0. -> INTEGER\n" + " 2:BIGINT .cpu time: 0ns, rows: 0. -> BIGINT\n" + " 0:BIGINT .cpu time: 0ns, rows: 0. -> BIGINT\n")); + } + + // Use dictionary encoding to repeat each row 5 times. + auto indices = makeIndices(size, [](auto row) { return row / 5; }); + data = makeRowVector({ + wrapInDictionary(indices, size, data->childAt(0)), + wrapInDictionary(indices, size, data->childAt(1)), + }); + + { + auto exprSet = + compileExpressions({"(c0 + 3) * c1", "(c0 + c1) % 2 = 0"}, rowType); + evaluate(*exprSet, data); + + ASSERT_THAT( + exec::printExprWithStats(*exprSet), + ::testing::MatchesRegex( + "multiply .cpu time: .+, rows: 205. -> BIGINT\n" + " plus .cpu time: .+, rows: 205. -> BIGINT\n" + " cast.c0 as BIGINT. .cpu time: .+, rows: 205. -> BIGINT\n" + " c0 .cpu time: 0ns, rows: 0. -> INTEGER\n" + " 3:BIGINT .cpu time: 0ns, rows: 0. -> BIGINT\n" + " cast.c1 as BIGINT. .cpu time: .+, rows: 205. -> BIGINT\n" + " c1 .cpu time: 0ns, rows: 0. -> INTEGER\n" + "\n" + "eq .cpu time: .+, rows: 205. -> BOOLEAN\n" + " mod .cpu time: .+, rows: 205. -> BIGINT\n" + " cast.plus as BIGINT. .cpu time: .+, rows: 205. -> BIGINT\n" + " plus .cpu time: .+, rows: 205. -> INTEGER\n" + " c0 .cpu time: 0ns, rows: 0. -> INTEGER\n" + " c1 .cpu time: 0ns, rows: 0. -> INTEGER\n" + " 2:BIGINT .cpu time: 0ns, rows: 0. -> BIGINT\n" + " 0:BIGINT .cpu time: 0ns, rows: 0. -> BIGINT\n")); + } +} From cbbdc0d88343c577fc429fa1b3f2b0c5ab017647 Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Thu, 14 Apr 2022 14:03:47 -0700 Subject: [PATCH 014/198] Replace SimpleVector metadata with strongly typed SimpleVectorStats (#1347) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1347 SimpleVector uses a string-ly typed map to store metadata. This seems to be used primarily for storing min/max stats as well as the bias for BiasVectors. This is inefficient because we end up de/serializing back and forth from strings, and doesn't work for all types (e.g. opaque types). This causes issues in VectorMaker as it tries to initialize stats when creating vectors, so we can't create vectors of Opaque types. Since SimpleVector is templated on the underlying type, I add a SimpleVectorStats object, also templated on this type, and pass this to the vector's constructor instead of the map of strings. This way min/max don't need to be serialized/deserialized, and it works for any type. Reviewed By: laithsakka Differential Revision: D35363043 fbshipit-source-id: 674cb6d8c2ac4d47a94c9ac9a2138fae4d618c86 --- .../reader/SelectiveColumnReaderInternal.h | 2 +- velox/expression/EvalCtx.cpp | 2 +- velox/type/Type.h | 17 ++-- velox/vector/BaseVector.cpp | 16 ++-- velox/vector/BaseVector.h | 4 - velox/vector/BiasVector-inl.h | 16 ++-- velox/vector/BiasVector.h | 8 +- velox/vector/CMakeLists.txt | 1 - velox/vector/ConstantVector-inl.h | 2 +- velox/vector/ConstantVector.cpp | 6 ++ velox/vector/ConstantVector.h | 18 ++-- velox/vector/DictionaryVector-inl.h | 8 +- velox/vector/DictionaryVector.h | 14 ++- velox/vector/FlatVector-inl.h | 2 +- velox/vector/FlatVector.h | 10 +-- velox/vector/SequenceVector-inl.h | 8 +- velox/vector/SequenceVector.h | 3 +- velox/vector/SimpleVector.cpp | 39 --------- velox/vector/SimpleVector.h | 87 ++++--------------- velox/vector/arrow/Bridge.cpp | 4 +- velox/vector/tests/VectorMaker-inl.h | 31 ++----- velox/vector/tests/VectorMakerStats.h | 36 +++++--- velox/vector/tests/VectorTest.cpp | 6 +- velox/vector/tests/VectorTestBase.h | 2 +- 24 files changed, 117 insertions(+), 225 deletions(-) delete mode 100644 velox/vector/SimpleVector.cpp diff --git a/velox/dwio/dwrf/reader/SelectiveColumnReaderInternal.h b/velox/dwio/dwrf/reader/SelectiveColumnReaderInternal.h index a8473e976bef..31583c447e42 100644 --- a/velox/dwio/dwrf/reader/SelectiveColumnReaderInternal.h +++ b/velox/dwio/dwrf/reader/SelectiveColumnReaderInternal.h @@ -129,7 +129,7 @@ void SelectiveColumnReader::getFlatValues( true, type, T(), - cdvi::EMPTY_METADATA, + SimpleVectorStats{}, sizeof(TVector) * rows.size()); return; } diff --git a/velox/expression/EvalCtx.cpp b/velox/expression/EvalCtx.cpp index 2e42fe997ee7..ec58e4d15836 100644 --- a/velox/expression/EvalCtx.cpp +++ b/velox/expression/EvalCtx.cpp @@ -141,7 +141,7 @@ void EvalCtx::addError( AlignedBuffer::allocate( size, pool(), ErrorVector::value_type()), std::vector(0), - cdvi::EMPTY_METADATA, + SimpleVectorStats>{}, 1 /*distinctValueCount*/, size /*nullCount*/, false /*isSorted*/, diff --git a/velox/type/Type.h b/velox/type/Type.h index 2d8f5e365762..8c105f4a63cf 100644 --- a/velox/type/Type.h +++ b/velox/type/Type.h @@ -1564,40 +1564,39 @@ static inline int32_t sizeOfTypeKind(TypeKind kind) { return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(sizeOfTypeKindHelper, kind); } -// Helper for using folly::to with StringView. template -static inline T to(U value) { +static inline T to(const U& value) { return folly::to(value); } template <> -inline Timestamp to(std::string value) { +inline Timestamp to(const std::string& value) { return Timestamp(0, 0); } template <> -inline UnknownValue to(std::string /* value */) { +inline UnknownValue to(const std::string& /* value */) { return UnknownValue(); } template <> -inline std::string to(Timestamp value) { +inline std::string to(const Timestamp& value) { return value.toString(); } template <> -inline std::string to(velox::StringView value) { +inline std::string to(const velox::StringView& value) { return std::string(value.data(), value.size()); } template <> -inline std::string to(ComplexType value) { +inline std::string to(const ComplexType& value) { return std::string("ComplexType"); } template <> -inline ComplexType to(std::string value) { - return ComplexType(); +inline velox::StringView to(const std::string& value) { + return velox::StringView(value); } namespace exec { diff --git a/velox/vector/BaseVector.cpp b/velox/vector/BaseVector.cpp index f468fc1bd11b..8614af4b305d 100644 --- a/velox/vector/BaseVector.cpp +++ b/velox/vector/BaseVector.cpp @@ -111,7 +111,7 @@ static VectorPtr addDictionary( std::move(vector), TypeKind::INTEGER, std::move(indices), - cdvi::EMPTY_METADATA, + SimpleVectorStats::WrapperType>{}, indicesBuffer->size() / sizeof(vector_size_t) /*distinctValueCount*/, base->getNullCount(), false /*isSorted*/, @@ -144,7 +144,7 @@ addSequence(BufferPtr lengths, vector_size_t size, VectorPtr vector) { size, std::move(vector), std::move(lengths), - cdvi::EMPTY_METADATA, + SimpleVectorStats::WrapperType>{}, std::nullopt /*distinctCount*/, std::nullopt, false /*sorted*/, @@ -177,7 +177,7 @@ addConstant(vector_size_t size, vector_size_t index, VectorPtr vector) { auto singleNull = BaseVector::create(vector->type(), 1, pool); singleNull->setNull(0, true); return std::make_shared>( - pool, size, 0, singleNull, cdvi::EMPTY_METADATA); + pool, size, 0, singleNull, SimpleVectorStats{}); } else { return std::make_shared>( pool, size, true, vector->type(), T()); @@ -207,7 +207,7 @@ addConstant(vector_size_t size, vector_size_t index, VectorPtr vector) { } return std::make_shared>( - pool, size, index, std::move(vector), cdvi::EMPTY_METADATA); + pool, size, index, std::move(vector), SimpleVectorStats{}); } // static @@ -234,7 +234,7 @@ static VectorPtr createEmpty( size, std::move(values), std::vector(), - cdvi::EMPTY_METADATA, + SimpleVectorStats{}, 0 /*distinctValueCount*/, 0 /*nullCount*/, false /*isSorted*/, @@ -332,7 +332,7 @@ VectorPtr BaseVector::create( size, BufferPtr(nullptr), std::vector(), - cdvi::EMPTY_METADATA, + SimpleVectorStats{}, 1 /*distinctValueCount*/, size /*nullCount*/, true /*isSorted*/, @@ -539,7 +539,7 @@ VectorPtr newConstant( value.isNull(), Type::create(), std::move(copy), - cdvi::EMPTY_METADATA, + SimpleVectorStats{}, sizeof(T) /*representedByteCount*/); } @@ -557,7 +557,7 @@ VectorPtr newConstant( value.isNull(), capsule.type, std::shared_ptr(capsule.obj), - cdvi::EMPTY_METADATA, + SimpleVectorStats>{}, sizeof(std::shared_ptr) /*representedByteCount*/); } diff --git a/velox/vector/BaseVector.h b/velox/vector/BaseVector.h index fff18c324ef8..07ab34b37164 100644 --- a/velox/vector/BaseVector.h +++ b/velox/vector/BaseVector.h @@ -43,10 +43,6 @@ namespace facebook { namespace velox { -namespace cdvi { -const folly::F14FastMap EMPTY_METADATA; -} // namespace cdvi - template class SimpleVector; diff --git a/velox/vector/BiasVector-inl.h b/velox/vector/BiasVector-inl.h index 16389b87bf88..eb19005f5d3e 100644 --- a/velox/vector/BiasVector-inl.h +++ b/velox/vector/BiasVector-inl.h @@ -46,8 +46,6 @@ static inline __m256i simdAdd(const __m256i& left, const __m256i& right) { * 2 buffers. * * Buffer order: [nullData, valueData] - * - * The bias value is stored in the metaData block via the BIAS_VALUE key. */ template BiasVector::BiasVector( @@ -56,7 +54,8 @@ BiasVector::BiasVector( size_t length, TypeKind valueType, BufferPtr values, - const folly::F14FastMap& metaData, + T bias, + const SimpleVectorStats& stats, std::optional distinctCount, std::optional nullCount, std::optional sorted, @@ -66,23 +65,20 @@ BiasVector::BiasVector( pool, nulls, length, - metaData, + stats, distinctCount, nullCount, sorted, representedBytes, storageByteCount), valueType_(valueType), - values_(std::move(values)) { + values_(std::move(values)), + bias_(bias) { VELOX_CHECK( valueType_ == TypeKind::INTEGER || valueType_ == TypeKind::SMALLINT || valueType_ == TypeKind::TINYINT, "Invalid array type for biased values"); - auto bias = - SimpleVector::template getMetaDataValue(metaData, BIAS_VALUE); - VELOX_CHECK(bias.has_value(), "Bias value is required"); - bias_ = bias.value(); biasBuffer_ = simd::setAll256i(bias_); switch (valueType_) { @@ -122,7 +118,7 @@ std::unique_ptr> BiasVector::hashAll() const { BaseVector::length_, hashes, std::vector(0) /*stringBuffers*/, - cdvi::EMPTY_METADATA, + SimpleVectorStats{}, std::nullopt /*distinctValueCount*/, 0 /* nullCount */, false /*isSorted*/, diff --git a/velox/vector/BiasVector.h b/velox/vector/BiasVector.h index 3e18f1e95b8c..b43efe130e04 100644 --- a/velox/vector/BiasVector.h +++ b/velox/vector/BiasVector.h @@ -92,7 +92,6 @@ class BiasVector : public SimpleVector { // bias"); public: - static const std::string BIAS_VALUE; static constexpr bool can_simd = (std::is_same::value || std::is_same::value || std::is_same::value); @@ -103,8 +102,8 @@ class BiasVector : public SimpleVector { size_t length, TypeKind valueType, BufferPtr values, - const folly::F14FastMap& metaData = - cdvi::EMPTY_METADATA, + T bias, + const SimpleVectorStats& stats = {}, std::optional distinctCount = std::nullopt, std::optional nullCount = std::nullopt, std::optional sorted = std::nullopt, @@ -256,9 +255,6 @@ class BiasVector : public SimpleVector { __m256i biasBuffer_; }; -template -const std::string BiasVector::BIAS_VALUE = "CDV1"; - template using BiasVectorPtr = std::shared_ptr>; diff --git a/velox/vector/CMakeLists.txt b/velox/vector/CMakeLists.txt index ecd845fe7f0d..5355b603a0b5 100644 --- a/velox/vector/CMakeLists.txt +++ b/velox/vector/CMakeLists.txt @@ -21,7 +21,6 @@ add_library( LazyVector.cpp SelectivityVector.cpp SequenceVector.cpp - SimpleVector.cpp VectorEncoding.cpp VectorStream.cpp) diff --git a/velox/vector/ConstantVector-inl.h b/velox/vector/ConstantVector-inl.h index 610f215e29ac..5f960e5ec77a 100644 --- a/velox/vector/ConstantVector-inl.h +++ b/velox/vector/ConstantVector-inl.h @@ -24,7 +24,7 @@ std::unique_ptr> ConstantVector::hashAll() const { BaseVector::length_, false /* isNull */, this->hashValueAt(0), - folly::F14FastMap(), + SimpleVectorStats{}, /* stats */ sizeof(uint64_t) * BaseVector::length_ /* representedBytes */); } diff --git a/velox/vector/ConstantVector.cpp b/velox/vector/ConstantVector.cpp index bf10c36525a8..aeb38b893fe0 100644 --- a/velox/vector/ConstantVector.cpp +++ b/velox/vector/ConstantVector.cpp @@ -36,5 +36,11 @@ void ConstantVector>::setValue( VELOX_NYI(); } +template <> +void ConstantVector::setValue(const std::string& /*string*/) { + VELOX_UNSUPPORTED( + "ConstantVectors of ComplexType cannot be initialized from string values."); +} + } // namespace velox } // namespace facebook diff --git a/velox/vector/ConstantVector.h b/velox/vector/ConstantVector.h index 69966805471c..581c0ecb203c 100644 --- a/velox/vector/ConstantVector.h +++ b/velox/vector/ConstantVector.h @@ -45,8 +45,7 @@ class ConstantVector final : public SimpleVector { size_t length, bool isNull, T&& val, - const folly::F14FastMap& metaData = - cdvi::EMPTY_METADATA, + const SimpleVectorStats& stats = {}, std::optional representedBytes = std::nullopt, std::optional storageByteCount = std::nullopt) : ConstantVector( @@ -55,7 +54,7 @@ class ConstantVector final : public SimpleVector { isNull, CppToType::create(), std::move(val), - metaData, + stats, representedBytes, storageByteCount) {} @@ -65,8 +64,7 @@ class ConstantVector final : public SimpleVector { bool isNull, std::shared_ptr type, T&& val, - const folly::F14FastMap& metaData = - cdvi::EMPTY_METADATA, + const SimpleVectorStats& stats = {}, std::optional representedBytes = std::nullopt, std::optional storageByteCount = std::nullopt) : SimpleVector( @@ -74,7 +72,7 @@ class ConstantVector final : public SimpleVector { type, BufferPtr(nullptr), length, - metaData, + stats, isNull ? 0 : 1 /* distinctValueCount */, isNull ? length : 0 /* nullCount */, true /*isSorted*/, @@ -114,14 +112,13 @@ class ConstantVector final : public SimpleVector { vector_size_t length, vector_size_t index, VectorPtr base, - const folly::F14FastMap& metaData = - cdvi::EMPTY_METADATA) + const SimpleVectorStats& stats = {}) : SimpleVector( pool, base->type(), BufferPtr(nullptr), length, - metaData, + stats, std::nullopt, std::nullopt, true /*isSorted*/, @@ -413,6 +410,9 @@ void ConstantVector::setValue(const std::string& string); template <> void ConstantVector>::setValue(const std::string& string); +template <> +void ConstantVector::setValue(const std::string& string); + template using ConstantVectorPtr = std::shared_ptr>; diff --git a/velox/vector/DictionaryVector-inl.h b/velox/vector/DictionaryVector-inl.h index 1aa300b24050..b64177519471 100644 --- a/velox/vector/DictionaryVector-inl.h +++ b/velox/vector/DictionaryVector-inl.h @@ -55,7 +55,7 @@ DictionaryVector::DictionaryVector( std::shared_ptr dictionaryValues, TypeKind indexType, BufferPtr dictionaryIndices, - const folly::F14FastMap& metaData, + const SimpleVectorStats& stats, std::optional distinctValueCount, std::optional nullCount, std::optional isSorted, @@ -66,13 +66,13 @@ DictionaryVector::DictionaryVector( dictionaryValues->type(), nulls, length, - metaData, + stats, distinctValueCount, nullCount, isSorted, representedBytes, storageByteCount), - dictionaryMetaData_(metaData) { + dictionaryStats_(stats) { VELOX_CHECK(dictionaryValues != nullptr, "dictionaryValues must not be null"); VELOX_CHECK( dictionaryIndices != nullptr, "dictionaryIndices must not be null"); @@ -139,7 +139,7 @@ std::unique_ptr> DictionaryVector::hashAll() const { BaseVector::length_, hashes, std::vector(0) /* stringBuffers */, - folly::F14FastMap(), + SimpleVectorStats{}, BaseVector::distinctValueCount_.value() + (BaseVector::nullCount_.value_or(0) > 0 ? 1 : 0), 0 /* nullCount */, diff --git a/velox/vector/DictionaryVector.h b/velox/vector/DictionaryVector.h index 378539fba4a3..26603079805e 100644 --- a/velox/vector/DictionaryVector.h +++ b/velox/vector/DictionaryVector.h @@ -55,8 +55,7 @@ class DictionaryVector : public SimpleVector { VectorPtr dictionaryValues, TypeKind indexTypeKind, BufferPtr dictionaryIndexArray, - const folly::F14FastMap& metaData = - cdvi::EMPTY_METADATA, + const SimpleVectorStats& stats = {}, std::optional distinctValueCount = std::nullopt, std::optional nullCount = std::nullopt, std::optional isSorted = std::nullopt, @@ -114,13 +113,12 @@ class DictionaryVector : public SimpleVector { } /** - * @return metadata for the internal dictionary value vector. They + * @return stats for the internal dictionary value vector. They * hold the min and max value in the dictionary. */ // TODO (T61713241): Remove this later. - inline const folly::F14FastMap& - getDictionaryMetaData() const { - return dictionaryMetaData_; + inline const SimpleVectorStats& getDictionaryStats() const { + return dictionaryStats_; } inline const BufferPtr& indices() const { @@ -238,8 +236,8 @@ class DictionaryVector : public SimpleVector { void setInternalState(); - // metadata over the contained vector data - folly::F14FastMap dictionaryMetaData_; + // stats over the contained vector data + SimpleVectorStats dictionaryStats_; // the dictionary indices of the vector can be variable types depending on the // size of the dictionary - kept as original and typed diff --git a/velox/vector/FlatVector-inl.h b/velox/vector/FlatVector-inl.h index b0049fd1f0eb..33fc906d0daf 100644 --- a/velox/vector/FlatVector-inl.h +++ b/velox/vector/FlatVector-inl.h @@ -111,7 +111,7 @@ std::unique_ptr> FlatVector::hashAll() const { BaseVector::length_, std::move(hashBuffer), std::vector() /*stringBuffers*/, - folly::F14FastMap(), + SimpleVectorStats{}, /* stats */ std::nullopt /*distinctValueCount*/, 0 /*nullCount*/, false /*sorted*/, diff --git a/velox/vector/FlatVector.h b/velox/vector/FlatVector.h index 395a041a5b00..1d06b04fb6e7 100644 --- a/velox/vector/FlatVector.h +++ b/velox/vector/FlatVector.h @@ -57,8 +57,7 @@ class FlatVector final : public SimpleVector { size_t length, BufferPtr values, std::vector&& stringBuffers, - const folly::F14FastMap& metaData = - cdvi::EMPTY_METADATA, + const SimpleVectorStats& stats = {}, std::optional distinctValueCount = std::nullopt, std::optional nullCount = std::nullopt, std::optional isSorted = std::nullopt, @@ -71,7 +70,7 @@ class FlatVector final : public SimpleVector { length, values, std::move(stringBuffers), - metaData, + stats, distinctValueCount, nullCount, isSorted, @@ -85,8 +84,7 @@ class FlatVector final : public SimpleVector { size_t length, BufferPtr values, std::vector&& stringBuffers, - const folly::F14FastMap& metaData = - cdvi::EMPTY_METADATA, + const SimpleVectorStats& stats = {}, std::optional distinctValueCount = std::nullopt, std::optional nullCount = std::nullopt, std::optional isSorted = std::nullopt, @@ -97,7 +95,7 @@ class FlatVector final : public SimpleVector { type, std::move(nulls), length, - metaData, + stats, distinctValueCount, nullCount, isSorted, diff --git a/velox/vector/SequenceVector-inl.h b/velox/vector/SequenceVector-inl.h index be22c6e780d3..39ef8399a9ad 100644 --- a/velox/vector/SequenceVector-inl.h +++ b/velox/vector/SequenceVector-inl.h @@ -27,7 +27,7 @@ SequenceVector::SequenceVector( size_t length, std::shared_ptr sequenceValues, BufferPtr sequenceLengths, - const folly::F14FastMap& metaData, + const SimpleVectorStats& stats, std::optional distinctCount, std::optional nullCount, std::optional isSorted, @@ -38,7 +38,7 @@ SequenceVector::SequenceVector( sequenceValues->type(), BufferPtr(nullptr), length, - metaData, + stats, distinctCount, nullCount, isSorted, @@ -102,7 +102,7 @@ std::unique_ptr> SequenceVector::hashAll() const { sequenceCount, hashes, std::vector(0) /*stringBuffers*/, - cdvi::EMPTY_METADATA, + SimpleVectorStats{}, /*stats*/ std::nullopt /*distinctValueCount*/, 0 /* nullCount */, false /* sorted */, @@ -113,7 +113,7 @@ std::unique_ptr> SequenceVector::hashAll() const { BaseVector::length_, std::move(hashValues), sequenceLengths_, - folly::F14FastMap(), + SimpleVectorStats{}, /*stats*/ std::nullopt /*distinctCount*/, 0 /* nullCount */, false /* sorted */, diff --git a/velox/vector/SequenceVector.h b/velox/vector/SequenceVector.h index 02bc2ba37868..83f60f031e9d 100644 --- a/velox/vector/SequenceVector.h +++ b/velox/vector/SequenceVector.h @@ -44,8 +44,7 @@ class SequenceVector : public SimpleVector { size_t length, std::shared_ptr sequenceValues, BufferPtr sequenceLengths, - const folly::F14FastMap& metaData = - cdvi::EMPTY_METADATA, + const SimpleVectorStats& stats = {}, std::optional distinctCount = std::nullopt, std::optional nullCount = std::nullopt, std::optional sorted = std::nullopt, diff --git a/velox/vector/SimpleVector.cpp b/velox/vector/SimpleVector.cpp deleted file mode 100644 index a037e3e1d7c9..000000000000 --- a/velox/vector/SimpleVector.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "velox/vector/SimpleVector.h" -#include "velox/common/base/Exceptions.h" - -namespace facebook { -namespace velox { - -template <> -void SimpleVector::setMinMax( - const folly::F14FastMap& metaData) { - auto it = metaData.find(META_MIN); - if (it != metaData.end()) { - minString_ = it->second; - min_ = StringView(minString_); - } - it = metaData.find(META_MAX); - if (it != metaData.end()) { - maxString_ = it->second; - max_ = StringView(maxString_); - } -} - -} // namespace velox -} // namespace facebook diff --git a/velox/vector/SimpleVector.h b/velox/vector/SimpleVector.h index 6512a67b5c08..a0abeb926283 100644 --- a/velox/vector/SimpleVector.h +++ b/velox/vector/SimpleVector.h @@ -38,33 +38,11 @@ namespace exec { class EvalCtx; } -/** - * Adds the given value to the metaData in a proper encoding. Will overwrite - * any existing value in the map. - */ -template -void encodeMetaData( - folly::F14FastMap& metaData, - const std::string& key, - V value) { - metaData.insert_or_assign(key, velox::to(value)); -} - -template <> -inline void encodeMetaData( - folly::F14FastMap& metaData, - const std::string& key, - StringView value) { - metaData.insert_or_assign(key, std::string(value.data(), value.size())); -} - -template <> -inline void encodeMetaData( - folly::F14FastMap& /*metaData*/, - const std::string& /*key*/, - std::shared_ptr /*value*/) { - VELOX_NYI(); -} +template +struct SimpleVectorStats { + std::optional min; + std::optional max; +}; // This class abstracts over various Columnar Storage Formats such that Velox // can select the most appropriate one on a per field / per block basis. @@ -78,15 +56,12 @@ inline void encodeMetaData( template class SimpleVector : public BaseVector { public: - constexpr static auto META_MIN = folly::makeFixedString("CTV1"); - constexpr static auto META_MAX = folly::makeFixedString("CTV2"); - SimpleVector( velox::memory::MemoryPool* pool, std::shared_ptr type, BufferPtr nulls, size_t length, - const folly::F14FastMap& metaData, + const SimpleVectorStats& stats, std::optional distinctValueCount, std::optional nullCount, std::optional isSorted, @@ -102,16 +77,15 @@ class SimpleVector : public BaseVector { representedByteCount, storageByteCount), isSorted_(isSorted), - elementSize_(sizeof(T)) { - setMinMax(metaData); - } + elementSize_(sizeof(T)), + stats_(stats) {} // Constructs SimpleVector inferring the type from T. SimpleVector( velox::memory::MemoryPool* pool, BufferPtr nulls, size_t length, - const folly::F14FastMap& metaData, + const SimpleVectorStats& stats, std::optional distinctValueCount, std::optional nullCount, std::optional isSorted, @@ -122,7 +96,7 @@ class SimpleVector : public BaseVector { CppToType::create(), std::move(nulls), length, - metaData, + stats, distinctValueCount, nullCount, isSorted, @@ -131,15 +105,8 @@ class SimpleVector : public BaseVector { virtual ~SimpleVector() override {} - folly::F14FastMap genMetaData() const { - folly::F14FastMap metaData; - if (min_.hasValue()) { - encodeMetaData(metaData, META_MIN, min_.value()); - } - if (max_.hasValue()) { - encodeMetaData(metaData, META_MAX, max_.value()); - } - return metaData; + SimpleVectorStats getStats() const { + return stats_; } // Concrete Vector types need to implement this themselves. @@ -182,11 +149,11 @@ class SimpleVector : public BaseVector { } const std::optional& getMin() const { - return min_; + return stats_.min; } const std::optional& getMax() const { - return max_; + return stats_.max; } void resize(vector_size_t size, bool setNotNull = true) override { @@ -353,18 +320,6 @@ class SimpleVector : public BaseVector { sizeof(T)); } - std::optional min_; - std::optional max_; - // Holds the data for StringView min/max. - std::string minString_; - std::string maxString_; - - private: - void setMinMax(const folly::F14FastMap& metaData) { - min_ = getMetaDataValue(metaData, META_MIN); - max_ = getMetaDataValue(metaData, META_MAX); - } - protected: int comparePrimitiveAsc(const T& left, const T& right) const { if constexpr (std::is_floating_point::value) { @@ -393,19 +348,9 @@ class SimpleVector : public BaseVector { // If T is StringView, store set of rows // where we have computed asciiness. A set bit means the row was processed. SelectivityVector asciiSetRows_; -}; // namespace velox -template <> -void SimpleVector::setMinMax( - const folly::F14FastMap& metaData); - -template <> -inline void SimpleVector::setMinMax( - const folly::F14FastMap& /*metaData*/) {} - -template <> -inline void SimpleVector>::setMinMax( - const folly::F14FastMap& /*metaData*/) {} + SimpleVectorStats stats_; +}; // namespace velox template <> inline std::optional SimpleVector::compare( diff --git a/velox/vector/arrow/Bridge.cpp b/velox/vector/arrow/Bridge.cpp index f47d2948b2fc..3d9db20693e9 100644 --- a/velox/vector/arrow/Bridge.cpp +++ b/velox/vector/arrow/Bridge.cpp @@ -595,7 +595,7 @@ VectorPtr createFlatVector( length, values, std::vector(), - cdvi::EMPTY_METADATA, + SimpleVectorStats{}, std::nullopt, nullCount == -1 ? std::nullopt : std::optional(nullCount)); } @@ -636,7 +636,7 @@ VectorPtr createStringFlatVector( length, stringViews, std::move(stringViewBuffers), - cdvi::EMPTY_METADATA, + SimpleVectorStats{}, std::nullopt, nullCount == -1 ? std::nullopt : std::optional(nullCount)); } diff --git a/velox/vector/tests/VectorMaker-inl.h b/velox/vector/tests/VectorMaker-inl.h index 892db0530dab..216379152a86 100644 --- a/velox/vector/tests/VectorMaker-inl.h +++ b/velox/vector/tests/VectorMaker-inl.h @@ -15,23 +15,12 @@ */ #include + +#include "velox/vector/SimpleVector.h" #include "velox/vector/tests/VectorMakerStats.h" namespace facebook::velox::test { -template -folly::F14FastMap getMetadata( - const VectorMakerStats& stats) { - folly::F14FastMap metadata; - if (stats.min.has_value()) { - encodeMetaData(metadata, SimpleVector::META_MIN, stats.min.value()); - } - if (stats.max.has_value()) { - encodeMetaData(metadata, SimpleVector::META_MAX, stats.max.value()); - } - return metadata; -} - template BufferPtr buildBiasedBuffer( const std::vector>& values, @@ -75,9 +64,6 @@ BiasVectorPtr> VectorMaker::biasVector( // Check BiasVector.h for explanation of this calculation. TEvalType bias = min + static_cast(std::ceil(delta / 2.0)); - auto metadata = getMetadata(stats); - encodeMetaData(metadata, BiasVector::BIAS_VALUE, bias); - BufferPtr buffer; TypeKind valueType; @@ -98,7 +84,8 @@ BiasVectorPtr> VectorMaker::biasVector( data.size(), valueType, buffer, - metadata, + bias, + stats.asSimpleVectorStats(), stats.distinctCount(), stats.nullCount, stats.isSorted); @@ -152,7 +139,7 @@ SequenceVectorPtr> VectorMaker::sequenceVector( data.size(), flatVectorNullable(sequenceVals), copyToBuffer(sequenceLengths, pool_), - getMetadata(stats), + stats.asSimpleVectorStats(), stats.distinctCount(), stats.nullCount, stats.isSorted); @@ -177,7 +164,7 @@ ConstantVectorPtr> VectorMaker::constantVector( data.size(), nullCount > 0, (nullCount > 0) ? TEvalType() : folly::copy(*data.front()), - getMetadata(stats)); + stats.asSimpleVectorStats()); } template @@ -216,7 +203,7 @@ DictionaryVectorPtr> VectorMaker::dictionaryVector( std::move(values), TypeKind::INTEGER, std::move(indices), - getMetadata(stats), + stats.asSimpleVectorStats(), indexMap.size(), nullCount, stats.isSorted); @@ -244,7 +231,7 @@ FlatVectorPtr> VectorMaker::flatVectorNullable( data.size(), std::move(dataBuffer), std::vector(), - getMetadata(stats), + stats.asSimpleVectorStats(), stats.distinctCount(), stats.nullCount, stats.isSorted); @@ -275,7 +262,7 @@ FlatVectorPtr> VectorMaker::flatVector( data.size(), std::move(dataBuffer), std::vector(), - getMetadata(stats), + stats.asSimpleVectorStats(), stats.distinctCount(), stats.nullCount, stats.isSorted); diff --git a/velox/vector/tests/VectorMakerStats.h b/velox/vector/tests/VectorMakerStats.h index 967f690ae0ae..7815704e2344 100644 --- a/velox/vector/tests/VectorMakerStats.h +++ b/velox/vector/tests/VectorMakerStats.h @@ -19,6 +19,8 @@ #include #include +#include "velox/vector/SimpleVector.h" + namespace facebook::velox::test { // Struct that caries metadata about a vector of nullable elements. @@ -33,6 +35,10 @@ class VectorMakerStats { return distinctSet_.size(); } + SimpleVectorStats asSimpleVectorStats() { + return {min, max}; + } + std::optional min; std::optional max; size_t nullCount{0}; @@ -44,16 +50,17 @@ class VectorMakerStats { // Generates VectorMakerStats for a given vector of nullable elements. template -VectorMakerStats genVectorMakerStats( +VectorMakerStats::NativeType> genVectorMakerStats( const std::vector>& data) { - VectorMakerStats result; + using NativeType = typename CppToType::NativeType; + VectorMakerStats result; // Count distinct and null elements. for (const auto& val : data) { if (val == std::nullopt) { ++result.nullCount; } else { - result.addElement(*val); + result.addElement(static_cast(*val)); } } @@ -63,10 +70,13 @@ VectorMakerStats genVectorMakerStats( // Calculate min and max (skip null elements). for (const auto& val : data) { if (val != std::nullopt) { - result.min = - (result.min == std::nullopt) ? val : std::min(*result.min, *val); - result.max = - (result.max == std::nullopt) ? val : std::max(*result.max, *val); + auto nativeVal = static_cast(*val); + result.min = (result.min == std::nullopt) + ? nativeVal + : std::min(*result.min, nativeVal); + result.max = (result.max == std::nullopt) + ? nativeVal + : std::max(*result.max, nativeVal); } } return result; @@ -74,19 +84,21 @@ VectorMakerStats genVectorMakerStats( // Generates VectorMakerStats for a given vector of non-nullable elements. template -VectorMakerStats genVectorMakerStats(const std::vector& data) { - VectorMakerStats result; +VectorMakerStats::NativeType> genVectorMakerStats( + const std::vector& data) { + using NativeType = typename CppToType::NativeType; + VectorMakerStats result; for (const auto& val : data) { - result.addElement(val); + result.addElement(static_cast(val)); } result.isSorted = std::is_sorted(data.begin(), data.end()); const auto& [min, max] = std::minmax_element(data.begin(), data.end()); if (min != data.end()) { - result.min = *min; + result.min = static_cast(*min); } if (max != data.end()) { - result.max = *max; + result.max = static_cast(*max); } return result; } diff --git a/velox/vector/tests/VectorTest.cpp b/velox/vector/tests/VectorTest.cpp index 82779c730384..b62764048adb 100644 --- a/velox/vector/tests/VectorTest.cpp +++ b/velox/vector/tests/VectorTest.cpp @@ -25,6 +25,7 @@ #include "velox/vector/DecodedVector.h" #include "velox/vector/FlatVector.h" #include "velox/vector/LazyVector.h" +#include "velox/vector/SimpleVector.h" #include "velox/vector/TypeAliases.h" #include "velox/vector/VectorTypeUtils.h" #include "velox/vector/tests/VectorMaker.h" @@ -148,15 +149,14 @@ class VectorTest : public testing::Test { rawValues[i] = testValue(i, buffer) - kBias; } } - folly::F14FastMap metadata; - metadata[BiasVector::BIAS_VALUE] = folly::to(kBias); return std::make_shared>( pool_.get(), nulls, size, BiasKind, std::move(values), - std::move(metadata), + kBias, + SimpleVectorStats{}, std::nullopt, numNulls, false, diff --git a/velox/vector/tests/VectorTestBase.h b/velox/vector/tests/VectorTestBase.h index ac347a4aee2c..7e3babc2a2c4 100644 --- a/velox/vector/tests/VectorTestBase.h +++ b/velox/vector/tests/VectorTestBase.h @@ -468,7 +468,7 @@ class VectorTestBase { /*isNull=*/!value.has_value(), CppToType::create(), value ? EvalType(*value) : EvalType(), - cdvi::EMPTY_METADATA, + SimpleVectorStats>{}, sizeof(EvalType)); } From 88e2f8b8d18691cfc5fafec460a21000231e72e3 Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Thu, 14 Apr 2022 17:29:55 -0700 Subject: [PATCH 015/198] Add ExprSetListener (#1428) Summary: Add a mechanism to register a listener to receive runtime statistics about expression evaluation at destruction of ExprSet. This is useful for logging and debugging. An earlier PR https://github.com/facebookincubator/velox/issues/1404 introduced a similar mechanism to receive runtime statistics about Task execution. Pull Request resolved: https://github.com/facebookincubator/velox/pull/1428 Reviewed By: pedroerp Differential Revision: D35656774 Pulled By: mbasmanova fbshipit-source-id: 6f18a75d622b44800ec6d8a28984d58cd592ac3f --- velox/expression/Expr.cpp | 77 +++++++++++++++++++++++- velox/expression/Expr.h | 41 ++++++++++++- velox/expression/tests/ExprStatsTest.cpp | 76 +++++++++++++++++++++++ 3 files changed, 192 insertions(+), 2 deletions(-) diff --git a/velox/expression/Expr.cpp b/velox/expression/Expr.cpp index ea15a22f712e..668e1b88f27a 100644 --- a/velox/expression/Expr.cpp +++ b/velox/expression/Expr.cpp @@ -13,11 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include +#include +#include -#include "velox/expression/Expr.h" #include "velox/common/base/SuccinctPrinter.h" #include "velox/core/Expressions.h" #include "velox/expression/ControlExpr.h" +#include "velox/expression/Expr.h" #include "velox/expression/ExprCompiler.h" #include "velox/expression/VarSetter.h" #include "velox/expression/VectorFunction.h" @@ -30,6 +33,43 @@ DEFINE_bool( namespace facebook::velox::exec { +namespace { +folly::Synchronized>>& +listeners() { + static folly::Synchronized>> + kListeners; + return kListeners; +} +} // namespace + +bool registerExprSetListener(std::shared_ptr listener) { + return listeners().withWLock([&](auto& listeners) { + for (const auto& existingListener : listeners) { + if (existingListener == listener) { + // Listener already registered. Do not register again. + return false; + } + } + listeners.push_back(std::move(listener)); + return true; + }); +} + +bool unregisterExprSetListener( + const std::shared_ptr& listener) { + return listeners().withWLock([&](auto& listeners) { + for (auto it = listeners.begin(); it != listeners.end(); ++it) { + if ((*it) == listener) { + listeners.erase(it); + return true; + } + } + + // Listener not found. + return false; + }); +} + namespace { bool isMember( @@ -1174,6 +1214,41 @@ ExprSet::ExprSet( std::move(sources), execCtx, this, enableConstantFolding); } +namespace { +void addStats( + const exec::Expr& expr, + std::unordered_map& stats) { + // Do not aggregate empty stats. + if (expr.stats().numProcessedRows) { + stats[expr.name()].add(expr.stats()); + } + + for (const auto& input : expr.inputs()) { + addStats(*input, stats); + } +} + +std::string makeUuid() { + return boost::lexical_cast(boost::uuids::random_generator()()); +} +} // namespace + +ExprSet::~ExprSet() { + listeners().withRLock([&](auto& listeners) { + if (!listeners.empty()) { + std::unordered_map stats; + for (const auto& expr : exprs()) { + addStats(*expr, stats); + } + + auto uuid = makeUuid(); + for (auto& listener : listeners) { + listener->onCompletion(uuid, {stats}); + } + } + }); +} + void ExprSet::eval( int32_t begin, int32_t end, diff --git a/velox/expression/Expr.h b/velox/expression/Expr.h index 65571a058c02..f636f436f3a6 100644 --- a/velox/expression/Expr.h +++ b/velox/expression/Expr.h @@ -35,6 +35,11 @@ class VectorFunction; struct ExprStats { CpuWallTiming timing; uint64_t numProcessedRows{0}; + + void add(const ExprStats& other) { + timing.add(other.timing); + numProcessedRows += other.numProcessedRows; + } }; // An executable expression. @@ -111,6 +116,10 @@ class Expr { return type_; } + const std::string& name() const { + return name_; + } + bool isString() const { return type()->kind() == TypeKind::VARCHAR; } @@ -348,7 +357,7 @@ class ExprSet { core::ExecCtx* execCtx, bool enableConstantFolding = true); - virtual ~ExprSet() {} + virtual ~ExprSet(); // Initialize and evaluate all expressions available in this ExprSet. void eval( @@ -444,4 +453,34 @@ std::unique_ptr makeExprSetFromFlag( /// zeros. std::string printExprWithStats(const ExprSet& exprSet); +struct ExprSetCompletionEvent { + /// Aggregated runtime stats keyed on expression name (e.g. built-in + /// expression like and, or, switch or a function name). + std::unordered_map stats; +}; + +/// Listener invoked on ExprSet destruction. +class ExprSetListener { + public: + virtual ~ExprSetListener() = default; + + /// Called on ExprSet destruction. Provides runtime statistics about + /// expression evaluation. + /// @param uuid Universally unique identifier of the set of expressions. + /// @param event Runtime stats. + virtual void onCompletion( + const std::string& uuid, + const ExprSetCompletionEvent& event) = 0; +}; + +/// Register a listener to be invoked on ExprSet destruction. Returns true if +/// listener was successfully registered, false if listener is already +/// registered. +bool registerExprSetListener(std::shared_ptr listener); + +/// Unregister a listener registered earlier. Returns true if listener was +/// unregistered successfully, false if listener was not found. +bool unregisterExprSetListener( + const std::shared_ptr& listener); + } // namespace facebook::velox::exec diff --git a/velox/expression/tests/ExprStatsTest.cpp b/velox/expression/tests/ExprStatsTest.cpp index eb6e6971a3af..7920dbaf6b9a 100644 --- a/velox/expression/tests/ExprStatsTest.cpp +++ b/velox/expression/tests/ExprStatsTest.cpp @@ -162,3 +162,79 @@ TEST_F(ExprStatsTest, printWithStats) { " 0:BIGINT .cpu time: 0ns, rows: 0. -> BIGINT\n")); } } + +struct Event { + std::string uuid; + std::unordered_map stats; +}; + +class TestListener : public exec::ExprSetListener { + public: + explicit TestListener(std::vector& events) : events_{events} {} + + void onCompletion( + const std::string& uuid, + const exec::ExprSetCompletionEvent& event) override { + events_.push_back({uuid, event.stats}); + } + + private: + std::vector& events_; +}; + +TEST_F(ExprStatsTest, listener) { + vector_size_t size = 1'024; + + // Register a listener to receive stats on ExprSet destruction. + std::vector events; + auto listener = std::make_shared(events); + ASSERT_TRUE(exec::registerExprSetListener(listener)); + ASSERT_FALSE(exec::registerExprSetListener(listener)); + + auto data = makeRowVector({ + makeFlatVector(size, [](auto row) { return row; }), + makeFlatVector(size, [](auto row) { return row % 7; }), + }); + + // Evaluate a couple of expressions and sanity check the stats received by the + // listener. + auto rowType = asRowType(data->type()); + { + auto exprSet = + compileExpressions({"(c0 + 3) * c1", "(c0 + c1) % 2 = 0"}, rowType); + evaluate(*exprSet, data); + } + ASSERT_EQ(1, events.size()); + auto stats = events.back().stats; + ASSERT_EQ(1024 * 2, stats.at("plus").numProcessedRows); + ASSERT_EQ(1024, stats.at("multiply").numProcessedRows); + ASSERT_EQ(1024, stats.at("mod").numProcessedRows); + + // Evaluate the same expressions twice and verify that stats received by the + // listener are "doubled". + { + auto exprSet = + compileExpressions({"(c0 + 3) * c1", "(c0 + c1) % 2 = 0"}, rowType); + evaluate(*exprSet, data); + evaluate(*exprSet, data); + } + ASSERT_EQ(2, events.size()); + stats = events.back().stats; + ASSERT_EQ(1024 * 2 * 2, stats.at("plus").numProcessedRows); + ASSERT_EQ(1024 * 2, stats.at("multiply").numProcessedRows); + ASSERT_EQ(1024 * 2, stats.at("mod").numProcessedRows); + + ASSERT_NE(events[0].uuid, events[1].uuid); + + // Unregister the listener, evaluate expressions again and verify the listener + // wasn't invoked. + ASSERT_TRUE(exec::unregisterExprSetListener(listener)); + ASSERT_FALSE(exec::unregisterExprSetListener(listener)); + + { + auto exprSet = + compileExpressions({"(c0 + 3) * c1", "(c0 + c1) % 2 = 0"}, rowType); + evaluate(*exprSet, data); + } + ASSERT_EQ(2, events.size()); +} From 20a50db363f692dde2ae79f24f69bcd609be65c0 Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Thu, 14 Apr 2022 18:09:35 -0700 Subject: [PATCH 016/198] Change functions in Velox directory to use the new writer interface. (#1415) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1415 This part is step one towards deprecating old writers. The following diff will replace ArrayWriterT and switch out_type meaning, the same for all other types. Reviewed By: mbasmanova Differential Revision: D35625851 fbshipit-source-id: 29b01bdbce75fa0f694d17495240981ad380b9c7 --- velox/examples/SimpleFunctions.cpp | 24 +++++----- .../tests/SimpleFunctionCallNullFreeTest.cpp | 18 ++++---- .../tests/SimpleFunctionInitTest.cpp | 8 ++-- velox/expression/tests/SimpleFunctionTest.cpp | 46 +++++++++++-------- .../benchmarks/ArrayWriterBenchmark.cpp | 16 ------- 5 files changed, 53 insertions(+), 59 deletions(-) diff --git a/velox/examples/SimpleFunctions.cpp b/velox/examples/SimpleFunctions.cpp index 58434645a2ae..be3962930436 100644 --- a/velox/examples/SimpleFunctions.cpp +++ b/velox/examples/SimpleFunctions.cpp @@ -288,7 +288,7 @@ struct MySimpleSplitFunction { const char splitChar{' '}; FOLLY_ALWAYS_INLINE bool call( - out_type>& out, + out_type>& out, const arg_type& input) { auto start = input.begin(); auto cur = start; @@ -296,7 +296,7 @@ struct MySimpleSplitFunction { // This code doesn't copy the string contents. do { cur = std::find(start, input.end(), splitChar); - out.append(out_type(StringView(start, cur - start))); + out.add_item().copy_from(StringView(start, cur - start)); start = cur + 1; } while (cur < input.end()); return true; @@ -307,7 +307,7 @@ void register6() { registerFunction( {"my_ascii_aware_func"}); - registerFunction, Varchar>( + registerFunction, Varchar>( {"my_simple_split_func"}); } @@ -405,11 +405,11 @@ struct MyComplexTimesTwoFunction { // are currently implemented based on std::vector. Vector elements are // currently wrapped by std::optional to represent their nullability. FOLLY_ALWAYS_INLINE bool call( - out_type>& result, + out_type>& result, const arg_type>& inputArray) { result.reserve(inputArray.size()); for (const auto& it : inputArray) { - result.append(it.has_value() ? it.value() * 2 : 0); + result.push_back(it.has_value() ? it.value() * 2 : 0); } return true; } @@ -417,7 +417,7 @@ struct MyComplexTimesTwoFunction { // This method takes and returns a Map. Map proxy objects are implemented // using std::unordered_map; values are wrapped by std::optional. FOLLY_ALWAYS_INLINE bool call( - out_type>& result, + out_type>& result, const arg_type>& inputMap) { result.reserve(inputMap.size()); for (const auto& it : inputMap) { @@ -430,7 +430,7 @@ struct MyComplexTimesTwoFunction { // Takes and returns a Row. Rows are backed by std::tuple; individual elements // are std::optional. FOLLY_ALWAYS_INLINE bool call( - out_type>& result, + out_type>& result, const arg_type>& inputRow) { const auto& elem0 = inputRow.template at<0>(); const auto& elem1 = inputRow.template at<1>(); @@ -456,15 +456,17 @@ struct MyComplexTimesTwoFunction { }; void register8() { - registerFunction, Array>( - {"my_array_func"}); registerFunction< MyComplexTimesTwoFunction, - Map, + ArrayWriterT, + Array>({"my_array_func"}); + registerFunction< + MyComplexTimesTwoFunction, + MapWriterT, Map>({"my_map_func"}); registerFunction< MyComplexTimesTwoFunction, - Row, + RowWriterT, Row>({"my_row_func"}); registerFunction< MyComplexTimesTwoFunction, diff --git a/velox/expression/tests/SimpleFunctionCallNullFreeTest.cpp b/velox/expression/tests/SimpleFunctionCallNullFreeTest.cpp index 7e8132a91050..880938e9fef4 100644 --- a/velox/expression/tests/SimpleFunctionCallNullFreeTest.cpp +++ b/velox/expression/tests/SimpleFunctionCallNullFreeTest.cpp @@ -71,14 +71,14 @@ struct NonDefaultBehaviorFunction { VELOX_DEFINE_FUNCTION_TYPES(T); bool callNullable( - out_type>& out, + out_type>& out, const arg_type>* input) { - out.append(kCallNullable); + out.push_back(kCallNullable); if (input) { for (auto i : *input) { if (i.has_value()) { - out.append(i.value()); + out.push_back(*i); } } } @@ -87,12 +87,12 @@ struct NonDefaultBehaviorFunction { } bool callNullFree( - out_type>& out, + out_type>& out, const null_free_arg_type>& input) { - out.append(kCallNullFree); + out.push_back(kCallNullFree); for (auto i : input) { - out.append(i); + out.push_back(i); } return true; @@ -100,8 +100,10 @@ struct NonDefaultBehaviorFunction { }; TEST_F(SimpleFunctionCallNullFreeTest, nonDefaultBehavior) { - registerFunction, Array>( - {"non_default_behavior"}); + registerFunction< + NonDefaultBehaviorFunction, + ArrayWriterT, + Array>({"non_default_behavior"}); // Make a vector with a NULL. auto arrayVectorWithNull = makeVectorWithNullArrays( diff --git a/velox/expression/tests/SimpleFunctionInitTest.cpp b/velox/expression/tests/SimpleFunctionInitTest.cpp index 093cabf0c990..7a6840558c10 100644 --- a/velox/expression/tests/SimpleFunctionInitTest.cpp +++ b/velox/expression/tests/SimpleFunctionInitTest.cpp @@ -55,7 +55,7 @@ struct NonDefaultWithArrayInitFunction { } bool callNullable( - out_type>& out, + out_type>& out, const arg_type* first, const arg_type>* /*second*/) { if (!first) { @@ -64,10 +64,10 @@ struct NonDefaultWithArrayInitFunction { if (!elements_.empty()) { for (auto i : elements_) { - out.append(i + *first); + out.push_back(i + *first); } } else { - out.append(*first); + out.push_back(*first); } return true; @@ -83,7 +83,7 @@ struct NonDefaultWithArrayInitFunction { TEST_F(SimpleFunctionInitTest, initializationArray) { registerFunction< NonDefaultWithArrayInitFunction, - Array, + ArrayWriterT, int32_t, Array>({"non_default_behavior_with_init"}); diff --git a/velox/expression/tests/SimpleFunctionTest.cpp b/velox/expression/tests/SimpleFunctionTest.cpp index e3f26e48dbb0..64b870836da9 100644 --- a/velox/expression/tests/SimpleFunctionTest.cpp +++ b/velox/expression/tests/SimpleFunctionTest.cpp @@ -94,18 +94,18 @@ struct ArrayWriterFunction { VELOX_DEFINE_FUNCTION_TYPES(T); FOLLY_ALWAYS_INLINE void call( - out_type>& out, + out_type>& out, const arg_type& input) { const size_t size = arrayData[input].size(); out.reserve(size); for (const auto i : arrayData[input]) { - out.append(i); + out.push_back(i); } } }; TEST_F(SimpleFunctionTest, arrayWriter) { - registerFunction, int64_t>( + registerFunction, int64_t>( {"array_writer_func"}, ARRAY(BIGINT())); const size_t rows = arrayData.size(); @@ -131,19 +131,21 @@ struct ArrayOfStringsWriterFunction { VELOX_DEFINE_FUNCTION_TYPES(T); FOLLY_ALWAYS_INLINE void call( - out_type>& out, + out_type>& out, const arg_type& input) { const size_t size = stringArrayData[input].size(); out.reserve(size); for (const auto value : stringArrayData[input]) { - out.append(out_type(StringView(value))); + out.add_item().copy_from(value); } } }; TEST_F(SimpleFunctionTest, arrayOfStringsWriter) { - registerFunction, int64_t>( - {"array_of_strings_writer_func"}, ARRAY(VARCHAR())); + registerFunction< + ArrayOfStringsWriterFunction, + ArrayWriterT, + int64_t>({"array_of_strings_writer_func"}, ARRAY(VARCHAR())); const size_t rows = stringArrayData.size(); auto flatVector = makeFlatVector(rows, [](auto row) { return row; }); @@ -240,7 +242,7 @@ struct RowWriterFunction { VELOX_DEFINE_FUNCTION_TYPES(T); FOLLY_ALWAYS_INLINE bool call( - out_type>& out, + out_type>& out, const arg_type& input) { out = std::make_tuple(rowVectorCol1[input], rowVectorCol2[input]); return true; @@ -248,7 +250,7 @@ struct RowWriterFunction { }; TEST_F(SimpleFunctionTest, rowWriter) { - registerFunction, int64_t>( + registerFunction, int64_t>( {"row_writer_func"}, ROW({BIGINT(), DOUBLE()})); const size_t rows = rowVectorCol1.size(); @@ -336,13 +338,13 @@ struct ArrayRowWriterFunction { VELOX_DEFINE_FUNCTION_TYPES(T); FOLLY_ALWAYS_INLINE bool call( - out_type>>& out, + out_type>>& out, const arg_type& input) { // Appends each row three times. auto tuple = std::make_tuple(rowVectorCol1[input], rowVectorCol2[input]); - out.append(std::optional(tuple)); - out.append(std::optional(tuple)); - out.append(std::optional(tuple)); + out.add_item() = tuple; + out.add_item() = tuple; + out.add_item() = tuple; return true; } }; @@ -350,7 +352,7 @@ struct ArrayRowWriterFunction { TEST_F(SimpleFunctionTest, arrayRowWriter) { registerFunction< ArrayRowWriterFunction, - Array>, + ArrayWriterT>, int32_t>({"array_row_writer_func"}, ARRAY(ROW({BIGINT(), DOUBLE()}))); const size_t rows = rowVectorCol1.size(); @@ -731,20 +733,22 @@ struct MyArrayStringReuseFunction { static constexpr int32_t reuse_strings_from_arg = 0; - void call(out_type>& out, const arg_type& input) { + void call( + out_type>& out, + const arg_type& input) { auto start = input.begin(); auto cur = start; do { cur = std::find(start, input.end(), ' '); - out.append(std::optional{StringView(start, cur - start)}); + out.add_item().copy_from(StringView(start, cur - start)); start = cur + 1; } while (cur < input.end()); } }; TEST_F(SimpleFunctionTest, arrayStringReuse) { - registerFunction, Varchar>( + registerFunction, Varchar>( {"my_array_string_reuse_func"}); std::vector inputData = { @@ -772,15 +776,17 @@ template struct MapStringOut { VELOX_DEFINE_FUNCTION_TYPES(T); - void call(out_type>& out, int64_t n) { + void call(out_type>& out, int64_t n) { auto string = std::to_string(n); - out.emplace(StringView(string), std::optional{StringView(string)}); + auto [key, value] = out.add_item(); + key.copy_from(string); + value.copy_from(string); } }; // Output map with string. TEST_F(SimpleFunctionTest, mapStringOut) { - registerFunction, int64_t>( + registerFunction, int64_t>( {"func_map_string_out"}); auto input = vectorMaker_.flatVector({1, 2, 3, 4}); diff --git a/velox/functions/prestosql/benchmarks/ArrayWriterBenchmark.cpp b/velox/functions/prestosql/benchmarks/ArrayWriterBenchmark.cpp index 75e30de3a243..c3dd6ee144c2 100644 --- a/velox/functions/prestosql/benchmarks/ArrayWriterBenchmark.cpp +++ b/velox/functions/prestosql/benchmarks/ArrayWriterBenchmark.cpp @@ -126,21 +126,6 @@ struct SimpleGeneralInterface { } }; -template -struct SimpleOld { - template - bool call(TOut& out, const int64_t& n) { - for (int i = 0; i < n; i++) { - if (WITH_NULLS && i % 5) { - out.append(std::nullopt); - } else { - out.append(i); - } - } - return true; - } -}; - class ArrayWriterBenchmark : public functions::test::FunctionBenchmarkBase { public: ArrayWriterBenchmark() : FunctionBenchmarkBase() { @@ -150,7 +135,6 @@ class ArrayWriterBenchmark : public functions::test::FunctionBenchmarkBase { {"simple_push_back"}); registerFunction, int64_t>( {"simple_general"}); - registerFunction, int64_t>({"simple_old"}); facebook::velox::exec::registerVectorFunction( "vector_resize_optimized", From 7c1b76de39b525fde932747096ccaade841ec6a1 Mon Sep 17 00:00:00 2001 From: Andrii Vasylevskyi Date: Fri, 15 Apr 2022 10:00:08 -0700 Subject: [PATCH 017/198] Upgrade getdeps libbpf to 0.7.0 Summary: Upgrade libbpf to latest version in getdeps. Reviewed By: sharmafb Differential Revision: D35595843 fbshipit-source-id: 0baa7ad9ee031214990d25034bc71c18e69c564f --- build/fbcode_builder/manifests/libbpf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build/fbcode_builder/manifests/libbpf b/build/fbcode_builder/manifests/libbpf index 0416822e4346..9ab3a26bee06 100644 --- a/build/fbcode_builder/manifests/libbpf +++ b/build/fbcode_builder/manifests/libbpf @@ -2,8 +2,8 @@ name = libbpf [download] -url = https://github.com/libbpf/libbpf/archive/v0.3.tar.gz -sha256 = c168d84a75b541f753ceb49015d9eb886e3fb5cca87cdd9aabce7e10ad3a1efc +url = https://github.com/libbpf/libbpf/archive/refs/tags/v0.7.0.tar.gz +sha256 = 5083588ce5a3a620e395ee1e596af77b4ec5771ffc71cff2af49dfee38c06361 # BPF only builds on linux, so make it a NOP on other platforms [build.not(os=linux)] @@ -11,7 +11,7 @@ builder = nop [build.os=linux] builder = make -subdir = libbpf-0.3/src +subdir = libbpf-0.7.0/src [make.build_args] BUILD_STATIC_ONLY=y From 36f6228215b1b9004f1af403d2f5125f9a8e2929 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Fri, 15 Apr 2022 12:17:05 -0700 Subject: [PATCH 018/198] Fix nullptr math in HashTable.cpp Reviewed By: YiqiuLiu Differential Revision: D35679075 fbshipit-source-id: c778564a4a3d73b031222907119913481a4f57a1 --- velox/exec/HashTable.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/velox/exec/HashTable.cpp b/velox/exec/HashTable.cpp index 4f92cdc7b039..54e6406c1a05 100644 --- a/velox/exec/HashTable.cpp +++ b/velox/exec/HashTable.cpp @@ -1161,7 +1161,9 @@ int32_t HashTable::listJoinResults( char* next = nullptr; if (nextOffset_) { next = nextRow(iter.nextHit); - __builtin_prefetch(reinterpret_cast(next) + nextOffset_); + if (next) { + __builtin_prefetch(reinterpret_cast(next) + nextOffset_); + } } inputRows[numOut] = (*iter.rows)[iter.lastRowIndex]; // NOLINT hits[numOut] = iter.nextHit; From 7033a567f33d58b4599abc95f9dd9d7c9bf0ba6d Mon Sep 17 00:00:00 2001 From: svcscm svcscm Date: Fri, 15 Apr 2022 12:27:21 -0700 Subject: [PATCH 019/198] Updating submodules Summary: GitHub commits: https://github.com/facebook/cachelib/commit/c6c70ea541a5019357afb40451a539e63fc52ec6 https://github.com/facebook/fb303/commit/de2c500803ef419764ea1754cda5b9377cd21f84 https://github.com/facebook/fbthrift/commit/0080df8784036f2abfd506ff2978a9b46d3f643e https://github.com/facebook/fbzmq/commit/0971dd1f9ca796fffbd0cfd5838a30c42c059ee8 https://github.com/facebook/folly/commit/f736a16afe59f00191d53e2577304a8706b6bc45 https://github.com/facebook/litho/commit/329a991d6bf3b26f59ce7f125a71bdfb405479f7 https://github.com/facebook/proxygen/commit/bfd523711f4b4ab670ef6de66f71380cf0f012ce https://github.com/facebook/rocksdb/commit/082eb04200c9b86b02f5a0979b77ddbf45499c73 https://github.com/facebook/wangle/commit/447667d620a060b5dd16719fdddd2442dceb9552 https://github.com/facebook/watchman/commit/959c55b75573224e549cb55c14d6c1b6d6efa309 https://github.com/facebookexperimental/rust-shed/commit/2a834d270ffc98d499380da7985c8f6363b1d379 https://github.com/facebookincubator/fizz/commit/95b148433b06c61aa4c331af8786f1dad5a8de08 https://github.com/facebookincubator/katran/commit/86bb284354a84c4e08455d9bccaa61f7a2b04912 https://github.com/facebookincubator/mvfst/commit/8b3b19bd29a2badb556a6e86b6137fc126d503e5 https://github.com/facebookincubator/velox/commit/7c1b76de39b525fde932747096ccaade841ec6a1 https://github.com/facebookresearch/multimodal/commit/9a85c5be78457c46685c2fb0bda61d6f88f268d8 https://github.com/facebookresearch/recipes/commit/3e69e431b3a9097dbd70020082af63122223f8fc https://github.com/pytorch/fbgemm/commit/9416c3729acf2e2baa611ad87b67dce1e3b596fc https://github.com/pytorch/kineto/commit/955fa148c74b551647bce97e1aa6270bd7abb7d1 Reviewed By: wittgenst fbshipit-source-id: e06e910261e499c203454fae03a0e3251101989c --- build/deps/github_hashes/facebook/folly-rev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index 77e4eb55fb80..1b08871156c4 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit c12758ea770997bf10424070e89d9c9d6f00ae78 +Subproject commit f736a16afe59f00191d53e2577304a8706b6bc45 From dd4b8133b904e1e5e917ebcc725b600d3665852f Mon Sep 17 00:00:00 2001 From: Sergey Pershin Date: Fri, 15 Apr 2022 14:10:58 -0700 Subject: [PATCH 020/198] Fix one case of fulfilling promises under a lock. (#1432) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1432 Fulfilling promises under a lock runs a risk of dealocking in case when the promise callback also uses some locks and is run in the same executor. The common practice is to move out promises from class members into the local variables unders the lock and then fullfill promises, using local variables, outside of the lock. Reviewed By: tanjialiang Differential Revision: D35682204 fbshipit-source-id: 4e3c4519c5d07d75df1ae1b6028fa1f58ae11826 --- velox/exec/PartitionedOutputBufferManager.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/velox/exec/PartitionedOutputBufferManager.cpp b/velox/exec/PartitionedOutputBufferManager.cpp index 627fd4067b14..e9d1b9077183 100644 --- a/velox/exec/PartitionedOutputBufferManager.cpp +++ b/velox/exec/PartitionedOutputBufferManager.cpp @@ -419,9 +419,13 @@ void PartitionedOutputBuffer::getData( } void PartitionedOutputBuffer::terminate() { - std::lock_guard l(mutex_); - VELOX_CHECK(not task_->isRunning()); - for (auto& promise : promises_) { + std::vector outstandingPromises; + { + std::lock_guard l(mutex_); + VELOX_CHECK(not task_->isRunning()); + outstandingPromises.swap(promises_); + } + for (auto& promise : outstandingPromises) { promise.setValue(true); } } From 3ed155876e325aa09eee777eac9a37dad5c39ca5 Mon Sep 17 00:00:00 2001 From: svcscm svcscm Date: Fri, 15 Apr 2022 17:04:00 -0700 Subject: [PATCH 021/198] Updating submodules Summary: GitHub commits: https://github.com/facebook/folly/commit/851645ee2b36c7d9bbb4f0336288269f0db5ecaa Reviewed By: wittgenst fbshipit-source-id: d6a3288e301a9afab5b2a99a9c1d3e802136a289 --- build/deps/github_hashes/facebook/folly-rev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index 1b08871156c4..450f1fc947b7 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit f736a16afe59f00191d53e2577304a8706b6bc45 +Subproject commit 851645ee2b36c7d9bbb4f0336288269f0db5ecaa From 8706c5187161e905c97363b6159733ad2c20bcfc Mon Sep 17 00:00:00 2001 From: Xiang Xu Date: Fri, 15 Apr 2022 21:33:40 -0700 Subject: [PATCH 022/198] (Red diff) Cont. remove ZMQ dependency from CMakeList for OSS build Summary: Decom ZMQ from OSS build {gif:p0tv3fvs} Reviewed By: shih-hao-tseng Differential Revision: D35223763 fbshipit-source-id: 11e559c1a26ae0b7d651fc60ddcd9c2f9ff1dc4c --- build/fbcode_builder/manifests/openr | 2 -- 1 file changed, 2 deletions(-) diff --git a/build/fbcode_builder/manifests/openr b/build/fbcode_builder/manifests/openr index c5be2d9bccd0..8054c1b6e058 100644 --- a/build/fbcode_builder/manifests/openr +++ b/build/fbcode_builder/manifests/openr @@ -12,14 +12,12 @@ builder = cmake [build.not(os=linux)] # boost.fiber is required and that is not available on macos. -# libzmq doesn't currently build on windows. builder = nop [dependencies] boost fb303 fbthrift -fbzmq folly googletest re2 From 24b7f000bc274c2533fce159efc623ddc09cf690 Mon Sep 17 00:00:00 2001 From: svcscm svcscm Date: Fri, 15 Apr 2022 22:13:49 -0700 Subject: [PATCH 023/198] Updating submodules Summary: GitHub commits: https://github.com/facebook/cachelib/commit/c0ef64d9687d2db169dc9874e4dda5ecdbdcc58d https://github.com/facebook/fb303/commit/344c3b9abb9ff4c2148aef856b76e7ebf9ff0a0d https://github.com/facebook/fbthrift/commit/24b49dfb20518081f4ff5cc2e3afcd00512a567e https://github.com/facebook/fbzmq/commit/3d7b4b564900eb71541eb18138a1b41a94020877 https://github.com/facebook/folly/commit/2ebeecfb0778e1b594987ad7b276fdd9c7efeef6 https://github.com/facebook/proxygen/commit/e316b51fb34cdf78ad84325983f10a888e130426 https://github.com/facebook/wangle/commit/790abec1387bf3ce7dcc8f5cb45084d761ab99c1 https://github.com/facebook/watchman/commit/24f84946c02d66b7295e013a52e4f2a6a339efec https://github.com/facebookexperimental/rust-shed/commit/af62a23bc25f94a02d525e2d3459a97f11c161f7 https://github.com/facebookincubator/katran/commit/2272ba685d6a1415c6dbc3cd04b1e6f8c4d6724c https://github.com/facebookincubator/mvfst/commit/7cf4db27176057e0f084af68fe057c45eb80fb69 Reviewed By: wittgenst fbshipit-source-id: e7f8d30099da7c8b275f4660d4f979945d7eb053 --- build/deps/github_hashes/facebook/folly-rev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index 450f1fc947b7..a26f36d9ec57 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit 851645ee2b36c7d9bbb4f0336288269f0db5ecaa +Subproject commit 2ebeecfb0778e1b594987ad7b276fdd9c7efeef6 From e1521fc9e94c40961dfaebf38346a7eb7d2ffd2c Mon Sep 17 00:00:00 2001 From: svcscm svcscm Date: Sat, 16 Apr 2022 10:44:53 -0700 Subject: [PATCH 024/198] Updating submodules Summary: GitHub commits: https://github.com/facebook/folly/commit/39e5f2f72bdef7f87a83958298087ecc1e6d48f9 Reviewed By: wittgenst fbshipit-source-id: 7a698a77f9e794f7e6b4020cddb048ccc1dd90e5 --- build/deps/github_hashes/facebook/folly-rev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index a26f36d9ec57..30df6af7441f 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit 2ebeecfb0778e1b594987ad7b276fdd9c7efeef6 +Subproject commit 39e5f2f72bdef7f87a83958298087ecc1e6d48f9 From 35b754c8775e8dd6b569c35723e7ed82c7d8cc39 Mon Sep 17 00:00:00 2001 From: svcscm svcscm Date: Mon, 18 Apr 2022 01:26:25 -0700 Subject: [PATCH 025/198] Updating submodules Summary: GitHub commits: https://github.com/facebook/folly/commit/0534212d7d8708888b10dcb6cb55d9b859fb4c51 Reviewed By: wittgenst fbshipit-source-id: 916b998b99413f61fa67ecd72144dae6eec31d45 --- build/deps/github_hashes/facebook/folly-rev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index 30df6af7441f..06330d5236c4 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit 39e5f2f72bdef7f87a83958298087ecc1e6d48f9 +Subproject commit 0534212d7d8708888b10dcb6cb55d9b859fb4c51 From ef351d6373c5ca6a7a9647fb16c5f2fe7c0c5688 Mon Sep 17 00:00:00 2001 From: svcscm svcscm Date: Mon, 18 Apr 2022 04:53:05 -0700 Subject: [PATCH 026/198] Updating submodules Summary: GitHub commits: https://github.com/facebook/folly/commit/54b24d042e46b1a2749720d3c36680af71feefe4 https://github.com/facebookincubator/mvfst/commit/744ae1f78e1d2a62a93db93218834080913f9637 Reviewed By: wittgenst fbshipit-source-id: 1e5204362eff2f019c14b2d21b230f3b431c7ee2 --- build/deps/github_hashes/facebook/folly-rev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index 06330d5236c4..dce45d5ff84f 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit 0534212d7d8708888b10dcb6cb55d9b859fb4c51 +Subproject commit 54b24d042e46b1a2749720d3c36680af71feefe4 From 6e1186836ed71bbc1526b9ae3df9e3dab1453d36 Mon Sep 17 00:00:00 2001 From: svcscm svcscm Date: Mon, 18 Apr 2022 10:38:10 -0700 Subject: [PATCH 027/198] Updating submodules Summary: GitHub commits: https://github.com/facebook/folly/commit/b7a60b2cea1726b9a688d7382edeadcea03976b1 Reviewed By: yns88 fbshipit-source-id: 4c469d1869e7d5b3a8212073cef22ce6a71d82d3 --- build/deps/github_hashes/facebook/folly-rev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index dce45d5ff84f..0331ed25ad87 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit 54b24d042e46b1a2749720d3c36680af71feefe4 +Subproject commit b7a60b2cea1726b9a688d7382edeadcea03976b1 From 772d54cc97a197dc9f07e8920dc6fb48c64b4cde Mon Sep 17 00:00:00 2001 From: Sergey Pershin Date: Mon, 18 Apr 2022 13:03:14 -0700 Subject: [PATCH 028/198] Rename Task::finished() to finishedLocked(). (#1433) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1433 This method is being called under a lock. Reviewed By: tanjialiang Differential Revision: D35699012 fbshipit-source-id: 825fd3c44919cc7099a569051727a6f0b52d09b9 --- velox/exec/Task.cpp | 6 +++--- velox/exec/Task.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/velox/exec/Task.cpp b/velox/exec/Task.cpp index debd0d305084..87ff77a6cf1a 100644 --- a/velox/exec/Task.cpp +++ b/velox/exec/Task.cpp @@ -1370,7 +1370,7 @@ StopReason Task::enterForTerminateLocked(ThreadState& state) { StopReason Task::leave(ThreadState& state) { std::lock_guard l(mutex_); if (--numThreads_ == 0) { - finished(); + finishedLocked(); } state.clearThread(); if (state.isTerminated) { @@ -1404,7 +1404,7 @@ StopReason Task::enterSuspended(ThreadState& state) { if (reason == StopReason::kNone || reason == StopReason::kPause) { state.isSuspended = true; if (--numThreads_ == 0) { - finished(); + finishedLocked(); } } return StopReason::kNone; @@ -1453,7 +1453,7 @@ StopReason Task::shouldStop() { return StopReason::kNone; } -void Task::finished() { +void Task::finishedLocked() { for (auto& promise : threadFinishPromises_) { promise.setValue(true); } diff --git a/velox/exec/Task.h b/velox/exec/Task.h index 521add4305fb..cf960c87dfc9 100644 --- a/velox/exec/Task.h +++ b/velox/exec/Task.h @@ -506,7 +506,7 @@ class Task : public std::enable_shared_from_this { SplitsStore& splitsStore, exec::Split&& split); - void finished(); + void finishedLocked(); StopReason shouldStopLocked(); From 8d7fe84d2ce43df952e08ac1015d5bc7c6b868ab Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Mon, 18 Apr 2022 14:36:52 -0700 Subject: [PATCH 029/198] Cast support to generic (#1417) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1417 - This diff adds the support of type access and cast to generics. - It introduces 4 functions that can be called on the GenericView: - type() - kind() - castTo: performs an unchecked cast and returns arg_type. A safety debug time type check will happen. - tryCastTo: return std::optional>, performs unchecked cast. return std::null opt if T does not match the type of the vector. - **Cost**: - The first time we do the cast we create the readers corresponding to that type. Then for the coming rows, the cost is a couple of instructions; checking reader is created, accessing the reader in the variant and returning the element at the row index. In some non-common cases there is additional check that the type casted to is consistent across rows. - TryCastTo is more expensive, since it does a type check as well. - In general its not expensive to use it with complex types, but avoid using it with primitives by either implementing a function specialized when input is primitive (see == function as example ). Or casting to complex types already specialized with primitive types, i.e. Array Array instead of Array and then casting Any to int for every element. - **What can be casted to?** This diff enabled cast to all basic types plus Array Map, Row, Row and Row up to 5.. This allow to recursively traverse complex types. - **How to add a new casted to type?** - Any type except Generic<> it self, (Cast to self type) or Variadic<...>. - The diff adds example function HasDuplicate, which checks if an array has duplicate items. Reviewed By: kevinwilfong Differential Revision: D35616634 fbshipit-source-id: 847af1dfc3af9b49ce32386b05bef585be955a81 --- velox/expression/ComplexViewTypes.h | 214 ++++++++- velox/expression/VectorUdfTypeSystem.h | 15 +- velox/expression/tests/GenericViewTest.cpp | 517 ++++++++++++++++++++- velox/type/Type.h | 6 + 4 files changed, 729 insertions(+), 23 deletions(-) diff --git a/velox/expression/ComplexViewTypes.h b/velox/expression/ComplexViewTypes.h index fb8b6b047549..483dccab2481 100644 --- a/velox/expression/ComplexViewTypes.h +++ b/velox/expression/ComplexViewTypes.h @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -921,27 +922,228 @@ inline auto get(const RowView& row) { return row.template at(); } +template +using reader_ptr_t = VectorReader*; + +template +struct HasGeneric { + static constexpr bool value() { + return false; + } +}; + +template +struct HasGeneric> { + static constexpr bool value() { + return true; + } +}; + +template +struct HasGeneric> { + static constexpr bool value() { + return HasGeneric::value() || HasGeneric::value(); + } +}; + +template +struct HasGeneric> { + static constexpr bool value() { + return HasGeneric::value(); + } +}; + +template +struct HasGeneric> { + static constexpr bool value() { + return (HasGeneric::value() || ...); + } +}; + +// This is basically Array, Map, Row. +template +struct AllGenericExceptTop { + static constexpr bool value() { + return false; + } +}; + +template +struct AllGenericExceptTop> { + static constexpr bool value() { + return isGenericType::value; + } +}; + +template +struct AllGenericExceptTop> { + static constexpr bool value() { + return isGenericType::value && isGenericType::value; + } +}; + +template +struct AllGenericExceptTop> { + static constexpr bool value() { + return (isGenericType::value && ...); + } +}; + class GenericView { public: - GenericView(const BaseVector* vector, vector_size_t index) - : vector_(vector), index_(index) {} + GenericView( + const DecodedVector& decoded, + std::array, 3>& castReaders, + TypePtr& castType, + vector_size_t index) + : decoded_(decoded), + castReaders_(castReaders), + castType_(castType), + index_(index) {} uint64_t hash() const { - return vector_->hashValueAt(index_); + return decoded_.base()->hashValueAt(index_); } bool operator==(const GenericView& other) const { - return vector_->equalValueAt(other.vector_, index_, other.index_); + return decoded_.base()->equalValueAt( + other.decoded_.base(), index_, other.index_); } std::optional compare( const GenericView& other, const CompareFlags flags) const { - return vector_->compare(other.vector_, index_, other.index_, flags); + return decoded_.base()->compare( + other.decoded_.base(), index_, other.index_, flags); + } + + TypeKind kind() const { + return decoded_.base()->typeKind(); + } + + const TypePtr type() const { + return decoded_.base()->type(); + } + + // If conversion is invalid, behavior is undefined. However, debug time + // checks will throw an exception. + template + typename VectorReader::exec_in_t castTo() const { + VELOX_DCHECK( + CastTypeChecker::check(type()), + fmt::format( + "castTo type is not compatible with type of vector, vector type is {}, casted to type is {}", + type()->toString(), + CppToType::create()->toString())); + + // TODO: We can distinguish if this is a null-free or not null-free + // generic. And based on that determine if we want to call operator[] or + // readNullFree. For now we always return nullable. + return ensureReader()->operator[](index_); + } + + template + std::optional::exec_in_t> tryCastTo() const { + if (!CastTypeChecker::check(type())) { + return std::nullopt; + } + + return ensureReader()->operator[](index_); } private: - const BaseVector* vector_; + // Utility class that checks that vectorType matches T. + template + struct CastTypeChecker { + static bool check(const TypePtr& vectorType) { + return CppToType::typeKind == vectorType->kind(); + } + }; + + template + struct CastTypeChecker> { + static bool check(const TypePtr&) { + return true; + } + }; + + template + struct CastTypeChecker> { + static bool check(const TypePtr& vectorType) { + return TypeKind::ARRAY == vectorType->kind() && + CastTypeChecker::check(vectorType->childAt(0)); + } + }; + + template + struct CastTypeChecker> { + static bool check(const TypePtr& vectorType) { + return TypeKind::MAP == vectorType->kind() && + CastTypeChecker::check(vectorType->childAt(0)) && + CastTypeChecker::check(vectorType->childAt(1)); + } + }; + + template + struct CastTypeChecker> { + static bool check(const TypePtr& vectorType) { + int index = 0; + return TypeKind::ROW == vectorType->kind() && + (CastTypeChecker::check(vectorType->childAt(index++)) && ... && + true); + } + }; + + template + VectorReader* ensureReader() const { + static_assert( + !isGenericType::value && !isVariadicType::value, + "That does not make any sense! You cant cast to Generic or Variadic"); + + // This is an optimization to avoid checking dynamically for every row that + // the user is always casting to the same type. + // Types are divided into three sets, for 1, and 2 we do not do the check, + // since no two types can ever refer to the same vector. + + if constexpr (!HasGeneric::value()) { + // Two types with no generic can never represent same vector. + return ensureReaderImpl(); + } else { + if constexpr (AllGenericExceptTop::value()) { + // This is basically Array, Map, Row. + return ensureReaderImpl(); + } else { + auto requestedType = CppToType::create(); + if (castType_) { + VELOX_USER_CHECK( + castType_->operator==(*requestedType), + fmt::format( + "Not allowed to cast to the two types {} and {} within the same batch." + "Consider creating a new type set to allow it.", + castType_->toString(), + requestedType->toString())); + } else { + castType_ = std::move(requestedType); + } + return ensureReaderImpl(); + } + } + } + + template + VectorReader* ensureReaderImpl() const { + auto* reader = static_cast*>(castReaders_[I].get()); + if (LIKELY(reader != nullptr)) { + return reader; + } else { + castReaders_[I] = std::make_shared>(&decoded_); + return static_cast*>(castReaders_[I].get()); + } + } + + const DecodedVector& decoded_; + std::array, 3>& castReaders_; + TypePtr& castType_; vector_size_t index_; }; diff --git a/velox/expression/VectorUdfTypeSystem.h b/velox/expression/VectorUdfTypeSystem.h index 20f0cd6a4888..8bace04aca2b 100644 --- a/velox/expression/VectorUdfTypeSystem.h +++ b/velox/expression/VectorUdfTypeSystem.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -450,9 +451,6 @@ struct VectorReader> { childReader_{detail::decode(arrayValuesDecoder_, *vector_.elements())} { } - explicit VectorReader(const VectorReader>&) = delete; - VectorReader>& operator=(const VectorReader>&) = delete; - bool isSet(size_t offset) const { return !decoded_.isNullAt(offset); } @@ -1458,8 +1456,7 @@ struct VectorReader> { using exec_in_t = GenericView; using exec_null_free_in_t = exec_in_t; - explicit VectorReader(const DecodedVector* decoded) - : decoded_(*decoded), base_(decoded->base()) {} + explicit VectorReader(const DecodedVector* decoded) : decoded_(*decoded) {} explicit VectorReader(const VectorReader>&) = delete; @@ -1471,7 +1468,7 @@ struct VectorReader> { exec_in_t operator[](size_t offset) const { auto index = decoded_.index(offset); - return GenericView{base_, index}; + return GenericView{decoded_, castReaders_, castType_, index}; } exec_null_free_in_t readNullFree(vector_size_t offset) const { @@ -1509,7 +1506,11 @@ struct VectorReader> { } const DecodedVector& decoded_; - const BaseVector* base_; + + // Those two variables are mutated by the GenericView during cast operations, + // and are shared across GenericViews constructed by the reader. + mutable std::array, 3> castReaders_; + mutable TypePtr castType_ = nullptr; }; } // namespace facebook::velox::exec diff --git a/velox/expression/tests/GenericViewTest.cpp b/velox/expression/tests/GenericViewTest.cpp index 4da33f7340f2..6feff009d0b3 100644 --- a/velox/expression/tests/GenericViewTest.cpp +++ b/velox/expression/tests/GenericViewTest.cpp @@ -106,9 +106,20 @@ class GenericViewTest : public functions::test::FunctionBaseTest { } } } + + template + void testHasGeneric(bool expected) { + ASSERT_EQ(exec::HasGeneric::value(), expected) + << CppToType::create()->toString(); + } + + template + void testAllGenericExceptTop(bool expected) { + ASSERT_EQ(exec::AllGenericExceptTop::value(), expected); + } }; -TEST_F(GenericViewTest, testInt) { +TEST_F(GenericViewTest, primitive) { std::vector> data1 = { 1, 2, std::nullopt, 1, std::nullopt, 5, 6}; std::vector> data2 = { @@ -120,7 +131,7 @@ TEST_F(GenericViewTest, testInt) { testHash(vector1); } -TEST_F(GenericViewTest, testCompare) { +TEST_F(GenericViewTest, compare) { std::vector> data = {1, 2, std::nullopt, 1}; auto vector = vectorMaker_.flatVectorNullable(data); @@ -139,7 +150,7 @@ TEST_F(GenericViewTest, testCompare) { } // Test reader where generic elements are arrays -TEST_F(GenericViewTest, testArrayOfInt) { +TEST_F(GenericViewTest, arrayOfInt) { auto vector1 = vectorMaker_.arrayVectorNullable(arrayData1); auto vector2 = vectorMaker_.arrayVectorNullable(arrayData2); testEqual(vector1, vector2, arrayData1, arrayData2); @@ -147,7 +158,7 @@ TEST_F(GenericViewTest, testArrayOfInt) { } // Test reader> where generic elements are ints. -TEST_F(GenericViewTest, testArrayOfGeneric) { +TEST_F(GenericViewTest, arrayOfGeneric) { auto vector1 = vectorMaker_.arrayVectorNullable(arrayData1); auto vector2 = vectorMaker_.arrayVectorNullable(arrayData2); @@ -181,9 +192,8 @@ struct CompareFunc { VELOX_DEFINE_FUNCTION_TYPES(T); template - FOLLY_ALWAYS_INLINE bool call(bool& out, const G& a, const G& b) { + void call(bool& out, const G& a, const G& b) { out = (a == b); - return true; } }; @@ -215,9 +225,8 @@ struct HashFunc { VELOX_DEFINE_FUNCTION_TYPES(T); template - FOLLY_ALWAYS_INLINE bool call(int64_t& out, const G& a, const G& b) { + void call(int64_t& out, const G& a, const G& b) { out = a.hash() + b.hash(); - return true; } }; @@ -274,12 +283,11 @@ struct HashAllArgs { VELOX_DEFINE_FUNCTION_TYPES(T); template - FOLLY_ALWAYS_INLINE bool call(int64_t& out, const G& args) { + void call(int64_t& out, const G& args) { out = 0; for (auto arg : args) { out += arg.value().hash(); } - return true; } }; @@ -335,5 +343,494 @@ TEST_F(GenericViewTest, e2eHashVariadicAnyType) { } } +TEST_F(GenericViewTest, castToInt) { + std::vector> data = {1, 2, std::nullopt, 1}; + + auto vector = vectorMaker_.flatVectorNullable(data); + DecodedVector decoded; + exec::VectorReader> reader(decode(decoded, *vector)); + ASSERT_EQ(reader[0].castTo(), 1); + ASSERT_EQ(reader[0].tryCastTo().value(), 1); + + ASSERT_EQ(reader[1].castTo(), 2); + ASSERT_EQ(reader[1].tryCastTo().value(), 2); +} + +TEST_F(GenericViewTest, castToArrayViewOfGeneric) { + VectorPtr vector = vectorMaker_.arrayVectorNullable(arrayData1); + + DecodedVector decoded; + exec::VectorReader> reader(decode(decoded, *vector)); + + auto generic = reader[4]; // {{0, 1, 2, 4}} + ASSERT_EQ(generic.kind(), TypeKind::ARRAY); + + // Test cast to. + { + auto arrayView = generic.castTo>>(); + auto i = 0; + for (auto genericItem : arrayView) { + if (genericItem.has_value()) { + ASSERT_EQ(genericItem.value().kind(), TypeKind::BIGINT); + auto val = genericItem.value().castTo(); + ASSERT_EQ(val, arrayData1[4].value()[i].value()); + i++; + } + } + } + + // Test try cast to. + { + auto arrayView = generic.tryCastTo>>().value(); + + auto i = 0; + for (auto genericItem : arrayView) { + if (genericItem.has_value()) { + ASSERT_EQ(genericItem.value().kind(), TypeKind::BIGINT); + + auto val = genericItem.value().tryCastTo().value(); + ASSERT_EQ(val, arrayData1[4].value()[i].value()); + i++; + } + } + } +} + +TEST_F(GenericViewTest, tryCastTo) { + DecodedVector decoded; + + { // Reader for vector of bigint. + auto vector = vectorMaker_.flatVectorNullable({1}); + exec::VectorReader> reader(decode(decoded, *vector)); + + ASSERT_FALSE(reader[0].tryCastTo().has_value()); + ASSERT_FALSE(reader[0].tryCastTo().has_value()); + ASSERT_FALSE(reader[0].tryCastTo().has_value()); + ASSERT_FALSE(reader[0].tryCastTo>>().has_value()); + ASSERT_FALSE( + (reader[0].tryCastTo, Generic<>>>().has_value())); + ASSERT_FALSE(reader[0].tryCastTo>>().has_value()); + + ASSERT_EQ(reader[0].tryCastTo().value(), 1); + } + + { // Reader for vector of array(bigint). + auto arrayVector = vectorMaker_.arrayVectorNullable(arrayData1); + exec::VectorReader> reader(decode(decoded, *arrayVector)); + + ASSERT_FALSE(reader[0].tryCastTo().has_value()); + ASSERT_FALSE(reader[0].tryCastTo().has_value()); + ASSERT_FALSE(reader[0].tryCastTo().has_value()); + ASSERT_FALSE( + (reader[0].tryCastTo, Generic<>>>().has_value())); + + ASSERT_TRUE(reader[0].tryCastTo>>().has_value()); + } +} + +TEST_F(GenericViewTest, validMultiCast) { + DecodedVector decoded; + + { // Reader for vector of array(bigint). + auto arrayVector = vectorMaker_.arrayVectorNullable(arrayData1); + exec::VectorReader> reader(decode(decoded, *arrayVector)); + + ASSERT_EQ( + reader[0].tryCastTo>>().value().size(), + arrayData1[0].value().size()); + ASSERT_EQ( + reader[0].tryCastTo>().value().size(), + arrayData1[0].value().size()); + } +} + +TEST_F(GenericViewTest, invalidMultiCast) { + DecodedVector decoded; + + { // Reader for vector of map(bigint, bigint). + auto arrayVector = makeMapVector({{{1, 2}, {3, 4}}}); + exec::VectorReader> reader(decode(decoded, *arrayVector)); + + // valid. + ASSERT_EQ((reader[0].castTo, Generic<>>>().size()), 2); + // valid. + ASSERT_EQ((reader[0].castTo>().size()), 2); + // valid. + ASSERT_EQ((reader[0].castTo, int64_t>>().size()), 2); + + // Will throw since Map, int64_t> and Map> not + // allowed togother. + EXPECT_THROW( + (reader[0].castTo>>().size()), VeloxUserError); + } +} + +TEST_F(GenericViewTest, castToMap) { + using map_type = std::vector>>; + + map_type map1 = {}; + map_type map2 = {{1, 4}, {3, 3}, {4, std::nullopt}}; + + std::vector mapsData = {map1, map2}; + + auto mapVector = makeMapVector(mapsData); + + DecodedVector decoded; + exec::VectorReader> reader(decode(decoded, *mapVector)); + + { + auto generic = reader[0]; + auto map = generic.tryCastTo, Generic<>>>(); + ASSERT_TRUE(map.has_value()); + auto mapView = map.value(); + ASSERT_EQ(mapView.size(), 0); + } + + { + auto generic = reader[1]; + auto mapView = generic.castTo, Generic<>>>(); + ASSERT_EQ(mapView.size(), 3); + ASSERT_EQ(mapView.begin()->first.castTo(), 1); + ASSERT_EQ(mapView.begin()->second.value().castTo(), 4); + } +} + +// A function that convert a variaidic number of inputs that can have +// any type to string written using castTo. +template +struct ToStringFuncCastTo { + VELOX_DEFINE_FUNCTION_TYPES(T); + + // Used to print nullable items of maps, rows and arrays. + template + void printItem(out_type& out, const TItem& item) { + if (item.has_value()) { + print(out, *item); + } else { + out += "null"; + } + } + + template + void printMap(out_type& out, const TMapView& mapView) { + out += "map("; + for (auto [key, value] : mapView) { + out += "<"; + print(out, key); + out += ","; + printItem(out, value); + out += ">,"; + } + out += ")"; + } + + template + void printArray(out_type& out, const TArrayView& arrayView) { + out += "array("; + for (const auto& item : arrayView) { + printItem(out, item); + out += ", "; + } + out += ")"; + } + + template + void printRow(out_type& out, const TRowView& rowView) { + out += "row("; + printItem(out, exec::get<0>(rowView)); + out += ", "; + printItem(out, exec::get<1>(rowView)); + out += ")"; + } + + void print(out_type& out, const arg_type>& arg) { + // Note: VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH can be used to simplify + // iterating over all primitive types. + switch (arg.kind()) { + case TypeKind::BIGINT: + out += std::to_string(arg.template castTo()); + break; + case TypeKind::DOUBLE: + out += std::to_string(arg.template castTo()); + break; + case TypeKind::BOOLEAN: + out += std::to_string(arg.template castTo()); + break; + case TypeKind::VARCHAR: + out += arg.template castTo(); + break; + case TypeKind::ARRAY: { + if (*arg.type() == *ARRAY(BIGINT())) { + // Special handling for array usually this is faster than + // going through Array> and casting every element. + auto arrayView = arg.template castTo>(); + out += "array("; + for (auto item : arrayView) { + if (item.has_value()) { + out += std::to_string(item.value()); + } else { + out += "null"; + } + out += ", "; + } + out += ")"; + } else { + auto arrayView = arg.template castTo>>(); + printArray(out, arrayView); + } + break; + } + case TypeKind::MAP: { + auto mapView = arg.template castTo, Generic<>>>(); + printMap(out, mapView); + break; + } + case TypeKind::ROW: { + auto rowSize = arg.type()->asRow().size(); + VELOX_CHECK(rowSize == 2, "print only supports rows of width 2"); + auto rowView = arg.template castTo, Generic<>>>(); + printRow(out, rowView); + break; + } + default: + VELOX_UNREACHABLE("not supported"); + } + } + + void call(out_type& out, const arg_type>>& args) { + auto i = 0; + for (const auto& arg : args) { + out += "arg " + std::to_string(i++) + " : "; + printItem(out, arg); + out += "\n"; + } + } +}; + +TEST_F(GenericViewTest, castE2E) { + registerFunction>>( + {"to_string_cast"}); + + auto test = [&](const std::string& args, const std::string& expected) { + auto result = evaluate>( + fmt::format("to_string_cast({})", args), + makeRowVector({makeFlatVector(1)})); + ASSERT_EQ(result->valueAt(0).str(), expected); + }; + test("row_constructor(1,2)", "arg 0 : row(1, 2)\n"); + + test( + "row_constructor(array_constructor(1,2,3),true)", + "arg 0 : row(array(1, 2, 3, ), 1)\n"); + + test( + "'hi', array_constructor(array_constructor(1.2, 1.4)), 1", + "arg 0 : hi\narg 1 : array(array(1.200000, 1.400000, ), )\narg 2 : 1\n"); + + test( + "1.3, map(array_constructor(1), array_constructor(null))", + "arg 0 : 1.300000\narg 1 : map(<1,null>,)\n"); +} + +// A function that convert a variaidic number of inputs that can have +// any type to string written using tryCastTo. +template +struct ToStringFuncTryCastTo { + VELOX_DEFINE_FUNCTION_TYPES(T); + + // Uses to print nullable items of maps, rows and arrays. + template + void printItem(out_type& out, const TItem& item) { + if (item.has_value()) { + print(out, *item); + } else { + out += "null"; + } + } + + template + void printMap(out_type& out, const TMapView& mapView) { + out += "map("; + for (auto [key, value] : mapView) { + out += "<"; + print(out, key); + out += ","; + printItem(out, value); + out += ">,"; + } + out += ")"; + } + + template + void printArray(out_type& out, const TArrayView& arrayView) { + out += "array("; + for (const auto& item : arrayView) { + printItem(out, item); + out += ", "; + } + out += ")"; + } + + template + void printRow(out_type& out, const TRowView& rowView) { + out += "row("; + printItem(out, exec::get<0>(rowView)); + out += ", "; + printItem(out, exec::get<1>(rowView)); + out += ")"; + } + + void print(out_type& out, const arg_type>& arg) { + if (auto bigIntValue = arg.template tryCastTo()) { + out += std::to_string(*bigIntValue); + } else if (auto doubleValue = arg.template tryCastTo()) { + out += std::to_string(*doubleValue); + } else if (auto boolValue = arg.template tryCastTo()) { + out += std::to_string(*boolValue); + } else if (auto arrayView = arg.template tryCastTo>()) { + // Special handling for array usually this is faster than going + // through Array> and casting every element. + out += "array("; + for (auto item : *arrayView) { + if (item.has_value()) { + out += std::to_string(item.value()); + } else { + out += "null"; + } + out += ", "; + } + out += ")"; + } else if (auto arrayView = arg.template tryCastTo>>()) { + printArray(out, *arrayView); + } else if ( + auto mapView = arg.template tryCastTo, Generic<>>>()) { + printMap(out, *mapView); + } else if (auto stringView = arg.template tryCastTo()) { + out += *stringView; + } else if ( + auto rowView = arg.template tryCastTo, Generic<>>>()) { + printRow(out, *rowView); + } else { + VELOX_UNREACHABLE("type not supported in this function"); + } + } + + void call(out_type& out, const arg_type>>& args) { + auto i = 0; + for (const auto& arg : args) { + out += "arg " + std::to_string(i++) + " : "; + printItem(out, arg); + out += "\n"; + } + } +}; + +TEST_F(GenericViewTest, tryCastE2E) { + registerFunction>>( + {"to_string_try_cast"}); + + auto test = [&](const std::string& args, const std::string& expected) { + auto result = evaluate>( + fmt::format("to_string_try_cast({})", args), + makeRowVector({makeFlatVector(1)})); + ASSERT_EQ(result->valueAt(0).str(), expected); + }; + test("row_constructor(1,2)", "arg 0 : row(1, 2)\n"); + + test( + "row_constructor(array_constructor(1,2,3),true)", + "arg 0 : row(array(1, 2, 3, ), 1)\n"); + + test( + "'hi', array_constructor(array_constructor(1.2, 1.4)), 1", + "arg 0 : hi\narg 1 : array(array(1.200000, 1.400000, ), )\narg 2 : 1\n"); + + test( + "1.3, map(array_constructor(1), array_constructor(null))", + "arg 0 : 1.300000\narg 1 : map(<1,null>,)\n"); +} + +template +struct ArrayHasDuplicateFunc { + VELOX_DEFINE_FUNCTION_TYPES(T); + bool call(bool& out, const arg_type>>& input) { + std::unordered_set>> set; + for (auto item : input) { + if (!item.has_value()) { + // Return null if null is encountered. + return false; + } + + if (set.count(*item)) { + // Item already exisits. + out = true; + return true; + } + set.insert(*item); + } + out = false; + return true; + } +}; + +TEST_F(GenericViewTest, hasDuplicate) { + registerFunction>>( + {"has_duplicate_func"}); + + auto test = [&](const std::string& arg, bool expected) { + auto result = evaluate>( + fmt::format("has_duplicate_func(array_constructor({}))", arg), + makeRowVector({makeFlatVector(1)})); + ASSERT_EQ(result->valueAt(0), expected); + }; + + test("1,2,3,4,5", false); + test("1,2,3,4,4", true); + + test("'what','no'", false); + test("'what','what'", true); + + // Nested array. + test( + "array_constructor(1,2,3),array_constructor(1,2), array_constructor(1)", + false); + test( + "array_constructor(1,2,3),array_constructor(1), array_constructor(1)", + true); +} + +TEST_F(GenericViewTest, hasGenericTest) { + testHasGeneric(false); + testHasGeneric>(false); + testHasGeneric>(false); + testHasGeneric>(false); + testHasGeneric>(false); + + testHasGeneric>(true); + testHasGeneric>>(true); + testHasGeneric>>(true); + testHasGeneric, int64_t>>(true); + testHasGeneric, int64_t>>(true); + testHasGeneric>>(true); +} + +TEST_F(GenericViewTest, allGenericExceptTop) { + testAllGenericExceptTop(false); + testAllGenericExceptTop>(false); + testAllGenericExceptTop>(false); + testAllGenericExceptTop>(false); + testAllGenericExceptTop>(false); + testAllGenericExceptTop>>(false); + testAllGenericExceptTop, int64_t>>(false); + testAllGenericExceptTop, int64_t>>(false); + testAllGenericExceptTop>>(false); + + testAllGenericExceptTop>>(true); + testAllGenericExceptTop, Generic<>>>(true); + testAllGenericExceptTop, Generic<>>>(true); + testAllGenericExceptTop>(true); + testAllGenericExceptTop>>(true); +} + } // namespace } // namespace facebook::velox diff --git a/velox/type/Type.h b/velox/type/Type.h index 8c105f4a63cf..6fa79270f3f4 100644 --- a/velox/type/Type.h +++ b/velox/type/Type.h @@ -1307,6 +1307,12 @@ struct isVariadicType : public std::false_type {}; template struct isVariadicType> : public std::true_type {}; +template +struct isGenericType : public std::false_type {}; + +template +struct isGenericType> : public std::true_type {}; + template struct Map { using key_type = KEY; From 2afd039cc6136b24fd86f61391bbac7ade04f818 Mon Sep 17 00:00:00 2001 From: svcscm svcscm Date: Mon, 18 Apr 2022 16:19:13 -0700 Subject: [PATCH 030/198] Updating submodules Summary: GitHub commits: https://github.com/facebook/cachelib/commit/18af563f93fae42a1cc73025a4ec0927faf4c59b https://github.com/facebook/fb303/commit/c283e78953dd8f44fbd283d8ed10d8e2a8d1d5c3 https://github.com/facebook/fbzmq/commit/b1fa008d79d42838f2b71a3c9395965b9a90d7bc https://github.com/facebook/folly/commit/9d2b3ae18f8492e237f63a04ff6a66d1b79e1e91 https://github.com/facebook/watchman/commit/1652dd6d8796d47c3367de1b69e9f01d98f97488 https://github.com/facebookexperimental/rust-shed/commit/b01d6b7b8881a07c1338ba149af8e3db96316af2 https://github.com/facebookresearch/recipes/commit/3912d6eeac66cadadeba0bb023a95f795fbfe2a2 Reviewed By: yns88 fbshipit-source-id: e1a81cef9377a4969d584f5e6740abf070c2ea68 --- build/deps/github_hashes/facebook/folly-rev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index 0331ed25ad87..18d97e8995b8 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit b7a60b2cea1726b9a688d7382edeadcea03976b1 +Subproject commit 9d2b3ae18f8492e237f63a04ff6a66d1b79e1e91 From a16c1c3ede658341f127b9f582a50d21429e3695 Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Mon, 18 Apr 2022 16:22:50 -0700 Subject: [PATCH 031/198] Use `Any` as alias to `Generic<>` (#1416) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1416 title. Reviewed By: kevinwilfong Differential Revision: D35623348 fbshipit-source-id: d297b4a4733ad7d2c711fc6015790478bc316644 --- velox/core/SimpleFunctionMetadata.h | 12 +- velox/core/tests/TestTypeAnalysis.cpp | 82 +++++------ .../tests/SimpleFunctionResolutionTest.cpp | 17 ++- velox/expression/tests/GenericViewTest.cpp | 127 +++++++++--------- velox/expression/tests/VectorReaderTest.cpp | 2 +- velox/functions/prestosql/Cardinality.h | 4 +- .../GeneralFunctionsRegistration.cpp | 5 +- .../functions/tests/FunctionRegistryTest.cpp | 12 +- velox/type/Type.h | 2 + 9 files changed, 128 insertions(+), 135 deletions(-) diff --git a/velox/core/SimpleFunctionMetadata.h b/velox/core/SimpleFunctionMetadata.h index 9955578caa86..b9d9fa6330ad 100644 --- a/velox/core/SimpleFunctionMetadata.h +++ b/velox/core/SimpleFunctionMetadata.h @@ -120,18 +120,18 @@ struct TypeAnalysisResults { // rank 2: has variadic but generic free. // e.g: Variadic -> int. // rank 3: has generic but no variadic of generic. - // e.g: Generic<>, Generic<>, -> int. + // e.g: Any, Any, -> int. // rank 4: has variadic of generic. - // e.g: Variadic> -> int. + // e.g: Variadic -> int. // If two functions have the same rank, then concreteCount is used to // to resolve the ordering. // e.g: consider the two functions: - // 1. int, Generic<>, Variadic -> has rank 3. concreteCount =2 - // 2. int, Generic<>, Generic<> -> has rank 3. concreteCount =1 + // 1. int, Any, Variadic -> has rank 3. concreteCount =2 + // 2. int, Any, Any -> has rank 3. concreteCount =1 // in this case (1) is picked. - // e.g: (Generic<>, int) will be picked before (Generic<>, Generic<>) - // e.g: Variadic>> is picked before Variadic>. + // e.g: (Any, int) will be picked before (Any, Any) + // e.g: Variadic> is picked before Variadic. uint32_t getRank() { if (!hasGeneric && !hasVariadic) { return 1; diff --git a/velox/core/tests/TestTypeAnalysis.cpp b/velox/core/tests/TestTypeAnalysis.cpp index 35679fe929fd..456385146a71 100644 --- a/velox/core/tests/TestTypeAnalysis.cpp +++ b/velox/core/tests/TestTypeAnalysis.cpp @@ -97,23 +97,23 @@ TEST_F(TypeAnalysisTest, hasGeneric) { testHasGeneric>(false); testHasGeneric, Array>>(false); - testHasGeneric>, Array>>(true); + testHasGeneric, Array>>(true); testHasGeneric>, Array>>(true); - testHasGeneric, Generic<>>>(true); - testHasGeneric>>(true); - testHasGeneric>(true); - testHasGeneric>(true); - testHasGeneric, int32_t>(true); + testHasGeneric, Any>>(true); + testHasGeneric>(true); + testHasGeneric(true); + testHasGeneric(true); + testHasGeneric(true); } TEST_F(TypeAnalysisTest, hasVariadic) { testHasVariadic(false); testHasVariadic, Array>>(false); - testHasVariadic, Generic<>>>(false); + testHasVariadic, Any>>(false); testHasVariadic>(false); testHasVariadic>(true); - testHasVariadic>>(true); + testHasVariadic>(true); testHasVariadic, Array>(true); testHasVariadic>>(true); } @@ -121,17 +121,17 @@ TEST_F(TypeAnalysisTest, hasVariadic) { TEST_F(TypeAnalysisTest, hasVariadicOfGeneric) { testHasVariadicOfGeneric(false); testHasVariadicOfGeneric, Array>>(false); - testHasVariadicOfGeneric, Generic<>>>(false); + testHasVariadicOfGeneric, Any>>(false); testHasVariadicOfGeneric>(false); testHasVariadicOfGeneric>(false); testHasVariadicOfGeneric, Array>(false); testHasVariadicOfGeneric>>(false); - testHasVariadicOfGeneric, Generic<>>(false); - testHasVariadicOfGeneric, Variadic>(false); + testHasVariadicOfGeneric, Any>(false); + testHasVariadicOfGeneric>(false); - testHasVariadicOfGeneric>>(true); - testHasVariadicOfGeneric>, int32_t>(true); - testHasVariadicOfGeneric>>>(true); + testHasVariadicOfGeneric>(true); + testHasVariadicOfGeneric, int32_t>(true); + testHasVariadicOfGeneric>>(true); testHasVariadicOfGeneric>>>>( true); } @@ -141,23 +141,23 @@ TEST_F(TypeAnalysisTest, countConcrete) { testCountConcrete(1); testCountConcrete(2); testCountConcrete(3); - testCountConcrete>(0); + testCountConcrete(0); testCountConcrete>(0); - testCountConcrete>>(0); + testCountConcrete>(0); testCountConcrete>(1); - testCountConcrete>>>(1); + testCountConcrete>>(1); testCountConcrete, Array>>(5); - testCountConcrete, Generic<>>>(3); + testCountConcrete, Any>>(3); testCountConcrete>(3); testCountConcrete, Array>(3); testCountConcrete>>(3); - testCountConcrete, Generic<>>(1); - testCountConcrete, Variadic>(1); + testCountConcrete, Any>(1); + testCountConcrete>(1); - testCountConcrete>>(0); - testCountConcrete>, int32_t>(1); - testCountConcrete>>>(2); + testCountConcrete>(0); + testCountConcrete, int32_t>(1); + testCountConcrete>>(2); } TEST_F(TypeAnalysisTest, testStringType) { @@ -167,10 +167,10 @@ TEST_F(TypeAnalysisTest, testStringType) { testStringType({"real"}); testStringType>({"array(integer)"}); - testStringType, int32_t>>({"map(any, integer)"}); + testStringType>({"map(any, integer)"}); testStringType>({"row(integer, integer)"}); - testStringType>({"any"}); + testStringType({"any"}); testStringType>({"__user_T1"}); testStringType>({"integer"}); @@ -183,16 +183,16 @@ TEST_F(TypeAnalysisTest, testStringType) { testStringType>({"array(integer)"}); testStringType>({"map(bigint, double)"}); - testStringType, double, Generic>>( + testStringType>>( {"row(any, double, __user_T1)"}); } TEST_F(TypeAnalysisTest, testVariables) { testVariables({}); testVariables>({}); - testVariables>({}); + testVariables({}); testVariables>({"__user_T1"}); - testVariables, int32_t>>({}); + testVariables>({}); testVariables>({}); testVariables, Map, Generic>>( {"__user_T2", "__user_T5"}); @@ -207,15 +207,15 @@ TEST_F(TypeAnalysisTest, testRank) { testRank, int, Variadic>(2); testRank>>(2); - testRank>(3); - testRank, Generic<>, Variadic>(3); + testRank(3); + testRank, Any, Variadic>(3); testRank, Generic>(3); - testRank>, Generic>(3); - testRank>, int32_t>(3); - testRank, Generic<>, Generic<>>(3); + testRank, Generic>(3); + testRank, int32_t>(3); + testRank, Any, Any>(3); - testRank>>(4); - testRank, Generic<>, Variadic>>>(4); + testRank>(4); + testRank, Any, Variadic>>(4); } TEST_F(TypeAnalysisTest, testPriority) { @@ -227,17 +227,17 @@ TEST_F(TypeAnalysisTest, testPriority) { test(getPriority(), getPriority>()); - test(getPriority>(), getPriority>>()); + test(getPriority>(), getPriority>()); - test(getPriority>(), getPriority, Generic<>>()); + test(getPriority>(), getPriority()); - test(getPriority, Generic<>>(), getPriority>>()); + test(getPriority(), getPriority>()); - test(getPriority>(), getPriority, Generic<>>()); + test(getPriority(), getPriority()); test( - getPriority, Variadic>>>(), - getPriority, Variadic>>()); + getPriority>>(), + getPriority>()); } } // namespace } // namespace facebook::velox::core diff --git a/velox/exec/tests/SimpleFunctionResolutionTest.cpp b/velox/exec/tests/SimpleFunctionResolutionTest.cpp index 98cb4c7fc0f7..bf5955d74dc6 100644 --- a/velox/exec/tests/SimpleFunctionResolutionTest.cpp +++ b/velox/exec/tests/SimpleFunctionResolutionTest.cpp @@ -55,13 +55,12 @@ struct TestFunction { return true; } - bool call(int32_t& out, const arg_type>>&) { + bool call(int32_t& out, const arg_type>&) { out = 4; return true; } - bool - call(int32_t& out, const int32_t&, const arg_type>>&) { + bool call(int32_t& out, const int32_t&, const arg_type>&) { out = 5; return true; } @@ -71,7 +70,7 @@ TEST_F(SimpleFunctionResolutionTest, rank1Picked) { registerFunction({"f1"}); registerFunction>({"f1"}); registerFunction, Generic>({"f1"}); - registerFunction>>({"f1"}); + registerFunction>({"f1"}); checkResults("f1", 1); } @@ -79,25 +78,25 @@ TEST_F(SimpleFunctionResolutionTest, rank1Picked) { TEST_F(SimpleFunctionResolutionTest, rank2Picked) { registerFunction>({"f2"}); registerFunction, Generic>({"f2"}); - registerFunction>>({"f2"}); + registerFunction>({"f2"}); checkResults("f2", 2); } TEST_F(SimpleFunctionResolutionTest, rank3Picked) { registerFunction, Generic>({"f3"}); - registerFunction>>({"f3"}); + registerFunction>({"f3"}); checkResults("f3", 3); } TEST_F(SimpleFunctionResolutionTest, rank4Picked) { - registerFunction>>({"f4"}); + registerFunction>({"f4"}); checkResults("f4", 4); } // Test when two functions have the same rank. TEST_F(SimpleFunctionResolutionTest, sameRank) { - registerFunction>>({"f5"}); - registerFunction>>({"f5"}); + registerFunction>({"f5"}); + registerFunction>({"f5"}); checkResults("f5", 5); } diff --git a/velox/expression/tests/GenericViewTest.cpp b/velox/expression/tests/GenericViewTest.cpp index 6feff009d0b3..317049fe0e8d 100644 --- a/velox/expression/tests/GenericViewTest.cpp +++ b/velox/expression/tests/GenericViewTest.cpp @@ -82,8 +82,8 @@ class GenericViewTest : public functions::test::FunctionBaseTest { const DataT& data2) { DecodedVector decoded1; DecodedVector decoded2; - exec::VectorReader> reader1(decode(decoded1, *vector1)); - exec::VectorReader> reader2(decode(decoded2, *vector2)); + exec::VectorReader reader1(decode(decoded1, *vector1)); + exec::VectorReader reader2(decode(decoded2, *vector2)); for (auto i = 0; i < vector1->size(); i++) { ASSERT_EQ(data1[i].has_value(), reader1.isSet(i)); @@ -97,7 +97,7 @@ class GenericViewTest : public functions::test::FunctionBaseTest { void testHash(const VectorPtr& vector) { DecodedVector decoded; - exec::VectorReader> reader(decode(decoded, *vector)); + exec::VectorReader reader(decode(decoded, *vector)); for (auto i = 0; i < vector->size(); i++) { if (reader.isSet(i)) { ASSERT_EQ( @@ -136,7 +136,7 @@ TEST_F(GenericViewTest, compare) { auto vector = vectorMaker_.flatVectorNullable(data); DecodedVector decoded; - exec::VectorReader> reader(decode(decoded, *vector)); + exec::VectorReader reader(decode(decoded, *vector)); CompareFlags flags; ASSERT_EQ(reader[0].compare(reader[0], flags).value(), 0); ASSERT_EQ(reader[0].compare(reader[3], flags).value(), 0); @@ -164,8 +164,8 @@ TEST_F(GenericViewTest, arrayOfGeneric) { DecodedVector decoded1; DecodedVector decoded2; - exec::VectorReader>> reader1(decode(decoded1, *vector1)); - exec::VectorReader>> reader2(decode(decoded2, *vector2)); + exec::VectorReader> reader1(decode(decoded1, *vector1)); + exec::VectorReader> reader2(decode(decoded2, *vector2)); // Reader will return std::vector> like object. for (auto i = 0; i < vector1->size(); i++) { @@ -198,7 +198,7 @@ struct CompareFunc { }; TEST_F(GenericViewTest, e2eCompareInts) { - registerFunction, Generic<>>({"func1"}); + registerFunction({"func1"}); registerFunction, Generic>({"func2"}); registerFunction, Generic>({"func3"}); @@ -256,8 +256,7 @@ TEST_F(GenericViewTest, e2eHashAddSameType) { TEST_F(GenericViewTest, e2eHashDifferentTypes) { registerFunction, Generic>( {"add_hash_diff_type"}); - registerFunction, Generic<>>( - {"add_hash_diff_type2"}); + registerFunction({"add_hash_diff_type2"}); auto vectorInt = vectorMaker_.flatVector({1, 2, 3}); auto vectorDouble = vectorMaker_.flatVector({4, 5, 6}); @@ -317,7 +316,7 @@ TEST_F(GenericViewTest, e2eHashVariadicSameType) { } TEST_F(GenericViewTest, e2eHashVariadicAnyType) { - registerFunction>>({"func1"}); + registerFunction>({"func1"}); registerFunction>>({"func2"}); auto vectorInt1 = vectorMaker_.flatVector({1, 2, 3}); @@ -348,7 +347,7 @@ TEST_F(GenericViewTest, castToInt) { auto vector = vectorMaker_.flatVectorNullable(data); DecodedVector decoded; - exec::VectorReader> reader(decode(decoded, *vector)); + exec::VectorReader reader(decode(decoded, *vector)); ASSERT_EQ(reader[0].castTo(), 1); ASSERT_EQ(reader[0].tryCastTo().value(), 1); @@ -360,14 +359,14 @@ TEST_F(GenericViewTest, castToArrayViewOfGeneric) { VectorPtr vector = vectorMaker_.arrayVectorNullable(arrayData1); DecodedVector decoded; - exec::VectorReader> reader(decode(decoded, *vector)); + exec::VectorReader reader(decode(decoded, *vector)); auto generic = reader[4]; // {{0, 1, 2, 4}} ASSERT_EQ(generic.kind(), TypeKind::ARRAY); // Test cast to. { - auto arrayView = generic.castTo>>(); + auto arrayView = generic.castTo>(); auto i = 0; for (auto genericItem : arrayView) { if (genericItem.has_value()) { @@ -381,7 +380,7 @@ TEST_F(GenericViewTest, castToArrayViewOfGeneric) { // Test try cast to. { - auto arrayView = generic.tryCastTo>>().value(); + auto arrayView = generic.tryCastTo>().value(); auto i = 0; for (auto genericItem : arrayView) { @@ -401,30 +400,28 @@ TEST_F(GenericViewTest, tryCastTo) { { // Reader for vector of bigint. auto vector = vectorMaker_.flatVectorNullable({1}); - exec::VectorReader> reader(decode(decoded, *vector)); + exec::VectorReader reader(decode(decoded, *vector)); ASSERT_FALSE(reader[0].tryCastTo().has_value()); ASSERT_FALSE(reader[0].tryCastTo().has_value()); ASSERT_FALSE(reader[0].tryCastTo().has_value()); - ASSERT_FALSE(reader[0].tryCastTo>>().has_value()); - ASSERT_FALSE( - (reader[0].tryCastTo, Generic<>>>().has_value())); - ASSERT_FALSE(reader[0].tryCastTo>>().has_value()); + ASSERT_FALSE(reader[0].tryCastTo>().has_value()); + ASSERT_FALSE((reader[0].tryCastTo>().has_value())); + ASSERT_FALSE(reader[0].tryCastTo>().has_value()); ASSERT_EQ(reader[0].tryCastTo().value(), 1); } { // Reader for vector of array(bigint). auto arrayVector = vectorMaker_.arrayVectorNullable(arrayData1); - exec::VectorReader> reader(decode(decoded, *arrayVector)); + exec::VectorReader reader(decode(decoded, *arrayVector)); ASSERT_FALSE(reader[0].tryCastTo().has_value()); ASSERT_FALSE(reader[0].tryCastTo().has_value()); ASSERT_FALSE(reader[0].tryCastTo().has_value()); - ASSERT_FALSE( - (reader[0].tryCastTo, Generic<>>>().has_value())); + ASSERT_FALSE((reader[0].tryCastTo>().has_value())); - ASSERT_TRUE(reader[0].tryCastTo>>().has_value()); + ASSERT_TRUE(reader[0].tryCastTo>().has_value()); } } @@ -433,10 +430,10 @@ TEST_F(GenericViewTest, validMultiCast) { { // Reader for vector of array(bigint). auto arrayVector = vectorMaker_.arrayVectorNullable(arrayData1); - exec::VectorReader> reader(decode(decoded, *arrayVector)); + exec::VectorReader reader(decode(decoded, *arrayVector)); ASSERT_EQ( - reader[0].tryCastTo>>().value().size(), + reader[0].tryCastTo>().value().size(), arrayData1[0].value().size()); ASSERT_EQ( reader[0].tryCastTo>().value().size(), @@ -449,19 +446,19 @@ TEST_F(GenericViewTest, invalidMultiCast) { { // Reader for vector of map(bigint, bigint). auto arrayVector = makeMapVector({{{1, 2}, {3, 4}}}); - exec::VectorReader> reader(decode(decoded, *arrayVector)); + exec::VectorReader reader(decode(decoded, *arrayVector)); // valid. - ASSERT_EQ((reader[0].castTo, Generic<>>>().size()), 2); + ASSERT_EQ((reader[0].castTo>().size()), 2); // valid. ASSERT_EQ((reader[0].castTo>().size()), 2); // valid. - ASSERT_EQ((reader[0].castTo, int64_t>>().size()), 2); + ASSERT_EQ((reader[0].castTo>().size()), 2); - // Will throw since Map, int64_t> and Map> not + // Will throw since Map and Map not // allowed togother. EXPECT_THROW( - (reader[0].castTo>>().size()), VeloxUserError); + (reader[0].castTo>().size()), VeloxUserError); } } @@ -476,11 +473,11 @@ TEST_F(GenericViewTest, castToMap) { auto mapVector = makeMapVector(mapsData); DecodedVector decoded; - exec::VectorReader> reader(decode(decoded, *mapVector)); + exec::VectorReader reader(decode(decoded, *mapVector)); { auto generic = reader[0]; - auto map = generic.tryCastTo, Generic<>>>(); + auto map = generic.tryCastTo>(); ASSERT_TRUE(map.has_value()); auto mapView = map.value(); ASSERT_EQ(mapView.size(), 0); @@ -488,7 +485,7 @@ TEST_F(GenericViewTest, castToMap) { { auto generic = reader[1]; - auto mapView = generic.castTo, Generic<>>>(); + auto mapView = generic.castTo>(); ASSERT_EQ(mapView.size(), 3); ASSERT_EQ(mapView.begin()->first.castTo(), 1); ASSERT_EQ(mapView.begin()->second.value().castTo(), 4); @@ -543,7 +540,7 @@ struct ToStringFuncCastTo { out += ")"; } - void print(out_type& out, const arg_type>& arg) { + void print(out_type& out, const arg_type& arg) { // Note: VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH can be used to simplify // iterating over all primitive types. switch (arg.kind()) { @@ -575,20 +572,20 @@ struct ToStringFuncCastTo { } out += ")"; } else { - auto arrayView = arg.template castTo>>(); + auto arrayView = arg.template castTo>(); printArray(out, arrayView); } break; } case TypeKind::MAP: { - auto mapView = arg.template castTo, Generic<>>>(); + auto mapView = arg.template castTo>(); printMap(out, mapView); break; } case TypeKind::ROW: { auto rowSize = arg.type()->asRow().size(); VELOX_CHECK(rowSize == 2, "print only supports rows of width 2"); - auto rowView = arg.template castTo, Generic<>>>(); + auto rowView = arg.template castTo>(); printRow(out, rowView); break; } @@ -597,7 +594,7 @@ struct ToStringFuncCastTo { } } - void call(out_type& out, const arg_type>>& args) { + void call(out_type& out, const arg_type>& args) { auto i = 0; for (const auto& arg : args) { out += "arg " + std::to_string(i++) + " : "; @@ -608,7 +605,7 @@ struct ToStringFuncCastTo { }; TEST_F(GenericViewTest, castE2E) { - registerFunction>>( + registerFunction>( {"to_string_cast"}); auto test = [&](const std::string& args, const std::string& expected) { @@ -680,7 +677,7 @@ struct ToStringFuncTryCastTo { out += ")"; } - void print(out_type& out, const arg_type>& arg) { + void print(out_type& out, const arg_type& arg) { if (auto bigIntValue = arg.template tryCastTo()) { out += std::to_string(*bigIntValue); } else if (auto doubleValue = arg.template tryCastTo()) { @@ -700,22 +697,20 @@ struct ToStringFuncTryCastTo { out += ", "; } out += ")"; - } else if (auto arrayView = arg.template tryCastTo>>()) { + } else if (auto arrayView = arg.template tryCastTo>()) { printArray(out, *arrayView); - } else if ( - auto mapView = arg.template tryCastTo, Generic<>>>()) { + } else if (auto mapView = arg.template tryCastTo>()) { printMap(out, *mapView); } else if (auto stringView = arg.template tryCastTo()) { out += *stringView; - } else if ( - auto rowView = arg.template tryCastTo, Generic<>>>()) { + } else if (auto rowView = arg.template tryCastTo>()) { printRow(out, *rowView); } else { VELOX_UNREACHABLE("type not supported in this function"); } } - void call(out_type& out, const arg_type>>& args) { + void call(out_type& out, const arg_type>& args) { auto i = 0; for (const auto& arg : args) { out += "arg " + std::to_string(i++) + " : "; @@ -726,7 +721,7 @@ struct ToStringFuncTryCastTo { }; TEST_F(GenericViewTest, tryCastE2E) { - registerFunction>>( + registerFunction>( {"to_string_try_cast"}); auto test = [&](const std::string& args, const std::string& expected) { @@ -753,8 +748,8 @@ TEST_F(GenericViewTest, tryCastE2E) { template struct ArrayHasDuplicateFunc { VELOX_DEFINE_FUNCTION_TYPES(T); - bool call(bool& out, const arg_type>>& input) { - std::unordered_set>> set; + bool call(bool& out, const arg_type>& input) { + std::unordered_set> set; for (auto item : input) { if (!item.has_value()) { // Return null if null is encountered. @@ -774,7 +769,7 @@ struct ArrayHasDuplicateFunc { }; TEST_F(GenericViewTest, hasDuplicate) { - registerFunction>>( + registerFunction>( {"has_duplicate_func"}); auto test = [&](const std::string& arg, bool expected) { @@ -806,12 +801,12 @@ TEST_F(GenericViewTest, hasGenericTest) { testHasGeneric>(false); testHasGeneric>(false); - testHasGeneric>(true); - testHasGeneric>>(true); - testHasGeneric>>(true); - testHasGeneric, int64_t>>(true); - testHasGeneric, int64_t>>(true); - testHasGeneric>>(true); + testHasGeneric(true); + testHasGeneric>(true); + testHasGeneric>(true); + testHasGeneric>(true); + testHasGeneric>(true); + testHasGeneric>(true); } TEST_F(GenericViewTest, allGenericExceptTop) { @@ -819,17 +814,17 @@ TEST_F(GenericViewTest, allGenericExceptTop) { testAllGenericExceptTop>(false); testAllGenericExceptTop>(false); testAllGenericExceptTop>(false); - testAllGenericExceptTop>(false); - testAllGenericExceptTop>>(false); - testAllGenericExceptTop, int64_t>>(false); - testAllGenericExceptTop, int64_t>>(false); - testAllGenericExceptTop>>(false); - - testAllGenericExceptTop>>(true); - testAllGenericExceptTop, Generic<>>>(true); - testAllGenericExceptTop, Generic<>>>(true); + testAllGenericExceptTop(false); + testAllGenericExceptTop>(false); + testAllGenericExceptTop>(false); + testAllGenericExceptTop>(false); + testAllGenericExceptTop>(false); + + testAllGenericExceptTop>(true); + testAllGenericExceptTop>(true); + testAllGenericExceptTop>(true); testAllGenericExceptTop>(true); - testAllGenericExceptTop>>(true); + testAllGenericExceptTop>(true); } } // namespace diff --git a/velox/expression/tests/VectorReaderTest.cpp b/velox/expression/tests/VectorReaderTest.cpp index 84a7c6d91697..fe36d78a9941 100644 --- a/velox/expression/tests/VectorReaderTest.cpp +++ b/velox/expression/tests/VectorReaderTest.cpp @@ -563,7 +563,7 @@ TEST_F(VectorReaderTest, genericContainsNull) { [](vector_size_t row) { return row; }, [](vector_size_t row) { return row % 5 == 0; }); DecodedVector decoded; - exec::VectorReader> reader(decode(decoded, *vector.get())); + exec::VectorReader reader(decode(decoded, *vector.get())); ASSERT_THROW(reader.containsNull(1), VeloxUserError); // TODO (kevinwilfong): Add these back once generics are supported, and add diff --git a/velox/functions/prestosql/Cardinality.h b/velox/functions/prestosql/Cardinality.h index 51cde4ee23ee..d2dfc07851ae 100644 --- a/velox/functions/prestosql/Cardinality.h +++ b/velox/functions/prestosql/Cardinality.h @@ -23,11 +23,11 @@ template struct CardinalityFunction { VELOX_DEFINE_FUNCTION_TYPES(T); - void call(int64_t& out, const arg_type>>& input) { + void call(int64_t& out, const arg_type>& input) { out = input.size(); } - void call(int64_t& out, const arg_type, Generic<>>>& input) { + void call(int64_t& out, const arg_type>& input) { out = input.size(); } }; diff --git a/velox/functions/prestosql/registration/GeneralFunctionsRegistration.cpp b/velox/functions/prestosql/registration/GeneralFunctionsRegistration.cpp index b5dc53adc7f7..f81328e65619 100644 --- a/velox/functions/prestosql/registration/GeneralFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/GeneralFunctionsRegistration.cpp @@ -32,9 +32,8 @@ void registerGeneralFunctions() { VELOX_REGISTER_VECTOR_FUNCTION(udf_least, "least"); VELOX_REGISTER_VECTOR_FUNCTION(udf_greatest, "greatest"); - registerFunction>>( - {"cardinality"}); - registerFunction, Generic<>>>( + registerFunction>({"cardinality"}); + registerFunction>( {"cardinality"}); } diff --git a/velox/functions/tests/FunctionRegistryTest.cpp b/velox/functions/tests/FunctionRegistryTest.cpp index c549c2f66e80..b77cb3ca01a5 100644 --- a/velox/functions/tests/FunctionRegistryTest.cpp +++ b/velox/functions/tests/FunctionRegistryTest.cpp @@ -467,14 +467,12 @@ struct TestFunction { out = 3; } - void call(int64_t& out, const arg_type>>&) { + void call(int64_t& out, const arg_type>&) { out = 4; } - void call( - double& out, - const arg_type&, - const arg_type>>&) { + void + call(double& out, const arg_type&, const arg_type>&) { out = 5; } }; @@ -482,9 +480,9 @@ struct TestFunction { TEST_F(FunctionRegistryTest, resolveFunctionsBasedOnPriority) { std::string func = "func_with_priority"; - registerFunction>>({func}); + registerFunction>({func}); registerFunction({func}); - registerFunction>>({func}); + registerFunction>({func}); registerFunction>({func}); registerFunction, Generic>({func}); diff --git a/velox/type/Type.h b/velox/type/Type.h index 6fa79270f3f4..7627b4279a77 100644 --- a/velox/type/Type.h +++ b/velox/type/Type.h @@ -1301,6 +1301,8 @@ struct Generic { Generic() = delete; }; +using Any = Generic<>; + template struct isVariadicType : public std::false_type {}; From c62bd762f4fd4da003fb8c13cbf8cdb7cfdbd752 Mon Sep 17 00:00:00 2001 From: svcscm svcscm Date: Mon, 18 Apr 2022 22:27:22 -0700 Subject: [PATCH 032/198] Updating submodules Summary: GitHub commits: https://github.com/facebook/fbthrift/commit/bc3f7af95e213e63daa6dd0c47d72cb34a7a831f https://github.com/facebook/folly/commit/3ca50e348f8b2a0e6ea39630fb332e97dafce4d8 Reviewed By: yns88 fbshipit-source-id: 009f3003d7e54da4b8c481c3f57c51577e4aedbd --- build/deps/github_hashes/facebook/folly-rev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index 18d97e8995b8..780ee8944ae5 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit 9d2b3ae18f8492e237f63a04ff6a66d1b79e1e91 +Subproject commit 3ca50e348f8b2a0e6ea39630fb332e97dafce4d8 From e57883fe0545c20205a450a4f6ef000b90e6e0e9 Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Tue, 19 Apr 2022 12:10:06 -0700 Subject: [PATCH 033/198] Fix bug when reapplying peeled dictionary encoding when nulls have been removed (#1026) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1026 When we call setDictionaryWrap to peel dictionary encoding, and then setWrapped to reapply the dictionary encoding the size of the DictionaryVector can change. This is because we do not use the original vector size, but rather rows.end() for the size of the new unpeeled vector. This causes problems when nulls are removed, as when we call addNulls, the DictionaryVector may be smaller than the number of rows when there were nulls at the end of the batch. In this case, addNulls attempts to resize the DictionaryVector which is not supported. To fix this, I changed addNulls in Expr.cpp to call setSize on the vector to effectively resize it (all Vectors support setSize), and then call a new function ensureNullsCapacity which has logic I moved from mutableNulls to resize nulls_ in the Vector (again all Vectors support this). I debated nulling out flatRawNulls_ in DictionaryVector when ensureNullsCapacity is called, I decided not to since previously we could call mutableNulls on the DictionaryVector and produce the same results without modifying it. In Exprs.cpp we call addNulls immediately after which nulls out flatRawNulls_ anyway. Reviewed By: mbasmanova Differential Revision: D34176042 fbshipit-source-id: 896cba7fd07b2d184073ee63157ad303a348098b --- velox/expression/Expr.cpp | 10 ++++- velox/expression/tests/ExprTest.cpp | 69 +++++++++++++++++++++++++++++ velox/vector/BaseVector.cpp | 8 ++-- velox/vector/BaseVector.h | 20 ++++++--- velox/vector/BiasVector.h | 2 +- velox/vector/DictionaryVector.h | 2 +- velox/vector/FlatVector.h | 2 +- velox/vector/SimpleVector.h | 2 +- 8 files changed, 100 insertions(+), 15 deletions(-) diff --git a/velox/expression/Expr.cpp b/velox/expression/Expr.cpp index 668e1b88f27a..dca94bd5e317 100644 --- a/velox/expression/Expr.cpp +++ b/velox/expression/Expr.cpp @@ -673,13 +673,19 @@ void Expr::addNulls( return; } - if (!result->unique() || !(*result)->mayAddNulls()) { + if (!result->unique() || !(*result)->isNullsWritable()) { BaseVector::ensureWritable( SelectivityVector::empty(), type(), context->pool(), result); } + if ((*result)->size() < rows.end()) { - (*result)->resize(rows.end()); + // Not all Vectors support resize. Since we only want to append nulls, + // we can work around that by calling setSize to resize the vector and + // ensureNullsCapacity to resize the nulls_ bit vector. + (*result)->setSize(rows.end()); + (*result)->ensureNullsCapacity(rows.end(), true); } + (*result)->addNulls(rawNulls, rows); } diff --git a/velox/expression/tests/ExprTest.cpp b/velox/expression/tests/ExprTest.cpp index 142493d916b8..9052865c3fff 100644 --- a/velox/expression/tests/ExprTest.cpp +++ b/velox/expression/tests/ExprTest.cpp @@ -2570,3 +2570,72 @@ TEST_F(ExprTest, constantToString) { "4 elements starting at 0 {1.2000000476837158, 3.4000000953674316, null, 5.599999904632568}:ARRAY", exprSet.exprs()[2]->toString()); } + +namespace { +// A naive function that wraps the input in a dictionary vector resized to +// rows.end() - 1. It assumes all selected values are non-null. +class TestingShrinkingDictionary : public exec::VectorFunction { + public: + bool isDefaultNullBehavior() const override { + return true; + } + + void apply( + const SelectivityVector& rows, + std::vector& args, + const TypePtr& /* outputType */, + exec::EvalCtx* context, + VectorPtr* result) const override { + BufferPtr indices = + AlignedBuffer::allocate(rows.end(), context->pool()); + auto rawIndices = indices->asMutable(); + rows.applyToSelected([&](int row) { rawIndices[row] = row; }); + + *result = + BaseVector::wrapInDictionary(nullptr, indices, rows.end() - 1, args[0]); + } + + static std::vector> signatures() { + return {exec::FunctionSignatureBuilder() + .returnType("bigint") + .argumentType("bigint") + .build()}; + } +}; +} // namespace + +TEST_F(ExprTest, specialFormPropagateNulls) { + exec::registerVectorFunction( + "test_shrinking_dictionary", + TestingShrinkingDictionary::signatures(), + std::make_unique()); + + // This test verifies an edge case where applyFunctionWithPeeling may produce + // a result vector which is dictionary encoded and has fewer values than + // are rows. + // This can happen when the last value in a column used in an expression is + // null which causes removeSureNulls to move the end of the SelectivityVector + // forward. When we incorrectly use rows.end() as the size of the + // dictionary when rewrapping the results. + // Normally this is masked when this vector is used in a function call which + // produces a new output vector. However, in SpecialForm expressions, we may + // return the output untouched, and when we try to add back in the nulls, we + // get an exception trying to resize the DictionaryVector. + // This is difficult to reproduce, so this test artificially triggers the + // issue by using a UDF that returns a dictionary one smaller than rows.end(). + + // Making the last row NULL, so we call addNulls in eval. + auto c0 = makeFlatVector( + 5, + [](vector_size_t row) { return row; }, + [](vector_size_t row) { return row == 4; }); + + auto rowVector = makeRowVector({c0}); + auto evalResult = evaluate("test_shrinking_dictionary(\"c0\")", rowVector); + + auto expectedResult = makeFlatVector( + 5, + [](vector_size_t row) { return row; }, + [](vector_size_t row) { return row == 4; }); + assertEqualVectors(expectedResult, evalResult); +} diff --git a/velox/vector/BaseVector.cpp b/velox/vector/BaseVector.cpp index 8614af4b305d..861de44919b3 100644 --- a/velox/vector/BaseVector.cpp +++ b/velox/vector/BaseVector.cpp @@ -345,13 +345,13 @@ VectorPtr BaseVector::create( } void BaseVector::addNulls(const uint64_t* bits, const SelectivityVector& rows) { - VELOX_CHECK(mayAddNulls()); + VELOX_CHECK(isNullsWritable()); VELOX_CHECK(length_ >= rows.end()); ensureNulls(); auto target = nulls_->asMutable(); const uint64_t* selected = rows.asRange().bits(); if (!bits) { - // A A 1 in rows makes a 0 in nulls. + // A 1 in rows makes a 0 in nulls. bits::andWithNegatedBits(target, selected, rows.begin(), rows.end()); return; } @@ -368,7 +368,7 @@ void BaseVector::addNulls(const uint64_t* bits, const SelectivityVector& rows) { } void BaseVector::clearNulls(const SelectivityVector& rows) { - VELOX_CHECK(mayAddNulls()); + VELOX_CHECK(isNullsWritable()); if (!nulls_) { return; } @@ -390,7 +390,7 @@ void BaseVector::clearNulls(const SelectivityVector& rows) { } void BaseVector::clearNulls(vector_size_t begin, vector_size_t end) { - VELOX_CHECK(mayAddNulls()); + VELOX_CHECK(isNullsWritable()); if (!nulls_) { return; } diff --git a/velox/vector/BaseVector.h b/velox/vector/BaseVector.h index 07ab34b37164..d0a95261c6ea 100644 --- a/velox/vector/BaseVector.h +++ b/velox/vector/BaseVector.h @@ -187,16 +187,26 @@ class BaseVector { } virtual BufferPtr mutableNulls(vector_size_t size) { + ensureNullsCapacity(size); + return nulls_; + } + + /* + * Allocates or reallocates nulls_ with the given size if nulls_ hasn't + * been allocated yet or has been allocated with a smaller capacity. + */ + void ensureNullsCapacity(vector_size_t size, bool setNotNull = false) { if (nulls_ && nulls_->capacity() >= bits::nbytes(size)) { - return nulls_; + return; } if (nulls_) { - AlignedBuffer::reallocate(&nulls_, size, false); + AlignedBuffer::reallocate( + &nulls_, size, setNotNull ? bits::kNotNull : bits::kNull); } else { - nulls_ = AlignedBuffer::allocate(size, pool_, false); + nulls_ = AlignedBuffer::allocate( + size, pool_, setNotNull ? bits::kNotNull : bits::kNull); } rawNulls_ = nulls_->as(); - return nulls_; } std::optional getDistinctValueCount() const { @@ -342,7 +352,7 @@ class BaseVector { return countNulls(nulls, 0, size); } - virtual bool mayAddNulls() const { + virtual bool isNullsWritable() const { return true; } diff --git a/velox/vector/BiasVector.h b/velox/vector/BiasVector.h index b43efe130e04..aaa75b09ae10 100644 --- a/velox/vector/BiasVector.h +++ b/velox/vector/BiasVector.h @@ -159,7 +159,7 @@ class BiasVector : public SimpleVector { return true; } - bool mayAddNulls() const override { + bool isNullsWritable() const override { return true; } diff --git a/velox/vector/DictionaryVector.h b/velox/vector/DictionaryVector.h index 26603079805e..ad653802628b 100644 --- a/velox/vector/DictionaryVector.h +++ b/velox/vector/DictionaryVector.h @@ -186,7 +186,7 @@ class DictionaryVector : public SimpleVector { return dictionaryValues_->wrappedIndex(rawIndices_[index]); } - bool mayAddNulls() const override { + bool isNullsWritable() const override { return true; } diff --git a/velox/vector/FlatVector.h b/velox/vector/FlatVector.h index 1d06b04fb6e7..964a15abe543 100644 --- a/velox/vector/FlatVector.h +++ b/velox/vector/FlatVector.h @@ -290,7 +290,7 @@ class FlatVector final : public SimpleVector { return size; } - bool mayAddNulls() const override { + bool isNullsWritable() const override { return true; } diff --git a/velox/vector/SimpleVector.h b/velox/vector/SimpleVector.h index a0abeb926283..769e6d7a164a 100644 --- a/velox/vector/SimpleVector.h +++ b/velox/vector/SimpleVector.h @@ -164,7 +164,7 @@ class SimpleVector : public BaseVector { return elementSize_; } - bool mayAddNulls() const override { + bool isNullsWritable() const override { return false; } From 51136d3ebe380b64eee768d11a7d51eefdd3aba0 Mon Sep 17 00:00:00 2001 From: svcscm svcscm Date: Tue, 19 Apr 2022 16:12:52 -0700 Subject: [PATCH 034/198] Updating submodules Summary: GitHub commits: https://github.com/facebook/cachelib/commit/2a06af8135d24a52da2d73441971ef2eb167b49e https://github.com/facebook/fb303/commit/dc2f16f6a478b5862632186e3af5b6db48aa9237 https://github.com/facebook/fbthrift/commit/616edf56c06d95d20df6ff33205b280d7d0cd06d https://github.com/facebook/fbzmq/commit/277cb2480a005e76d430d76c602a400b5ca97776 https://github.com/facebook/folly/commit/f147eac7d9b556422cfb6119144e438df84dba41 https://github.com/facebook/proxygen/commit/a0609a354a40e0af576419ab477893e8559a8ea7 https://github.com/facebook/watchman/commit/177c459bce390e1ba65a31e7e71f0d91e5918c63 Reviewed By: yns88 fbshipit-source-id: 895a5269a1fc86bbda8e042780668b2a218460fa --- build/deps/github_hashes/facebook/folly-rev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index 780ee8944ae5..b4aabbf281f5 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit 3ca50e348f8b2a0e6ea39630fb332e97dafce4d8 +Subproject commit f147eac7d9b556422cfb6119144e438df84dba41 From 8517ff4d6f9acf88b737f638fac7169b0014ac5d Mon Sep 17 00:00:00 2001 From: Pedro Eugenio Rocha Pedreira Date: Tue, 19 Apr 2022 16:22:04 -0700 Subject: [PATCH 035/198] Add dbgen wrapper to generate Vectors for `Nation` (#1422) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1422 Adding a wrapper around dbgen to generate TPC-H synthetic data and return them as RowVector. For now only adding Nation tables, but as we agree on the API I'll go ahead an add the other tables. . Note that for now I'm wrapping around duckDB's dbgen, but only using vanilla dbgen. So in the next diffs we can add our own copy of dbgen (by vendoring it or a submodule), and remove the duckDB dependency. Reviewed By: mbasmanova Differential Revision: D35634474 fbshipit-source-id: 892ec0efdbb93711866ba0b3e7ec5ba3c373465b --- CMakeLists.txt | 1 + velox/CMakeLists.txt | 1 + velox/external/duckdb/tpch/dbgen/bm_utils.cpp | 2 +- velox/external/duckdb/tpch/dbgen/dbgen.cpp | 14 +- .../duckdb/tpch/dbgen/include/dbgen/dss.h | 2 +- velox/tpch/gen/CMakeLists.txt | 20 +++ velox/tpch/gen/TpchGen.cpp | 129 ++++++++++++++++++ velox/tpch/gen/TpchGen.h | 71 ++++++++++ velox/tpch/gen/tests/CMakeLists.txt | 19 +++ velox/tpch/gen/tests/TpchGenTest.cpp | 93 +++++++++++++ 10 files changed, 343 insertions(+), 9 deletions(-) create mode 100644 velox/tpch/gen/CMakeLists.txt create mode 100644 velox/tpch/gen/TpchGen.cpp create mode 100644 velox/tpch/gen/TpchGen.h create mode 100644 velox/tpch/gen/tests/CMakeLists.txt create mode 100644 velox/tpch/gen/tests/TpchGenTest.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 64ce751f2c03..b18ab46afcd5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -288,6 +288,7 @@ endif() include_directories(SYSTEM velox) include_directories(SYSTEM velox/external) include_directories(SYSTEM velox/external/duckdb) +include_directories(SYSTEM velox/external/duckdb/tpch/dbgen/include) include(CTest) # include after project() but before add_subdirectory() diff --git a/velox/CMakeLists.txt b/velox/CMakeLists.txt index 69fd54854089..5e7c87d0734b 100644 --- a/velox/CMakeLists.txt +++ b/velox/CMakeLists.txt @@ -59,6 +59,7 @@ endif() if(${VELOX_ENABLE_DUCKDB}) add_subdirectory(duckdb) add_subdirectory(external/duckdb) + add_subdirectory(tpch/gen) endif() if(${VELOX_CODEGEN_SUPPORT}) diff --git a/velox/external/duckdb/tpch/dbgen/bm_utils.cpp b/velox/external/duckdb/tpch/dbgen/bm_utils.cpp index 1e01891d87f3..429840da554b 100644 --- a/velox/external/duckdb/tpch/dbgen/bm_utils.cpp +++ b/velox/external/duckdb/tpch/dbgen/bm_utils.cpp @@ -446,7 +446,7 @@ set_state(int table, long sf, long procs, long step, DSS_HUGE *extra_rows) { /* need to set seeds of child in case there's a dependency */ /* NOTE: this assumes that the parent and child have the same base row * count */ - if (tdefs[table].child != NONE) + if (tdefs[table].child != DBGEN_NONE) tdefs[tdefs[table].child].gen_seed(0, rowcount); } if (step > procs) /* moving to the end to generate updates */ diff --git a/velox/external/duckdb/tpch/dbgen/dbgen.cpp b/velox/external/duckdb/tpch/dbgen/dbgen.cpp index 69f1a2078948..27d00f30d9e3 100644 --- a/velox/external/duckdb/tpch/dbgen/dbgen.cpp +++ b/velox/external/duckdb/tpch/dbgen/dbgen.cpp @@ -28,7 +28,7 @@ seed_t DBGenGlobals::Seed[MAX_STREAM + 1] = { {PART, 1841581359, 0, 1}, /* P_TYPE_SD 2 */ {PART, 1193163244, 0, 1}, /* P_SIZE_SD 3 */ {PART, 727633698, 0, 1}, /* P_CNTR_SD 4 */ - {NONE, 933588178, 0, 1}, /* text pregeneration 5 */ + {DBGEN_NONE, 933588178, 0, 1}, /* text pregeneration 5 */ {PART, 804159733, 0, 2}, /* P_CMNT_SD 6 */ {PSUPP, 1671059989, 0, SUPP_PER_PART}, /* PS_QTY_SD 7 */ {PSUPP, 1051288424, 0, SUPP_PER_PART}, /* PS_SCST_SD 8 */ @@ -75,15 +75,15 @@ seed_t DBGenGlobals::Seed[MAX_STREAM + 1] = { double DBGenGlobals::dM = 2147483647.0; tdef DBGenGlobals::tdefs[10] = { {"part.tbl", "part table", 200000, NULL, NULL, PSUPP, 0}, - {"partsupp.tbl", "partsupplier table", 200000, NULL, NULL, NONE, 0}, - {"supplier.tbl", "suppliers table", 10000, NULL, NULL, NONE, 0}, - {"customer.tbl", "customers table", 150000, NULL, NULL, NONE, 0}, + {"partsupp.tbl", "partsupplier table", 200000, NULL, NULL, DBGEN_NONE, 0}, + {"supplier.tbl", "suppliers table", 10000, NULL, NULL, DBGEN_NONE, 0}, + {"customer.tbl", "customers table", 150000, NULL, NULL, DBGEN_NONE, 0}, {"orders.tbl", "order table", 150000, NULL, NULL, LINE, 0}, - {"lineitem.tbl", "lineitem table", 150000, NULL, NULL, NONE, 0}, + {"lineitem.tbl", "lineitem table", 150000, NULL, NULL, DBGEN_NONE, 0}, {"orders.tbl", "orders/lineitem tables", 150000, NULL, NULL, LINE, 0}, {"part.tbl", "part/partsupplier tables", 200000, NULL, NULL, PSUPP, 0}, - {"nation.tbl", "nation table", NATIONS_MAX, NULL, NULL, NONE, 0}, - {"region.tbl", "region table", NATIONS_MAX, NULL, NULL, NONE, 0}, + {"nation.tbl", "nation table", NATIONS_MAX, NULL, NULL, DBGEN_NONE, 0}, + {"region.tbl", "region table", NATIONS_MAX, NULL, NULL, DBGEN_NONE, 0}, }; static seed_t *Seed = DBGenGlobals::Seed; diff --git a/velox/external/duckdb/tpch/dbgen/include/dbgen/dss.h b/velox/external/duckdb/tpch/dbgen/include/dbgen/dss.h index 5b62f671e539..69a50dddf3e6 100644 --- a/velox/external/duckdb/tpch/dbgen/include/dbgen/dss.h +++ b/velox/external/duckdb/tpch/dbgen/include/dbgen/dss.h @@ -40,7 +40,7 @@ #define printf(...) #define fprintf(...) -#define NONE -1 +#define DBGEN_NONE -1 #define PART 0 #define PSUPP 1 #define SUPP 2 diff --git a/velox/tpch/gen/CMakeLists.txt b/velox/tpch/gen/CMakeLists.txt new file mode 100644 index 000000000000..dd7b0068af8b --- /dev/null +++ b/velox/tpch/gen/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +add_library(velox_tpch_gen TpchGen.cpp) + +target_link_libraries(velox_tpch_gen velox_memory velox_vector dbgen duckdb) + +if(${VELOX_BUILD_TESTING}) + add_subdirectory(tests) +endif() diff --git a/velox/tpch/gen/TpchGen.cpp b/velox/tpch/gen/TpchGen.cpp new file mode 100644 index 000000000000..3bee2b53adf1 --- /dev/null +++ b/velox/tpch/gen/TpchGen.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/tpch/gen/TpchGen.h" +#include "velox/external/duckdb/tpch/dbgen/include/dbgen/dbgen_gunk.hpp" +#include "velox/external/duckdb/tpch/dbgen/include/dbgen/dss.h" +#include "velox/external/duckdb/tpch/dbgen/include/dbgen/dsstypes.h" +#include "velox/vector/FlatVector.h" + +namespace facebook::velox::tpch { + +namespace { + +// The cardinality of the LINEITEM table is not a strict multiple of SF since +// the number of lineitems in an order is chosen at random with an average of +// four. This function contains the row count for all authorized scale factors +// (as described by the TPC-H spec), and approximates the remaining. +constexpr size_t getLineItemRowCount(size_t scaleFactor) { + switch (scaleFactor) { + case 1: + return 6'001'215; + case 10: + return 59'986'052; + case 30: + return 179'998'372; + case 100: + return 600'037'902; + case 300: + return 1'799'989'091; + case 1'000: + return 5'999'989'709; + case 3'000: + return 18'000'048'306; + case 10'000: + return 59'999'994'267; + case 30'000: + return 179'999'978'268; + case 100'000: + return 599'999'969'200; + default: + break; + } + return 6'000'000 * scaleFactor; +} + +} // namespace + +constexpr size_t getRowCount(Table table, size_t scaleFactor) { + switch (table) { + case Table::TBL_PART: + return 200'000 * scaleFactor; + case Table::TBL_SUPPLIER: + return 10'000 * scaleFactor; + case Table::TBL_PARTSUP: + return 800'000 * scaleFactor; + case Table::TBL_CUSTOMER: + return 150'000 * scaleFactor; + case Table::TBL_ORDERS: + return 1'500'000 * scaleFactor; + case Table::TBL_NATION: + return 25; + case Table::TBL_REGION: + return 5; + case Table::TBL_LINEITEM: + return getLineItemRowCount(scaleFactor); + } + return 0; // make gcc happy. +} + +RowVectorPtr genTpchNation( + size_t maxRows, + size_t offset, + size_t scaleFactor, + memory::MemoryPool* pool) { + size_t rowCount = getRowCount(Table::TBL_NATION, scaleFactor); + size_t vectorSize = std::min(rowCount - offset, maxRows); + + // Create schema and allocate vectors. + static TypePtr nationRowType = + ROW({"n_nationkey", "n_name", "n_regionkey", "n_comment"}, + {BIGINT(), VARCHAR(), BIGINT(), VARCHAR()}); + std::vector children = { + BaseVector::create(BIGINT(), vectorSize, pool), + BaseVector::create(VARCHAR(), vectorSize, pool), + BaseVector::create(BIGINT(), vectorSize, pool), + BaseVector::create(VARCHAR(), vectorSize, pool), + }; + + auto nationKeyVector = children[0]->asFlatVector(); + auto nameVector = children[1]->asFlatVector(); + auto regionKeyVector = children[2]->asFlatVector(); + auto commentVector = children[3]->asFlatVector(); + + // load_dists()/cleanup_dists() need to be called to ensure the global + // variables required by dbgen are populated. + load_dists(); + code_t code; + + // Dbgen generates the dataset one row at a time, so we need to transpose it + // into a columnar format. + for (size_t i = 0; i < vectorSize; ++i) { + row_start(NATION); + mk_nation(i + offset + 1, &code); + nationKeyVector->set(i, code.code); + nameVector->set(i, StringView(code.text, strlen(code.text))); + regionKeyVector->set(i, code.join); + commentVector->set(i, StringView(code.comment, code.clen)); + row_stop_h(NATION); + } + cleanup_dists(); + + return std::make_shared( + pool, nationRowType, BufferPtr(nullptr), vectorSize, std::move(children)); +} + +} // namespace facebook::velox::tpch diff --git a/velox/tpch/gen/TpchGen.h b/velox/tpch/gen/TpchGen.h new file mode 100644 index 000000000000..881d1e668649 --- /dev/null +++ b/velox/tpch/gen/TpchGen.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "velox/common/memory/Memory.h" +#include "velox/vector/ComplexVector.h" + +namespace facebook::velox::tpch { + +/// This file uses TPC-H DBGEN to generate data encoded using Velox Vectors. +/// +/// The basic input for the API is the TPC-H table name (the Table enum), the +/// TPC-H scale factor, the maximum batch size, and the offset. The common usage +/// is to make successive calls to this API advancing the offset parameter, +/// until all records were read. Clients might also assign different slices of +/// the range "[0, getRowCount(Table, scaleFactor)[" to different threads in +/// order to generate datasets in parallel. +/// +/// If not enough records are available given a particular scale factor and +/// offset, less than maxRows records might be returned. +/// +/// Data is always returned in a RowVector. + +enum class Table { + TBL_PART, + TBL_SUPPLIER, + TBL_PARTSUP, + TBL_CUSTOMER, + TBL_ORDERS, + TBL_LINEITEM, + TBL_NATION, + TBL_REGION, +}; + +/// Returns the row count for a particular TPC-H table given a scale factor, as +/// defined in the spec available at: +/// +/// https://www.tpc.org/tpch/ +constexpr size_t getRowCount(Table table, size_t scaleFactor); + +/// Returns a row vector containing at most `maxRows` rows of the "nation" +/// table, starting at `offset`, and given the scale factor. The row vector +/// returned has the following schema: +/// +/// n_nationkey: BIGINT +/// n_name: VARCHAR +/// n_regionkey: BIGINT +/// n_comment: VARCHAR +/// +RowVectorPtr genTpchNation( + size_t maxRows = 10000, + size_t offset = 0, + size_t scaleFactor = 1, + memory::MemoryPool* pool = + &velox::memory::getProcessDefaultMemoryManager().getRoot()); + +} // namespace facebook::velox::tpch diff --git a/velox/tpch/gen/tests/CMakeLists.txt b/velox/tpch/gen/tests/CMakeLists.txt new file mode 100644 index 000000000000..4607d4801a3e --- /dev/null +++ b/velox/tpch/gen/tests/CMakeLists.txt @@ -0,0 +1,19 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +add_executable(velox_tpch_gen_test TpchGenTest.cpp) + +add_test(velox_tpch_gen_test velox_tpch_gen_test) + +target_link_libraries(velox_tpch_gen_test velox_tpch_gen velox_type + velox_vector gtest gtest_main) diff --git a/velox/tpch/gen/tests/TpchGenTest.cpp b/velox/tpch/gen/tests/TpchGenTest.cpp new file mode 100644 index 000000000000..f975e7e453be --- /dev/null +++ b/velox/tpch/gen/tests/TpchGenTest.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gtest/gtest.h" + +#include "velox/tpch/gen/TpchGen.h" +#include "velox/type/StringView.h" +#include "velox/vector/FlatVector.h" + +namespace { + +using namespace facebook::velox; +using namespace facebook::velox::tpch; + +TEST(TpchGenTestNation, default) { + auto rowVector = genTpchNation(); + ASSERT_NE(rowVector, nullptr); + EXPECT_EQ(4, rowVector->childrenSize()); + EXPECT_EQ(25, rowVector->size()); + + auto nationKey = rowVector->childAt(0)->asFlatVector(); + auto nationName = rowVector->childAt(1)->asFlatVector(); + + EXPECT_EQ(0, nationKey->valueAt(0)); + EXPECT_EQ("ALGERIA"_sv, nationName->valueAt(0)); + + // Ensure we won't crash while accessing any of the columns. + LOG(INFO) << rowVector->toString(0); + + EXPECT_EQ(24, nationKey->valueAt(24)); + EXPECT_EQ("UNITED STATES"_sv, nationName->valueAt(24)); + LOG(INFO) << rowVector->toString(24); +} + +// Ensure scale factor doesn't affect Nation table. +TEST(TpchGenTestNation, scaleFactor) { + auto rowVector = genTpchNation(10000, 0, 1000); + ASSERT_NE(rowVector, nullptr); + + EXPECT_EQ(4, rowVector->childrenSize()); + EXPECT_EQ(25, rowVector->size()); +} + +TEST(TpchGenTestNation, smallBatch) { + auto rowVector = genTpchNation(10); + ASSERT_NE(rowVector, nullptr); + + EXPECT_EQ(4, rowVector->childrenSize()); + EXPECT_EQ(10, rowVector->size()); + + auto nationKey = rowVector->childAt(0)->asFlatVector(); + EXPECT_EQ(0, nationKey->valueAt(0)); + EXPECT_EQ(9, nationKey->valueAt(9)); +} + +TEST(TpchGenTestNation, smallBatchWithOffset) { + auto rowVector = genTpchNation(10, 5); + ASSERT_NE(rowVector, nullptr); + + EXPECT_EQ(4, rowVector->childrenSize()); + EXPECT_EQ(10, rowVector->size()); + + auto nationKey = rowVector->childAt(0)->asFlatVector(); + EXPECT_EQ(5, nationKey->valueAt(0)); + EXPECT_EQ(14, nationKey->valueAt(9)); +} + +TEST(TpchGenTestNation, smallBatchPastEnd) { + auto rowVector = genTpchNation(10, 20); + ASSERT_NE(rowVector, nullptr); + + EXPECT_EQ(4, rowVector->childrenSize()); + EXPECT_EQ(5, rowVector->size()); + + auto nationKey = rowVector->childAt(0)->asFlatVector(); + EXPECT_EQ(20, nationKey->valueAt(0)); + EXPECT_EQ(24, nationKey->valueAt(4)); +} + +} // namespace From d507c376ce64a30838c2dd9a6823eb694009d48a Mon Sep 17 00:00:00 2001 From: Pedro Eugenio Rocha Pedreira Date: Tue, 19 Apr 2022 20:29:20 -0700 Subject: [PATCH 036/198] Add dbgen support for Orders table (#1449) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1449 Add dbgen support for Orders table Reviewed By: mbasmanova Differential Revision: D35656757 fbshipit-source-id: 14fb0b4e06e782820e79d519c66eac40c763b2db --- velox/tpch/gen/TpchGen.cpp | 90 ++++++++++++++++++++++++++++ velox/tpch/gen/TpchGen.h | 21 +++++++ velox/tpch/gen/tests/TpchGenTest.cpp | 47 ++++++++++++++- 3 files changed, 157 insertions(+), 1 deletion(-) diff --git a/velox/tpch/gen/TpchGen.cpp b/velox/tpch/gen/TpchGen.cpp index 3bee2b53adf1..5c923ea27590 100644 --- a/velox/tpch/gen/TpchGen.cpp +++ b/velox/tpch/gen/TpchGen.cpp @@ -80,6 +80,93 @@ constexpr size_t getRowCount(Table table, size_t scaleFactor) { return 0; // make gcc happy. } +RowVectorPtr genTpchOrders( + size_t maxRows, + size_t offset, + size_t scaleFactor, + memory::MemoryPool* pool) { + size_t rowCount = getRowCount(Table::TBL_ORDERS, scaleFactor); + size_t vectorSize = std::min(rowCount - offset, maxRows); + + // DBGEN takes the scale factor through this ugly global variable. + scale = scaleFactor; + + // Create schema and allocate vectors. + static TypePtr ordersRowType = ROW( + { + "o_orderkey", + "o_custkey", + "o_orderstatus", + "o_totalprice", + "o_orderdate", + "o_orderpriority", + "o_clerk", + "o_shippriority", + "o_comment", + }, + { + BIGINT(), + BIGINT(), + VARCHAR(), + DOUBLE(), + VARCHAR(), + VARCHAR(), + VARCHAR(), + INTEGER(), + VARCHAR(), + }); + std::vector children = { + BaseVector::create(BIGINT(), vectorSize, pool), + BaseVector::create(BIGINT(), vectorSize, pool), + BaseVector::create(VARCHAR(), vectorSize, pool), + BaseVector::create(DOUBLE(), vectorSize, pool), + BaseVector::create(VARCHAR(), vectorSize, pool), + BaseVector::create(VARCHAR(), vectorSize, pool), + BaseVector::create(VARCHAR(), vectorSize, pool), + BaseVector::create(INTEGER(), vectorSize, pool), + BaseVector::create(VARCHAR(), vectorSize, pool), + }; + + auto orderKeyVector = children[0]->asFlatVector(); + auto custKeyVector = children[1]->asFlatVector(); + auto orderStatusVector = children[2]->asFlatVector(); + auto totalPriceVector = children[3]->asFlatVector(); + auto orderDateVector = children[4]->asFlatVector(); + auto orderPriorityVector = children[5]->asFlatVector(); + auto clerkVector = children[6]->asFlatVector(); + auto shipPriorityVector = children[7]->asFlatVector(); + auto commentVector = children[8]->asFlatVector(); + + // load_dists()/cleanup_dists() need to be called to ensure the global + // variables required by dbgen are populated. + load_dists(); + order_t order; + + // Dbgen generates the dataset one row at a time, so we need to transpose it + // into a columnar format. + for (size_t i = 0; i < vectorSize; ++i) { + row_start(ORDER); + mk_order(i + offset + 1, &order, 0); + + orderKeyVector->set(i, order.okey); + custKeyVector->set(i, order.custkey); + orderStatusVector->set(i, StringView(&order.orderstatus, 1)); + totalPriceVector->set(i, order.totalprice); + orderDateVector->set(i, StringView(order.odate, strlen(order.odate))); + orderPriorityVector->set( + i, StringView(order.opriority, strlen(order.opriority))); + clerkVector->set(i, StringView(order.clerk, strlen(order.clerk))); + shipPriorityVector->set(i, order.spriority); + commentVector->set(i, StringView(order.comment, order.clen)); + + row_stop_h(ORDER); + } + cleanup_dists(); + + return std::make_shared( + pool, ordersRowType, BufferPtr(nullptr), vectorSize, std::move(children)); +} + RowVectorPtr genTpchNation( size_t maxRows, size_t offset, @@ -88,6 +175,9 @@ RowVectorPtr genTpchNation( size_t rowCount = getRowCount(Table::TBL_NATION, scaleFactor); size_t vectorSize = std::min(rowCount - offset, maxRows); + // DBGEN takes the scale factor through this ugly global variable. + scale = scaleFactor; + // Create schema and allocate vectors. static TypePtr nationRowType = ROW({"n_nationkey", "n_name", "n_regionkey", "n_comment"}, diff --git a/velox/tpch/gen/TpchGen.h b/velox/tpch/gen/TpchGen.h index 881d1e668649..99f064e17bd0 100644 --- a/velox/tpch/gen/TpchGen.h +++ b/velox/tpch/gen/TpchGen.h @@ -52,6 +52,27 @@ enum class Table { /// https://www.tpc.org/tpch/ constexpr size_t getRowCount(Table table, size_t scaleFactor); +/// Returns a row vector containing at most `maxRows` rows of the "orders" +/// table, starting at `offset`, and given the scale factor. The row vector +/// returned has the following schema: +/// +/// o_orderkey: BIGINT +/// o_custkey: BIGINT +/// o_orderstatus: VARCHAR +/// o_totalprice: DOUBLE +/// o_orderdate: VARCHAR +/// o_orderpriority: VARCHAR +/// o_clerk: VARCHAR +/// o_shippriority: INTEGER +/// o_comment: VARCHAR +/// +RowVectorPtr genTpchOrders( + size_t maxRows = 10000, + size_t offset = 0, + size_t scaleFactor = 1, + memory::MemoryPool* pool = + &velox::memory::getProcessDefaultMemoryManager().getRoot()); + /// Returns a row vector containing at most `maxRows` rows of the "nation" /// table, starting at `offset`, and given the scale factor. The row vector /// returned has the following schema: diff --git a/velox/tpch/gen/tests/TpchGenTest.cpp b/velox/tpch/gen/tests/TpchGenTest.cpp index f975e7e453be..afa458eb8a41 100644 --- a/velox/tpch/gen/tests/TpchGenTest.cpp +++ b/velox/tpch/gen/tests/TpchGenTest.cpp @@ -47,7 +47,7 @@ TEST(TpchGenTestNation, default) { // Ensure scale factor doesn't affect Nation table. TEST(TpchGenTestNation, scaleFactor) { - auto rowVector = genTpchNation(10000, 0, 1000); + auto rowVector = genTpchNation(10'000, 0, 1'000); ASSERT_NE(rowVector, nullptr); EXPECT_EQ(4, rowVector->childrenSize()); @@ -90,4 +90,49 @@ TEST(TpchGenTestNation, smallBatchPastEnd) { EXPECT_EQ(24, nationKey->valueAt(4)); } +TEST(TpchGenTestOrders, batches) { + auto rowVector1 = genTpchOrders(10'000); + + EXPECT_EQ(9, rowVector1->childrenSize()); + EXPECT_EQ(10'000, rowVector1->size()); + + auto orderKey = rowVector1->childAt(0)->asFlatVector(); + auto orderDate = rowVector1->childAt(4)->asFlatVector(); + + EXPECT_EQ(1, orderKey->valueAt(0)); + EXPECT_EQ("1996-01-02"_sv, orderDate->valueAt(0)); + LOG(INFO) << rowVector1->toString(0); + + EXPECT_EQ(40'000, orderKey->valueAt(9999)); + EXPECT_EQ("1995-01-30"_sv, orderDate->valueAt(9999)); + LOG(INFO) << rowVector1->toString(9999); + + // Get second batch. + auto rowVector2 = genTpchOrders(10'000, 10'000); + + EXPECT_EQ(9, rowVector2->childrenSize()); + EXPECT_EQ(10'000, rowVector2->size()); + + orderKey = rowVector2->childAt(0)->asFlatVector(); + orderDate = rowVector2->childAt(4)->asFlatVector(); + + EXPECT_EQ(40001, orderKey->valueAt(0)); + EXPECT_EQ("1995-02-25"_sv, orderDate->valueAt(0)); + LOG(INFO) << rowVector2->toString(0); + + EXPECT_EQ(80000, orderKey->valueAt(9999)); + EXPECT_EQ("1995-12-15"_sv, orderDate->valueAt(9999)); + LOG(INFO) << rowVector2->toString(9999); +} + +TEST(TpchGenTestOrders, lastBatch) { + // Ask for 200 but there are only 100 left. + auto rowVector = genTpchOrders(200, 1'499'900); + EXPECT_EQ(100, rowVector->size()); + + // Ensure we get 200 on a larger scale factor. + rowVector = genTpchOrders(200, 1'499'900, 2); + EXPECT_EQ(200, rowVector->size()); +} + } // namespace From 95ef2db7582dcf3ff8ce5e800ca96d47b8bd94aa Mon Sep 17 00:00:00 2001 From: svcscm svcscm Date: Tue, 19 Apr 2022 22:55:31 -0700 Subject: [PATCH 037/198] Updating submodules Summary: GitHub commits: https://github.com/facebook/folly/commit/a56b1999d9e993509256e3391d48013dca052809 https://github.com/facebookresearch/recipes/commit/548d50dff33080c5f0d80eb60ac43ebc5238b0d4 Reviewed By: yns88 fbshipit-source-id: 27664197f27d0bdd24e0fa277ab7e5aa908d9b92 --- build/deps/github_hashes/facebook/folly-rev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index b4aabbf281f5..edafe8995954 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit f147eac7d9b556422cfb6119144e438df84dba41 +Subproject commit a56b1999d9e993509256e3391d48013dca052809 From b25bc7b56ec90247e7347180febfde8ccd8afd31 Mon Sep 17 00:00:00 2001 From: svcscm svcscm Date: Wed, 20 Apr 2022 00:59:05 -0700 Subject: [PATCH 038/198] Updating submodules Summary: GitHub commits: https://github.com/facebook/cachelib/commit/cd396f9aabd7329c63969153ac3d0bdf1cc95709 https://github.com/facebook/fb303/commit/a193d6a3f857aa44d7306131e2ba298e7464cdc2 https://github.com/facebook/fbzmq/commit/7f689286550b21d239167b04366b3292ed99cf11 https://github.com/facebook/folly/commit/77de3e8432f53b8989e8dd404ee79a5c5aaf4069 https://github.com/facebook/watchman/commit/c0f03b2ad2184a1d607e968090cdafa0d3be3c02 https://github.com/facebookexperimental/rust-shed/commit/ca9b32f2102ed281a464b1b72372adee6e87ff9f Reviewed By: yns88 fbshipit-source-id: 803665c9dea1ae583593e9bacbea60efa2c328d7 --- build/deps/github_hashes/facebook/folly-rev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/deps/github_hashes/facebook/folly-rev.txt b/build/deps/github_hashes/facebook/folly-rev.txt index edafe8995954..77d7e33cce4f 100644 --- a/build/deps/github_hashes/facebook/folly-rev.txt +++ b/build/deps/github_hashes/facebook/folly-rev.txt @@ -1 +1 @@ -Subproject commit a56b1999d9e993509256e3391d48013dca052809 +Subproject commit 77de3e8432f53b8989e8dd404ee79a5c5aaf4069 From 4d1025c57f94733b83969eda35816444cfa578a6 Mon Sep 17 00:00:00 2001 From: Sergey Pershin Date: Wed, 20 Apr 2022 14:39:30 -0700 Subject: [PATCH 039/198] Remove obsolete TODO and add a new one. (#1452) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/1452 Remove obsolete TODO and add a new one. No code changes. Reviewed By: amitkdutta Differential Revision: D35788780 fbshipit-source-id: 294423d23b3ec7cbd80b33b1be7be28fde820ffa --- .../prestosql/aggregates/MinMaxByAggregates.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/velox/functions/prestosql/aggregates/MinMaxByAggregates.cpp b/velox/functions/prestosql/aggregates/MinMaxByAggregates.cpp index e48752d289b0..94b36d33112f 100644 --- a/velox/functions/prestosql/aggregates/MinMaxByAggregates.cpp +++ b/velox/functions/prestosql/aggregates/MinMaxByAggregates.cpp @@ -459,6 +459,9 @@ std::unique_ptr create( template