diff --git a/velox/exec/fuzzer/AggregationFuzzer.cpp b/velox/exec/fuzzer/AggregationFuzzer.cpp index 456c9644e79e2..088122e146e79 100644 --- a/velox/exec/fuzzer/AggregationFuzzer.cpp +++ b/velox/exec/fuzzer/AggregationFuzzer.cpp @@ -363,7 +363,7 @@ void AggregationFuzzer::go() { auto partitionKeys = generateKeys("p", argNames, argTypes); auto sortingKeys = generateSortingKeys("s", argNames, argTypes); - auto input = generateInputDataWithRowNumber( + auto input = generateInputDataForWindowFuzzer( argNames, argTypes, partitionKeys, signature); logVectors(input); diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.cpp b/velox/exec/fuzzer/AggregationFuzzerBase.cpp index 310a61629361f..1f9cb239f3661 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.cpp +++ b/velox/exec/fuzzer/AggregationFuzzerBase.cpp @@ -236,14 +236,41 @@ std::vector AggregationFuzzerBase::generateKeys( std::vector AggregationFuzzerBase::generateSortingKeys( const std::string& prefix, std::vector& names, - std::vector& types) { + std::vector& types, + const bool hasRowNumberKey, + const bool rangeFrame) { std::vector keys; - auto numKeys = boost::random::uniform_int_distribution(1, 5)(rng_); + vector_size_t numKeys; + vector_size_t maxDepth; + std::vector sortingKeyTypes = kScalarTypes; + + // If frame has kRange bound, only one sorting key should be present. If the + // row_number column is not present, generate this sorting key randomly; use + // the row_number column as sorting key otherwise. + if (rangeFrame) { + if (hasRowNumberKey) { + return keys; + } else { + numKeys = 1; + // Pick scalar type which supports '+', '-' binary operations. + sortingKeyTypes = { + TINYINT(), + SMALLINT(), + INTEGER(), + BIGINT(), + HUGEINT(), + REAL(), + DOUBLE()}; + maxDepth = 0; + } + } else { + numKeys = randInt(1, 5); + maxDepth = 2; + } + for (auto i = 0; i < numKeys; ++i) { keys.push_back(fmt::format("{}{}", prefix, i)); - - // Pick random, possibly complex, type. 
- types.push_back(vectorFuzzer_.randOrderableType(2)); + types.push_back(vectorFuzzer_.randOrderableType(maxDepth, sortingKeyTypes)); names.push_back(keys.back()); } @@ -296,13 +323,17 @@ std::vector AggregationFuzzerBase::generateInputData( return input; } -std::vector AggregationFuzzerBase::generateInputDataWithRowNumber( +std::vector +AggregationFuzzerBase::generateInputDataForWindowFuzzer( std::vector names, std::vector types, const std::vector& partitionKeys, - const CallableSignature& signature) { - names.push_back("row_number"); - types.push_back(BIGINT()); + const CallableSignature& signature, + const bool hasRowNumberKey) { + if (hasRowNumberKey) { + names.push_back("row_number"); + types.push_back(BIGINT()); + } auto generator = findInputGenerator(signature); @@ -329,7 +360,8 @@ std::vector AggregationFuzzerBase::generateInputDataWithRowNumber( auto numPartitions = size ? randInt(1, size) : 1; auto indices = vectorFuzzer_.fuzzIndices(size, numPartitions); auto nulls = vectorFuzzer_.fuzzNulls(size); - for (auto i = children.size(); i < types.size() - 1; ++i) { + auto n = hasRowNumberKey ? types.size() - 1 : types.size(); + for (auto i = children.size(); i < n; ++i) { if (partitionKeySet.find(names[i]) != partitionKeySet.end()) { // The partition keys are built with a dictionary over a smaller set of // values. 
This is done to introduce some repetition of key values for @@ -341,8 +373,10 @@ std::vector AggregationFuzzerBase::generateInputDataWithRowNumber( children.push_back(vectorFuzzer_.fuzz(types[i], size)); } } - children.push_back(vectorMaker.flatVector( - size, [&](auto /*row*/) { return rowNumber++; })); + if (hasRowNumberKey) { + children.push_back(vectorMaker.flatVector( + size, [&](auto /*row*/) { return rowNumber++; })); + } input.push_back(vectorMaker.rowVector(names, children)); } diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.h b/velox/exec/fuzzer/AggregationFuzzerBase.h index 32189bf89af59..0e52c5975afd8 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.h +++ b/velox/exec/fuzzer/AggregationFuzzerBase.h @@ -185,11 +185,18 @@ class AggregationFuzzerBase { std::vector& types); // Similar to generateKeys, but restricts types to orderable types (i.e. no - // maps). + // maps). For range frames with k preceding/following frame bounds: + // 1. hasRowNumberKey indicates whether the row_number column should be used + // as the sorting key. The row_number key is added for consistent result + // verification when the function is order dependent or when the frame is + // of ROWS type. + // 2. rangeFrame must be set to true. std::vector generateSortingKeys( const std::string& prefix, std::vector& names, - std::vector& types); + std::vector& types, + const bool hasRowNumberKey = true, + const bool rangeFrame = false); std::pair pickSignature(); @@ -202,11 +209,12 @@ class AggregationFuzzerBase { // child named "row_number" of BIGINT row numbers that differentiates every // row. Row numbers start from 0. This additional input vector is needed for // result verification of window aggregations. 
- std::vector generateInputDataWithRowNumber( + std::vector generateInputDataForWindowFuzzer( std::vector names, std::vector types, const std::vector& partitionKeys, - const CallableSignature& signature); + const CallableSignature& signature, + const bool hasRowNumberKey = true); std::pair, ReferenceQueryErrorCode> computeReferenceResults( diff --git a/velox/exec/fuzzer/PrestoQueryRunner.cpp b/velox/exec/fuzzer/PrestoQueryRunner.cpp index 99791ff0cb630..82455279b4bfa 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.cpp +++ b/velox/exec/fuzzer/PrestoQueryRunner.cpp @@ -169,6 +169,7 @@ PrestoQueryRunner::PrestoQueryRunner( user_{std::move(user)}, timeout_(timeout) { eventBaseThread_.start("PrestoQueryRunner"); + queryRunnerContext_ = std::make_shared(); } std::optional PrestoQueryRunner::toSql( @@ -431,56 +432,6 @@ std::optional PrestoQueryRunner::toSql( return sql.str(); } -namespace { - -void appendWindowFrame( - const core::WindowNode::Frame& frame, - std::stringstream& sql) { - // TODO: Add support for k Range Frames by retrieving the original range bound - // from WindowNode. 
- switch (frame.type) { - case core::WindowNode::WindowType::kRange: - sql << " RANGE"; - break; - case core::WindowNode::WindowType::kRows: - sql << " ROWS"; - break; - default: - VELOX_UNREACHABLE(); - } - sql << " BETWEEN"; - - auto appendBound = [&sql]( - const core::WindowNode::BoundType& bound, - const core::TypedExprPtr& value) { - switch (bound) { - case core::WindowNode::BoundType::kUnboundedPreceding: - sql << " UNBOUNDED PRECEDING"; - break; - case core::WindowNode::BoundType::kUnboundedFollowing: - sql << " UNBOUNDED FOLLOWING"; - break; - case core::WindowNode::BoundType::kCurrentRow: - sql << " CURRENT ROW"; - break; - case core::WindowNode::BoundType::kPreceding: - sql << " " << value->toString() << " PRECEDING"; - break; - case core::WindowNode::BoundType::kFollowing: - sql << " " << value->toString() << " FOLLOWING"; - break; - default: - VELOX_UNREACHABLE(); - } - }; - - appendBound(frame.startType, frame.startValue); - sql << " AND"; - appendBound(frame.endType, frame.endValue); -} - -} // namespace - std::optional PrestoQueryRunner::toSql( const std::shared_ptr& windowNode) { if (!isSupportedDwrfType(windowNode->sources()[0]->outputType())) { @@ -525,7 +476,9 @@ std::optional PrestoQueryRunner::toSql( } } - appendWindowFrame(functions[i].frame, sql); + auto frameClause = + queryRunnerContext_->windowFrames_.at(windowNode->id()).back(); + sql << frameClause; sql << ")"; } diff --git a/velox/exec/fuzzer/ReferenceQueryRunner.h b/velox/exec/fuzzer/ReferenceQueryRunner.h index 8420380ddb074..6582be888c842 100644 --- a/velox/exec/fuzzer/ReferenceQueryRunner.h +++ b/velox/exec/fuzzer/ReferenceQueryRunner.h @@ -19,6 +19,11 @@ namespace facebook::velox::exec::test { +class QueryRunnerContext { + public: + std::unordered_map> windowFrames_; +}; + /// Query runner that uses reference database, i.e. DuckDB, Presto, Spark. 
class ReferenceQueryRunner { public: @@ -78,6 +83,7 @@ class ReferenceQueryRunner { const std::string& sessionProperty) { VELOX_UNSUPPORTED(); } -}; + std::shared_ptr queryRunnerContext_; +}; } // namespace facebook::velox::exec::test diff --git a/velox/exec/fuzzer/WindowFuzzer.cpp b/velox/exec/fuzzer/WindowFuzzer.cpp index 734e87a702ac3..2f50e9dea7854 100644 --- a/velox/exec/fuzzer/WindowFuzzer.cpp +++ b/velox/exec/fuzzer/WindowFuzzer.cpp @@ -18,6 +18,7 @@ #include #include "velox/common/base/Portability.h" +#include "velox/exec/fuzzer/PrestoQueryRunner.h" #include "velox/exec/tests/utils/PlanBuilder.h" #include "velox/exec/tests/utils/TempDirectoryPath.h" @@ -60,20 +61,132 @@ void WindowFuzzer::addWindowFunctionSignatures( } } -std::tuple WindowFuzzer::generateFrameClause() { - auto frameType = [](int value) -> const std::string { - switch (value) { - case 0: - return "RANGE"; - case 1: - return "ROWS"; - default: - VELOX_UNREACHABLE("Unknown value for frame type generation"); - } +std::tuple< + core::WindowNode::WindowType, + core::WindowNode::BoundType, + core::WindowNode::BoundType> +WindowFuzzer::frameWindowTypeAndBoundType() { + // Randomly select if ROWS or RANGE frame + auto windowType = vectorFuzzer_.coinToss(0.1) + ? 
core::WindowNode::WindowType::kRows + : core::WindowNode::WindowType::kRange; + + const std::vector startBoundOptions = { + core::WindowNode::BoundType::kUnboundedPreceding, + core::WindowNode::BoundType::kPreceding, + core::WindowNode::BoundType::kCurrentRow, + core::WindowNode::BoundType::kFollowing}; + const std::vector endBoundOptions = { + core::WindowNode::BoundType::kPreceding, + core::WindowNode::BoundType::kCurrentRow, + core::WindowNode::BoundType::kFollowing, + core::WindowNode::BoundType::kUnboundedFollowing}; + + auto startBoundIndex = boost::random::uniform_int_distribution( + 0, startBoundOptions.size() - 1)(rng_); + auto endBoundMinIdx = std::max(0, static_cast(startBoundIndex) - 1); + auto endBoundIndex = boost::random::uniform_int_distribution( + endBoundMinIdx, endBoundOptions.size() - 1)(rng_); + auto frameStartBoundType = startBoundOptions[startBoundIndex]; + auto frameEndBoundType = endBoundOptions[endBoundIndex]; + + return std::make_tuple(windowType, frameStartBoundType, frameEndBoundType); +} + +// For frames with k RANGE PRECEDING/FOLLOWING, Velox requires the application +// to add columns with the range frame boundary value computed according to the +// frame type. 
+// If the frame is k PRECEDING : +// frame_boundary_value = current_order_by - k (for ascending ORDER BY) +// frame_boundary_value = current_order_by + k (for descending ORDER BY) +// If the frame is k FOLLOWING : +// frame_boundary_value = current_order_by + k (for ascending ORDER BY) +// frame_boundary_value = current_order_by - k (for descending ORDER BY) +template +T WindowFuzzer::genOffsetAtIdx( + const T& offsetColumnVal, + T offsetValue, + core::WindowNode::BoundType frameBoundType, + core::SortOrder sortOrder) { + auto isPreceding = [&](core::WindowNode::BoundType boundType) { + return boundType == core::WindowNode::BoundType::kPreceding; }; - auto isRowsFrame = - boost::random::uniform_int_distribution(0, 1)(rng_); - auto frameTypeString = frameType(isRowsFrame); + if ((isPreceding(frameBoundType) && sortOrder.isAscending()) || + (!isPreceding(frameBoundType) && !sortOrder.isAscending())) { + if constexpr (std::is_same_v || std::is_same_v) { + return offsetColumnVal - offsetValue; + } else if constexpr (std::is_integral_v) { + return checkedMinus(offsetColumnVal, offsetValue); + } + } + + if ((!isPreceding(frameBoundType) && sortOrder.isAscending()) || + (isPreceding(frameBoundType) && !sortOrder.isAscending())) { + if constexpr (std::is_same_v || std::is_same_v) { + return offsetColumnVal + offsetValue; + } else if constexpr (std::is_integral_v) { + return checkedPlus(offsetColumnVal, offsetValue); + } + } + + VELOX_UNREACHABLE( + "Offset cannot be generated: sortOrder ascending {}, frameBoundType {}", + sortOrder.toString(), + core::WindowNode::boundTypeName(frameBoundType)); +} + +template +const std::string WindowFuzzer::addKRangeOffsetColumnToInput( + std::vector& input, + core::WindowNode::BoundType frameBoundType, + std::string& columnName, + SortingKeyAndOrder& orderByKey) { + auto type = CppToType::create(); + VectorPtr fuzzOffset = vectorFuzzer_.fuzzConstant(type, 1); + const T offsetValue = fuzzOffset->as>()->valueAt(0); + const auto size = 
vectorFuzzer_.getOptions().vectorSize; + velox::test::VectorMaker vectorMaker{pool_.get()}; + + for (auto i = 0; i < FLAGS_num_batches; i++) { + BufferPtr values = AlignedBuffer::allocate(size, pool_.get()); + auto* valuesPtr = values->asMutable(); + BufferPtr nulls = allocateNulls(size, pool_.get()); + auto* rawNulls = nulls->asMutable(); + auto orderByCol = input[i]->childAt(orderByKey.key_); + const SelectivityVector allRows(size); + DecodedVector decodedVector(*orderByCol, allRows); + + for (auto j = 0; j < size; j++) { + if (decodedVector.isNullAt(j)) { + bits::setNull(rawNulls, j, true); + } else { + valuesPtr[j] = genOffsetAtIdx( + decodedVector.valueAt(j), + offsetValue, + frameBoundType, + orderByKey.sortOrder_); + } + } + + auto offsetColumn = std::make_shared>( + pool_.get(), type, nulls, size, values, std::vector{}); + + auto names = input[i]->type()->asRow().names(); + names.push_back(columnName); + auto children = input[i]->children(); + children.push_back(offsetColumn); + input[i] = vectorMaker.rowVector(names, children); + } + + return fuzzOffset->toString(0); +} + +std::string WindowFuzzer::generateFrameClause( + core::WindowNode::WindowType windowType, + core::WindowNode::BoundType startBoundType, + core::WindowNode::BoundType endBoundType, + bool sqlFrame) { + auto frameType = core::WindowNode::windowTypeName(windowType); constexpr int64_t kMax = std::numeric_limits::max(); constexpr int64_t kMin = std::numeric_limits::min(); @@ -88,21 +201,39 @@ std::tuple WindowFuzzer::generateFrameClause() { maxKValue = kMax; } - auto frameBound = - [minKValue, maxKValue, this]( - core::WindowNode::BoundType boundType) -> const std::string { + auto frameBound = [&](core::WindowNode::BoundType boundType, + std::string columnName, + bool isStartBound) -> const std::string { // Generating only constant bounded k PRECEDING/FOLLOWING frames for now. 
auto kValue = boost::random::uniform_int_distribution( minKValue, maxKValue)(rng_); switch (boundType) { case core::WindowNode::BoundType::kUnboundedPreceding: return "UNBOUNDED PRECEDING"; - case core::WindowNode::BoundType::kPreceding: + case core::WindowNode::BoundType::kPreceding: { + if (windowType == core::WindowNode::WindowType::kRange) { + if (sqlFrame) { + auto sqlFrameBound = + isStartBound ? prestoFrames_[0].first : prestoFrames_[0].second; + return fmt::format("{} PRECEDING", sqlFrameBound); + } + return fmt::format("{} PRECEDING", columnName); + } return fmt::format("{} PRECEDING", kValue); + } case core::WindowNode::BoundType::kCurrentRow: return "CURRENT ROW"; - case core::WindowNode::BoundType::kFollowing: + case core::WindowNode::BoundType::kFollowing: { + if (windowType == core::WindowNode::WindowType::kRange) { + if (sqlFrame) { + auto sqlFrameBound = + isStartBound ? prestoFrames_[0].first : prestoFrames_[0].second; + return fmt::format("{} FOLLOWING", sqlFrameBound); + } + return fmt::format("{} FOLLOWING", columnName); + } return fmt::format("{} FOLLOWING", kValue); + } case core::WindowNode::BoundType::kUnboundedFollowing: return "UNBOUNDED FOLLOWING"; default: @@ -110,43 +241,10 @@ std::tuple WindowFuzzer::generateFrameClause() { } }; - // Generating k PRECEDING and k FOLLOWING frames only for ROWS type. - // k RANGE frames require more work as we have to generate columns with the - // frame bound values. 
- std::vector startBoundOptions, endBoundOptions; - if (isRowsFrame) { - startBoundOptions = { - core::WindowNode::BoundType::kUnboundedPreceding, - core::WindowNode::BoundType::kPreceding, - core::WindowNode::BoundType::kCurrentRow, - core::WindowNode::BoundType::kFollowing}; - endBoundOptions = { - core::WindowNode::BoundType::kPreceding, - core::WindowNode::BoundType::kCurrentRow, - core::WindowNode::BoundType::kFollowing, - core::WindowNode::BoundType::kUnboundedFollowing}; - } else { - startBoundOptions = { - core::WindowNode::BoundType::kUnboundedPreceding, - core::WindowNode::BoundType::kCurrentRow}; - endBoundOptions = { - core::WindowNode::BoundType::kCurrentRow, - core::WindowNode::BoundType::kUnboundedFollowing}; - } + auto frameStart = frameBound(startBoundType, "k0", true); + auto frameEnd = frameBound(endBoundType, "k1", false); - // End bound option should not be greater than start bound option as this - // would result in an invalid frame. - auto startBoundIndex = boost::random::uniform_int_distribution( - 0, startBoundOptions.size() - 1)(rng_); - auto endBoundMinIdx = std::max(0, static_cast(startBoundIndex) - 1); - auto endBoundIndex = boost::random::uniform_int_distribution( - endBoundMinIdx, endBoundOptions.size() - 1)(rng_); - auto frameStart = frameBound(startBoundOptions[startBoundIndex]); - auto frameEnd = frameBound(endBoundOptions[endBoundIndex]); - - return std::make_tuple( - frameTypeString + " BETWEEN " + frameStart + " AND " + frameEnd, - isRowsFrame); + return fmt::format(" {} BETWEEN {} AND {}", frameType, frameStart, frameEnd); } std::string WindowFuzzer::generateOrderByClause( @@ -173,15 +271,18 @@ std::string WindowFuzzer::getFrame( if (!sortingKeysAndOrders.empty()) { frame << generateOrderByClause(sortingKeysAndOrders); } - frame << " " << frameClause; + frame << frameClause; return frame.str(); } std::vector WindowFuzzer::generateSortingKeysAndOrders( const std::string& prefix, std::vector& names, - std::vector& types) { - auto 
keys = generateSortingKeys(prefix, names, types); + std::vector& types, + const bool hasRowNumberKey, + const bool isKRangeFrame) { + auto keys = + generateSortingKeys(prefix, names, types, hasRowNumberKey, isKRangeFrame); std::vector results; for (auto i = 0; i < keys.size(); ++i) { auto asc = vectorFuzzer_.coinToss(0.5); @@ -191,6 +292,48 @@ std::vector WindowFuzzer::generateSortingKeysAndOrders( return results; } +template +void WindowFuzzer::addOffsetColumnsToInput( + std::vector& input, + core::WindowNode::BoundType startBoundType, + core::WindowNode::BoundType endBoundType, + SortingKeyAndOrder& orderByKey) { + auto isKBound = [](core::WindowNode::BoundType boundType) { + return (boundType == core::WindowNode::BoundType::kPreceding) || + (boundType == core::WindowNode::BoundType::kFollowing); + }; + const auto isFrameStartKBound = isKBound(startBoundType); + const auto isFrameEndKBound = isKBound(endBoundType); + + using TCpp = typename TypeTraits::NativeType; + constexpr bool isOffsetTypeValid = !( + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v); + VELOX_CHECK( + isOffsetTypeValid, + "Invalid type {} for offset column", + TypeTraits::name); + + std::string colName; + std::pair prestoFrames; + if (isFrameStartKBound) { + colName = "k0"; + prestoFrames.first = addKRangeOffsetColumnToInput( + input, startBoundType, colName, orderByKey); + } + if (isFrameEndKBound) { + colName = "k1"; + prestoFrames.second = addKRangeOffsetColumnToInput( + input, endBoundType, colName, orderByKey); + } + + // Currently only one window operator is tested by the window fuzzer in the + // window plan node. 
+ prestoFrames_.clear(); + prestoFrames_.reserve(1); + prestoFrames_.push_back(prestoFrames); +} + void WindowFuzzer::go() { VELOX_CHECK( FLAGS_steps > 0 || FLAGS_duration_sec > 0, @@ -226,23 +369,81 @@ void WindowFuzzer::go() { const auto call = makeFunctionCall(signature.name, argNames, false, false, ignoreNulls); - std::vector sortingKeysAndOrders; - // 50% chance without order-by clause. - if (vectorFuzzer_.coinToss(0.5)) { - sortingKeysAndOrders = - generateSortingKeysAndOrders("s", argNames, argTypes); - } + core::WindowNode::WindowType frameType; + core::WindowNode::BoundType startBoundType; + core::WindowNode::BoundType endBoundType; + std::tie(frameType, startBoundType, endBoundType) = + frameWindowTypeAndBoundType(); + + auto isKBoundFrame = [&](core::WindowNode::BoundType boundType) { + return ( + boundType == core::WindowNode::BoundType::kPreceding || + boundType == core::WindowNode::BoundType::kFollowing); + }; + + bool isKRangeFrame = frameType == core::WindowNode::WindowType::kRange && + (isKBoundFrame(startBoundType) || isKBoundFrame(endBoundType)); + auto hasRowNumberKey = + requireSortedInput || frameType == core::WindowNode::WindowType::kRows; + const auto partitionKeys = generateSortingKeys("p", argNames, argTypes); - const auto [frameClause, isRowsFrame] = generateFrameClause(); - const auto input = generateInputDataWithRowNumber( - argNames, argTypes, partitionKeys, signature); + std::vector sortingKeysAndOrders; // If the function is order-dependent or uses "rows" frame, sort all input // rows by row_number additionally. - if (requireSortedInput || isRowsFrame) { + if (hasRowNumberKey) { sortingKeysAndOrders.emplace_back("row_number", core::kAscNullsLast); ++stats_.numSortedInputs; } + TypeKind orderByTypeKind; + // kRange frames need only one order by key. This should be row_number for + // functions that are order dependent. 
+ if (isKRangeFrame) { + sortingKeysAndOrders = generateSortingKeysAndOrders( + "s", argNames, argTypes, hasRowNumberKey, isKRangeFrame); + orderByTypeKind = argTypes[argTypes.size() - 1]->kind(); + } else if (vectorFuzzer_.coinToss(0.5)) { + // 50% chance without order-by clause. + sortingKeysAndOrders = + generateSortingKeysAndOrders("s", argNames, argTypes); + } + + auto input = generateInputDataForWindowFuzzer( + argNames, argTypes, partitionKeys, signature, hasRowNumberKey); + if (isKRangeFrame) { + // Catch possible type overflow errors when generating offset columns. + try { + VELOX_USER_CHECK( + sortingKeysAndOrders.size() == 1, + "Window with k PRECEDING/FOLLOWING frame bounds should have a single ORDER-BY key"); + auto orderByKey = sortingKeysAndOrders[0]; + VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( + addOffsetColumnsToInput, + orderByTypeKind, + input, + startBoundType, + endBoundType, + orderByKey); + } catch (VeloxUserError& e) { + continue; + } catch (VeloxRuntimeError& e) { + throw e; + } + } + + const auto frameClause = + generateFrameClause(frameType, startBoundType, endBoundType); + std::string prestoFrameClause; + if (isKRangeFrame && FLAGS_enable_window_reference_verification) { + if (auto* prestoQueryRunner = + dynamic_cast(referenceQueryRunner_.get())) { + prestoFrameClause = + generateFrameClause(frameType, startBoundType, endBoundType, true); + } + } else { + prestoFrameClause = frameClause; + } + logVectors(input); bool failed = verifyWindow( @@ -253,7 +454,8 @@ void WindowFuzzer::go() { input, customVerification, customVerifier, - FLAGS_enable_window_reference_verification); + FLAGS_enable_window_reference_verification, + prestoFrameClause); if (failed) { signatureWithStats.second.numFailed++; } @@ -380,17 +582,20 @@ bool WindowFuzzer::verifyWindow( const std::vector& input, bool customVerification, const std::shared_ptr& customVerifier, - bool enableWindowVerification) { + bool enableWindowVerification, + const std::string& prestoFrameClause) { 
SCOPE_EXIT { if (customVerifier) { customVerifier->reset(); } }; + core::PlanNodeId windowNodeId; auto frame = getFrame(partitionKeys, sortingKeysAndOrders, frameClause); auto plan = PlanBuilder() .values(input) .window({fmt::format("{} over ({})", functionCall, frame)}) + .capturePlanNodeId(windowNodeId) .planNode(); if (persistAndRunOnce_) { @@ -406,6 +611,8 @@ bool WindowFuzzer::verifyWindow( if (!customVerification) { if (resultOrError.result && enableWindowVerification) { + referenceQueryRunner_->queryRunnerContext_->windowFrames_[windowNodeId] + .push_back(prestoFrameClause); auto referenceResult = computeReferenceResults(plan, input); stats_.updateReferenceQueryStats(referenceResult.second); if (auto expectedResult = referenceResult.first) { diff --git a/velox/exec/fuzzer/WindowFuzzer.h b/velox/exec/fuzzer/WindowFuzzer.h index f53e26098fb4f..767a6d9c05518 100644 --- a/velox/exec/fuzzer/WindowFuzzer.h +++ b/velox/exec/fuzzer/WindowFuzzer.h @@ -79,9 +79,42 @@ class WindowFuzzer : public AggregationFuzzerBase { private: void addWindowFunctionSignatures(const WindowFunctionMap& signatureMap); + std::tuple< + core::WindowNode::WindowType, + core::WindowNode::BoundType, + core::WindowNode::BoundType> + frameWindowTypeAndBoundType(); + + template + void addOffsetColumnsToInput( + std::vector& input, + core::WindowNode::BoundType startBoundType, + core::WindowNode::BoundType endBoundType, + SortingKeyAndOrder& orderByKey); + + // Add offset column to input data for k-range frames. Returns the value of K + // as a string. 
+ template + const std::string addKRangeOffsetColumnToInput( + std::vector& input, + core::WindowNode::BoundType frameBoundType, + std::string& columnName, + SortingKeyAndOrder& orderByKey); + + template + T genOffsetAtIdx( + const T& offsetCol, + T offsetValue, + core::WindowNode::BoundType frameBoundType, + core::SortOrder sortOrder); + // Return a randomly generated frame clause string together with a boolean // flag indicating whether it is a ROWS frame. - std::tuple generateFrameClause(); + std::string generateFrameClause( + core::WindowNode::WindowType windowType, + core::WindowNode::BoundType startBoundType, + core::WindowNode::BoundType endBoundType, + bool sqlFrame = false); std::string generateOrderByClause( const std::vector& sortingKeysAndOrders); @@ -94,7 +127,9 @@ class WindowFuzzer : public AggregationFuzzerBase { std::vector generateSortingKeysAndOrders( const std::string& prefix, std::vector& names, - std::vector& types); + std::vector& types, + const bool hasRowNumberKey = true, + const bool isKRangeFrame = false); // Return 'true' if query plans failed. bool verifyWindow( @@ -105,7 +140,8 @@ class WindowFuzzer : public AggregationFuzzerBase { const std::vector& input, bool customVerification, const std::shared_ptr& customVerifier, - bool enableWindowVerification); + bool enableWindowVerification, + const std::string& prestoFrameClause); void testAlternativePlans( const std::vector& partitionKeys, @@ -124,6 +160,27 @@ class WindowFuzzer : public AggregationFuzzerBase { void print(size_t numIterations) const; } stats_; + + // For k PRECEDING/FOLLOWING frame bounds in RANGE mode, where k is constant, + // Velox uses a column with the pre-computed offset values as the frame bound, + // instead of the constant k value which is used by Presto. This vector + // represents the constant value of the frame bound for such frames, with each + // pair in the vector corresponding to a window in the plan node. 
A pair is + // used to represent the frame start and end bounds since they both could be + // of type k-RANGE. When Presto is used as reference DB for verification, the + // frame bound values are obtained from this variable. E.g.: If the window plan + // node operator being tested by the window fuzzer has two window operators + // i.e. window({"sum(c) OVER (ORDER BY s0 RANGE BETWEEN k2 PRECEDING AND k3 + // FOLLOWING)", "avg(c) OVER (ORDER BY s1 DESC RANGE BETWEEN UNBOUNDED + // PRECEDING AND k9 FOLLOWING)"}); where the offset columns k2, k3, and k9 are + // pre-computed as k2[row] = s0[row] - 2, k3[row] = s0[row] + 3, k9[row] = + // s1[row] - 9, the variable prestoFrames_ corresponding to this window plan + // node would be: std::vector{ std::pair{"2", "3"}, std::pair{"", "9"} }, + // and the frame clause in Presto, corresponding to the two window operators + // would be "RANGE BETWEEN 2 PRECEDING AND 3 FOLLOWING", "RANGE BETWEEN + // UNBOUNDED PRECEDING AND 9 FOLLOWING". This frame clause can be constructed + // by looking up the frame bound values from this variable. + std::vector> prestoFrames_; }; /// Runs the window fuzzer. 
diff --git a/velox/vector/fuzzer/VectorFuzzer.cpp b/velox/vector/fuzzer/VectorFuzzer.cpp index bb4578d42b203..228f6e559774b 100644 --- a/velox/vector/fuzzer/VectorFuzzer.cpp +++ b/velox/vector/fuzzer/VectorFuzzer.cpp @@ -817,8 +817,10 @@ TypePtr VectorFuzzer::randType(int maxDepth) { return velox::randType(rng_, maxDepth); } -TypePtr VectorFuzzer::randOrderableType(int maxDepth) { - return velox::randOrderableType(rng_, maxDepth); +TypePtr VectorFuzzer::randOrderableType( + int maxDepth, + std::vector possibleScalarTypes) { + return velox::randOrderableType(rng_, maxDepth, possibleScalarTypes); } TypePtr VectorFuzzer::randType( @@ -1019,37 +1021,20 @@ VectorPtr VectorLoaderWrap::makeEncodingPreservedCopy( std::move(nulls), std::move(indices), vectorSize, baseResult); } -namespace { - -const std::vector defaultScalarTypes() { - // @TODO Add decimal TypeKinds to randType. - // Refer https://github.com/facebookincubator/velox/issues/3942 - static std::vector kScalarTypes{ - BOOLEAN(), - TINYINT(), - SMALLINT(), - INTEGER(), - BIGINT(), - REAL(), - DOUBLE(), - VARCHAR(), - VARBINARY(), - TIMESTAMP(), - DATE(), - INTERVAL_DAY_TIME(), - }; - return kScalarTypes; -} -} // namespace - -TypePtr randType(FuzzerGenerator& rng, int maxDepth) { - return randType(rng, defaultScalarTypes(), maxDepth); +TypePtr randType( + FuzzerGenerator& rng, + int maxDepth, + std::vector possibleScalarTypes) { + return randType(rng, possibleScalarTypes, maxDepth); } -TypePtr randOrderableType(FuzzerGenerator& rng, int maxDepth) { +TypePtr randOrderableType( + FuzzerGenerator& rng, + int maxDepth, + std::vector possibleScalarTypes) { // Should we generate a scalar type? if (maxDepth <= 1 || rand(rng)) { - return randType(rng, 0); + return randType(rng, 0, possibleScalarTypes); } // ARRAY or ROW? 
@@ -1089,7 +1074,7 @@ TypePtr randType( } RowTypePtr randRowType(FuzzerGenerator& rng, int maxDepth) { - return randRowType(rng, defaultScalarTypes(), maxDepth); + return randRowType(rng, kScalarTypes, maxDepth); } RowTypePtr randRowType( diff --git a/velox/vector/fuzzer/VectorFuzzer.h b/velox/vector/fuzzer/VectorFuzzer.h index 663fb5471f697..67086d7b575c9 100644 --- a/velox/vector/fuzzer/VectorFuzzer.h +++ b/velox/vector/fuzzer/VectorFuzzer.h @@ -33,6 +33,23 @@ enum UTF8CharList { MATHEMATICAL_SYMBOLS = 3 // Mathematical Symbols. }; +// @TODO Add decimal TypeKinds to randType. +// Refer https://github.com/facebookincubator/velox/issues/3942 +static std::vector kScalarTypes{ + BOOLEAN(), + TINYINT(), + SMALLINT(), + INTEGER(), + BIGINT(), + REAL(), + DOUBLE(), + VARCHAR(), + VARBINARY(), + TIMESTAMP(), + DATE(), + INTERVAL_DAY_TIME(), +}; + /// VectorFuzzer is a helper class that generates randomized vectors and their /// data for testing, with a high degree of entropy. /// @@ -258,7 +275,9 @@ class VectorFuzzer { /// Same as the function above, but only generate orderable types. /// MAP types are not generated as they are not orderable. - TypePtr randOrderableType(int maxDepth = 5); + TypePtr randOrderableType( + int maxDepth = 5, + std::vector possibleScalarTypes = kScalarTypes); TypePtr randType(const std::vector& scalarTypes, int maxDepth = 5); RowTypePtr randRowType(int maxDepth = 5); @@ -359,12 +378,19 @@ class VectorFuzzer { /// Generates a random type, including maps, structs, and arrays. maxDepth /// limits the maximum level of nesting for complex types. maxDepth <= 1 means -/// no complex types are allowed. -TypePtr randType(FuzzerGenerator& rng, int maxDepth = 5); +/// no complex types are allowed. possibleScalarTypes limits the possible types +/// to be chosen from this list. +TypePtr randType( + FuzzerGenerator& rng, + int maxDepth = 5, + std::vector possibleScalarTypes = kScalarTypes); /// Same as the function above, but only generate orderable types. 
/// MAP types are not generated as they are not orderable. -TypePtr randOrderableType(FuzzerGenerator& rng, int maxDepth = 5); +TypePtr randOrderableType( + FuzzerGenerator& rng, + int maxDepth = 5, + std::vector possibleScalarTypes = kScalarTypes); TypePtr randType( FuzzerGenerator& rng,