From f317e2beb7edd9f6ea0ae4d0ae4c9dc5691d8a5a Mon Sep 17 00:00:00 2001 From: Wei He Date: Mon, 10 Feb 2025 10:02:17 -0800 Subject: [PATCH] feat(fuzzer): Add input generator for json_parse in expression fuzzer (#12019) Summary: Make expression fuzzer generate input vectors of valid JSON strings for the json_parse function. To test corner cases, the JSON strings may be randomly truncated or have a space character inserted at a random position. Reviewed By: kevinwilfong Differential Revision: D67820571 --- velox/exec/fuzzer/PrestoQueryRunner.cpp | 1 - .../fuzzer/ArgsOverrideFunctions.cpp | 54 +++++++ .../expression/fuzzer/ArgsOverrideFunctions.h | 33 +++++ velox/expression/fuzzer/CMakeLists.txt | 5 +- velox/expression/fuzzer/ExpressionFuzzer.cpp | 121 ++++++---------- velox/expression/fuzzer/ExpressionFuzzer.h | 137 ++++-------------- .../fuzzer/ExpressionFuzzerTest.cpp | 10 ++ .../fuzzer/ExpressionFuzzerVerifier.cpp | 25 +++- .../fuzzer/ExpressionFuzzerVerifier.h | 8 +- velox/expression/fuzzer/FuzzerRunner.cpp | 10 +- velox/expression/fuzzer/FuzzerRunner.h | 7 +- velox/expression/fuzzer/FuzzerToolkit.cpp | 40 +++++ velox/expression/fuzzer/FuzzerToolkit.h | 104 +++++++++++++ .../fuzzer/SparkExpressionFuzzerTest.cpp | 1 + .../fuzzer/tests/ExpressionFuzzerUnitTest.cpp | 4 +- .../fuzzer/ConstrainedVectorGenerator.cpp | 4 +- .../fuzzer/ConstrainedVectorGenerator.h | 4 +- .../tests/ConstrainedVectorGeneratorTest.cpp | 4 +- 18 files changed, 368 insertions(+), 204 deletions(-) create mode 100644 velox/expression/fuzzer/ArgsOverrideFunctions.cpp create mode 100644 velox/expression/fuzzer/ArgsOverrideFunctions.h diff --git a/velox/exec/fuzzer/PrestoQueryRunner.cpp b/velox/exec/fuzzer/PrestoQueryRunner.cpp index e784e6d9f753..1bfce2bf1156 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.cpp +++ b/velox/exec/fuzzer/PrestoQueryRunner.cpp @@ -456,7 +456,6 @@ bool PrestoQueryRunner::isSupported(const exec::FunctionSignature& signature) { usesTypeName(signature, "interval year to month") || 
usesTypeName(signature, "hugeint") || usesTypeName(signature, "hyperloglog") || - usesInputTypeName(signature, "json") || usesInputTypeName(signature, "ipaddress") || usesInputTypeName(signature, "ipprefix") || usesInputTypeName(signature, "uuid")); diff --git a/velox/expression/fuzzer/ArgsOverrideFunctions.cpp b/velox/expression/fuzzer/ArgsOverrideFunctions.cpp new file mode 100644 index 000000000000..093ef393e7f2 --- /dev/null +++ b/velox/expression/fuzzer/ArgsOverrideFunctions.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "velox/expression/fuzzer/ArgsOverrideFunctions.h" + +#include "velox/common/fuzzer/ConstrainedGenerators.h" +#include "velox/common/fuzzer/Utils.h" +#include "velox/core/Expressions.h" +#include "velox/vector/fuzzer/VectorFuzzer.h" + +namespace facebook::velox::fuzzer { + +std::vector JsonParseArgValuesGenerator::generate( + const CallableSignature& signature, + const VectorFuzzer::Options& options, + FuzzerGenerator& rng, + ExpressionFuzzerState& state) { + VELOX_CHECK_EQ(signature.args.size(), 1); + std::vector inputExpressions; + + state.inputRowTypes_.emplace_back(signature.args[0]); + state.inputRowNames_.emplace_back( + fmt::format("c{}", state.inputRowTypes_.size() - 1)); + + const auto representedType = facebook::velox::randType(rng, 3); + const auto seed = rand(rng); + const auto nullRatio = options.nullRatio; + state.customInputGenerators_.emplace_back( + std::make_shared( + seed, + signature.args[0], + nullRatio, + fuzzer::getRandomInputGenerator(seed, representedType, nullRatio), + true)); + + inputExpressions.push_back(std::make_shared( + signature.args[0], state.inputRowNames_.back())); + return inputExpressions; +} + +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/fuzzer/ArgsOverrideFunctions.h b/velox/expression/fuzzer/ArgsOverrideFunctions.h new file mode 100644 index 000000000000..7ee891e866eb --- /dev/null +++ b/velox/expression/fuzzer/ArgsOverrideFunctions.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/expression/fuzzer/FuzzerToolkit.h" + +namespace facebook::velox::fuzzer { + +class JsonParseArgValuesGenerator : public ArgValuesGenerator { + public: + ~JsonParseArgValuesGenerator() override = default; + + std::vector generate( + const CallableSignature& signature, + const VectorFuzzer::Options& options, + FuzzerGenerator& rng, + ExpressionFuzzerState& state) override; +}; + +} // namespace facebook::velox::fuzzer diff --git a/velox/expression/fuzzer/CMakeLists.txt b/velox/expression/fuzzer/CMakeLists.txt index fdac9f6d7092..57bcf9a94f5f 100644 --- a/velox/expression/fuzzer/CMakeLists.txt +++ b/velox/expression/fuzzer/CMakeLists.txt @@ -25,6 +25,7 @@ target_link_libraries( add_library( velox_expression_fuzzer + ArgsOverrideFunctions.cpp ArgumentTypeFuzzer.cpp DecimalArgGeneratorBase.cpp ExpressionFuzzer.cpp @@ -38,11 +39,13 @@ target_link_libraries( velox_type velox_vector_fuzzer velox_vector_test_lib + velox_constrained_input_generators velox_function_registry velox_expression_test_utility velox_file velox_hive_connector - velox_fuzzer_util) + velox_fuzzer_util + velox_common_fuzzer_util) add_executable(velox_expression_fuzzer_test ExpressionFuzzerTest.cpp) diff --git a/velox/expression/fuzzer/ExpressionFuzzer.cpp b/velox/expression/fuzzer/ExpressionFuzzer.cpp index e68ad0cfa13e..89843d7c7b8a 100644 --- a/velox/expression/fuzzer/ExpressionFuzzer.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzer.cpp @@ -22,6 +22,7 @@ #include #include "velox/common/base/Exceptions.h" +#include "velox/common/fuzzer/ConstrainedGenerators.h" #include "velox/exec/fuzzer/FuzzerUtil.h" #include "velox/expression/Expr.h" #include "velox/expression/FunctionSignature.h" @@ -272,11 +273,14 @@ ExpressionFuzzer::ExpressionFuzzer( const std::shared_ptr& vectorFuzzer, const std::optional& options, const std::unordered_map>& - argGenerators) + 
argGenerators, + const std::unordered_map>& + argsOverrideFuncs) : options_(options.value_or(Options())), vectorFuzzer_(vectorFuzzer), - state{rng_, std::max(1, options_.maxLevelOfNesting)}, - argGenerators_(argGenerators) { + state_{rng_, std::max(1, options_.maxLevelOfNesting)}, + argGenerators_(argGenerators), + funcArgOverrides_{argsOverrideFuncs} { VELOX_CHECK(vectorFuzzer, "Vector fuzzer must be provided"); seed(initialSeed); @@ -432,10 +436,6 @@ ExpressionFuzzer::ExpressionFuzzer( addToTypeToExpressionListByTicketTimes("row", "row_constructor"); addToTypeToExpressionListByTicketTimes(kTypeParameterName, "dereference"); } - - // Register function override (for cases where we want to restrict the types - // or parameters we pass to functions). - registerFuncOverride(&ExpressionFuzzer::generateSwitchArgs, "switch"); } bool ExpressionFuzzer::isSupportedSignature( @@ -519,13 +519,6 @@ void ExpressionFuzzer::addToTypeToExpressionListByTicketTimes( } } -template -void ExpressionFuzzer::registerFuncOverride( - TFunc func, - const std::string& name) { - funcArgOverrides_[name] = std::bind(func, this, std::placeholders::_1); -} - void ExpressionFuzzer::seed(size_t seed) { rng_.seed(seed); vectorFuzzer_->reSeed(seed); @@ -548,22 +541,23 @@ core::TypedExprPtr ExpressionFuzzer::generateArgConstant(const TypePtr& arg) { // columns of the same type exist then there is a 30% chance that it will // re-use one of them. 
core::TypedExprPtr ExpressionFuzzer::generateArgColumn(const TypePtr& arg) { - auto& listOfCandidateCols = state.typeToColumnNames_[arg->toString()]; + auto& listOfCandidateCols = state_.typeToColumnNames_[arg->toString()]; bool reuseColumn = options_.enableColumnReuse && !listOfCandidateCols.empty() && vectorFuzzer_->coinToss(0.3); if (!reuseColumn && options_.maxInputsThreshold.has_value() && - state.inputRowTypes_.size() >= options_.maxInputsThreshold.value()) { + state_.inputRowTypes_.size() >= options_.maxInputsThreshold.value()) { reuseColumn = !listOfCandidateCols.empty(); } if (!reuseColumn) { - state.inputRowTypes_.emplace_back(arg); - state.inputRowNames_.emplace_back( - fmt::format("c{}", state.inputRowTypes_.size() - 1)); - listOfCandidateCols.push_back(state.inputRowNames_.back()); + state_.inputRowTypes_.emplace_back(arg); + state_.inputRowNames_.emplace_back( + fmt::format("c{}", state_.inputRowTypes_.size() - 1)); + state_.customInputGenerators_.emplace_back(nullptr); + listOfCandidateCols.push_back(state_.inputRowNames_.back()); return std::make_shared( - arg, state.inputRowNames_.back()); + arg, state_.inputRowNames_.back()); } size_t chosenColIndex = rand32(0, listOfCandidateCols.size() - 1); return std::make_shared( @@ -582,7 +576,7 @@ core::TypedExprPtr ExpressionFuzzer::generateArg(const TypePtr& arg) { // - Lambdas // - Try if (argClass >= kArgExpression) { - if (state.remainingLevelOfNesting_ > 0) { + if (state_.remainingLevelOfNesting_ > 0) { return generateExpression(arg); } argClass = rand32(0, 1); @@ -732,9 +726,9 @@ std::vector ExpressionFuzzer::generateSwitchArgs( ExpressionFuzzer::FuzzedExpressionData ExpressionFuzzer::fuzzExpressions( const RowTypePtr& outType) { - state.reset(); + state_.reset(); VELOX_CHECK_EQ( - state.remainingLevelOfNesting_, std::max(1, options_.maxLevelOfNesting)); + state_.remainingLevelOfNesting_, std::max(1, options_.maxLevelOfNesting)); std::vector expressions; for (int i = 0; i < outType->size(); i++) { @@ 
-742,8 +736,9 @@ ExpressionFuzzer::FuzzedExpressionData ExpressionFuzzer::fuzzExpressions( } return { std::move(expressions), - ROW(std::move(state.inputRowNames_), std::move(state.inputRowTypes_)), - std::move(state.expressionStats_)}; + ROW(std::move(state_.inputRowNames_), std::move(state_.inputRowTypes_)), + std::move(state_.customInputGenerators_), + std::move(state_.expressionStats_)}; } ExpressionFuzzer::FuzzedExpressionData ExpressionFuzzer::fuzzExpressions( @@ -760,16 +755,16 @@ ExpressionFuzzer::FuzzedExpressionData ExpressionFuzzer::fuzzExpression() { // chance that it will re-use one of them. core::TypedExprPtr ExpressionFuzzer::generateExpression( const TypePtr& returnType) { - VELOX_CHECK_GT(state.remainingLevelOfNesting_, 0); - --state.remainingLevelOfNesting_; - auto guard = folly::makeGuard([&] { ++state.remainingLevelOfNesting_; }); + VELOX_CHECK_GT(state_.remainingLevelOfNesting_, 0); + --state_.remainingLevelOfNesting_; + auto guard = folly::makeGuard([&] { ++state_.remainingLevelOfNesting_; }); core::TypedExprPtr expression; bool reuseExpression = options_.enableExpressionReuse && vectorFuzzer_->coinToss(0.3); if (reuseExpression) { - expression = state.expressionBank_.getRandomExpression( - returnType, state.remainingLevelOfNesting_ + 1); + expression = state_.expressionBank_.getRandomExpression( + returnType, state_.remainingLevelOfNesting_ + 1); if (expression) { return expression; } @@ -796,11 +791,11 @@ core::TypedExprPtr ExpressionFuzzer::generateExpression( auto exprTransformer = options_.exprTransformers.find(chosenFunctionName); if (exprTransformer != options_.exprTransformers.end()) { - state.remainingLevelOfNesting_ -= + state_.remainingLevelOfNesting_ -= exprTransformer->second->extraLevelOfNesting(); } - if (state.remainingLevelOfNesting_ >= 0) { + if (state_.remainingLevelOfNesting_ >= 0) { if (chosenFunctionName == "cast") { expression = generateCastExpression(returnType); } else if (chosenFunctionName == "row_constructor") { @@ 
-825,7 +820,7 @@ core::TypedExprPtr ExpressionFuzzer::generateExpression( if (expression) { expression = exprTransformer->second->transform(std::move(expression)); } - state.remainingLevelOfNesting_ += + state_.remainingLevelOfNesting_ += exprTransformer->second->extraLevelOfNesting(); } } @@ -841,17 +836,32 @@ core::TypedExprPtr ExpressionFuzzer::generateExpression( return generateArgColumn(returnType); } } - state.expressionBank_.insert(expression); + state_.expressionBank_.insert(expression); return expression; } std::vector ExpressionFuzzer::getArgsForCallable( const CallableSignature& callable) { + // Special case for switch because it has a variable number of arguments not + // specified in the signature. Other functions' argument override should be + // specified through funcArgOverrides_. + if (callable.name == "switch") { + return generateSwitchArgs(callable); + } + auto funcIt = funcArgOverrides_.find(callable.name); if (funcIt == funcArgOverrides_.end()) { return generateArgs(callable); } - return funcIt->second(callable); + auto args = funcIt->second->generate( + callable, vectorFuzzer_->getOptions(), rng_, state_); + for (auto i = 0; i < args.size(); ++i) { + // Generate arguments not specified in the override. 
+ if (args[i] == nullptr) { + args[i] = generateArg(callable.args.at(i), callable.constantArgs.at(i)); + } + } + return args; } core::TypedExprPtr ExpressionFuzzer::getCallExprFromCallable( @@ -1124,45 +1134,6 @@ core::TypedExprPtr ExpressionFuzzer::generateDereferenceExpression( inputExpressions[0], fmt::format("row_field{}", referencedIndex)); } -void ExpressionFuzzer::ExprBank::insert(const core::TypedExprPtr& expression) { - auto typeString = expression->type()->toString(); - if (typeToExprsByLevel_.find(typeString) == typeToExprsByLevel_.end()) { - typeToExprsByLevel_.insert( - {typeString, ExprsIndexedByLevel(maxLevelOfNesting_ + 1)}); - } - auto& expressionsByLevel = typeToExprsByLevel_[typeString]; - int nestingLevel = getNestedLevel(expression); - VELOX_CHECK_LE(nestingLevel, maxLevelOfNesting_); - expressionsByLevel[nestingLevel].push_back(expression); -} - -core::TypedExprPtr ExpressionFuzzer::ExprBank::getRandomExpression( - const facebook::velox::TypePtr& returnType, - int uptoLevelOfNesting) { - VELOX_CHECK_LE(uptoLevelOfNesting, maxLevelOfNesting_); - auto typeString = returnType->toString(); - if (typeToExprsByLevel_.find(typeString) == typeToExprsByLevel_.end()) { - return nullptr; - } - auto& expressionsByLevel = typeToExprsByLevel_[typeString]; - int totalToConsider = 0; - for (int i = 0; i <= uptoLevelOfNesting; i++) { - totalToConsider += expressionsByLevel[i].size(); - } - if (totalToConsider > 0) { - int choice = boost::random::uniform_int_distribution( - 0, totalToConsider - 1)(rng_); - for (int i = 0; i <= uptoLevelOfNesting; i++) { - if (choice >= expressionsByLevel[i].size()) { - choice -= expressionsByLevel[i].size(); - continue; - } - return expressionsByLevel[i][choice]; - } - VELOX_CHECK(false, "Should have found an expression."); - } - return nullptr; -} TypePtr ExpressionFuzzer::fuzzReturnType() { auto chooseFromConcreteSignatures = rand32(0, 1); diff --git a/velox/expression/fuzzer/ExpressionFuzzer.h 
b/velox/expression/fuzzer/ExpressionFuzzer.h index 9f89abe48bb0..704d081b59ab 100644 --- a/velox/expression/fuzzer/ExpressionFuzzer.h +++ b/velox/expression/fuzzer/ExpressionFuzzer.h @@ -17,16 +17,12 @@ #pragma once #include "velox/core/ITypedExpr.h" -#include "velox/core/QueryCtx.h" #include "velox/exec/fuzzer/ExprTransformer.h" #include "velox/exec/fuzzer/ReferenceQueryRunner.h" -#include "velox/expression/Expr.h" #include "velox/expression/fuzzer/ArgGenerator.h" #include "velox/expression/fuzzer/FuzzerToolkit.h" -#include "velox/expression/tests/ExpressionVerifier.h" #include "velox/functions/FunctionRegistry.h" #include "velox/vector/fuzzer/VectorFuzzer.h" -#include "velox/vector/tests/utils/VectorMaker.h" namespace facebook::velox::fuzzer { @@ -36,6 +32,8 @@ using facebook::velox::exec::test::ExprTransformer; // A tool that can be used to generate random expressions. class ExpressionFuzzer { public: + using State = ExpressionFuzzerState; + struct Options { // The maximum number of variadic arguments fuzzer will generate for // functions that accept variadic arguments. Fuzzer will generate up to @@ -120,10 +118,10 @@ class ExpressionFuzzer { const std::shared_ptr& vectorFuzzer, const std::optional& options = std::nullopt, const std::unordered_map>& - argGenerators = {}); - - template - void registerFuncOverride(TFunc func, const std::string& name); + argGenerators = {}, + const std::unordered_map< + std::string, + std::shared_ptr>& argsOverrideFuncs = {}); struct FuzzedExpressionData { // A list of generated expressions. @@ -132,6 +130,12 @@ class ExpressionFuzzer { // The input vector type that is expected by the generated expressions. RowTypePtr inputType; + // Custom input generators for input vectors. The generator at index i + // corresponds to the i-th field in inputType. If customInputGenerators[i] + // doesn't exist or is nullptr, then no custom input generator is used for + // the i-th field. 
+ std::vector customInputGenerators; + // Count how many times each expression has been selected in expressions. std::unordered_map selectionStats; }; @@ -145,58 +149,6 @@ class ExpressionFuzzer { // Fuzz a single expression and return it along with the input row type. FuzzedExpressionData fuzzExpression(); - /// Used to enable re-use of sub-expressions by exposing an API that allows - /// for randomly picking an expression that has a specific return type and a - /// nesting level less than or equal to a specified limit. It ensures that - /// all expressions that are valid candidates have an equal probability of - /// selection. - class ExprBank { - public: - ExprBank(FuzzerGenerator& rng, int maxLevelOfNesting) - : rng_(rng), maxLevelOfNesting_(maxLevelOfNesting) {} - - /// Adds an expression to the bank. - void insert(const core::TypedExprPtr& expression); - - /// Returns a randomly selected expression of the requested 'returnType' - /// which is guaranteed to have a nesting level less than or equal to - /// 'uptoLevelOfNesting'. Returns a nullptr if no such function can be - /// found. - core::TypedExprPtr getRandomExpression( - const TypePtr& returnType, - int uptoLevelOfNesting); - - /// Removes all the expressions from the bank. Should be called after - /// every fuzzer iteration. - void reset() { - typeToExprsByLevel_.clear(); - } - - private: - int getNestedLevel(const core::TypedExprPtr& expression) { - int level = 0; - for (auto& input : expression->inputs()) { - level = std::max(level, getNestedLevel(input) + 1); - } - return level; - } - - /// Reference to the random generator of the expression fuzzer. - FuzzerGenerator& rng_; - - /// Only expression having less than or equal to this level of nesting - /// will be generated. - int maxLevelOfNesting_; - - /// Represents a vector where each index contains a list of expressions - /// such that the depth of each expression tree is equal to that index. 
- using ExprsIndexedByLevel = std::vector>; - - /// Maps a 'Type' serialized as a string to an object of type - /// ExprsIndexedByLevel - std::unordered_map typeToExprsByLevel_; - }; - void seed(size_t seed); const std::vector& supportedFunctions() const { @@ -341,7 +293,7 @@ class ExpressionFuzzer { /// Should be called whenever a function is selected by the fuzzer. void markSelected(const std::string& funcName) { - state.expressionStats_[funcName]++; + state_.expressionStats_[funcName]++; } // Returns random integer between min and max inclusive. @@ -382,61 +334,32 @@ class ExpressionFuzzer { // --assign_function_tickets startup flag . std::unordered_map functionsToTickets_; - /// We allow the arg generation routine to be specialized for particular - /// functions. This map stores the mapping between function name and the - /// overridden method. - using ArgsOverrideFunc = std::function( - const CallableSignature& input)>; - - std::unordered_map funcArgOverrides_; - std::shared_ptr vectorFuzzer_; FuzzerGenerator rng_; std::vector supportedFunctions_; - struct State { - void reset() { - inputRowTypes_.clear(); - inputRowNames_.clear(); - typeToColumnNames_.clear(); - expressionBank_.reset(); - expressionStats_.clear(); - } - - State(FuzzerGenerator& rng, int maxLevelOfNesting) - : expressionBank_(rng, maxLevelOfNesting), - remainingLevelOfNesting_(maxLevelOfNesting) {} - - /// Used to track all generated expressions within a single iteration and - /// support expression re-use. - ExprBank expressionBank_; - - /// Contains the types and names of the input vector that the generated - /// expressions consume. - std::vector inputRowTypes_; - std::vector inputRowNames_; - - // Count how many times each function has been selected. - std::unordered_map expressionStats_; - - /// Maps a 'Type' serialized as a string to the column names that have - /// already been generated. 
Used to easily look up columns that can be - /// re-used when a specific type is required as input to a callable. - std::unordered_map> - typeToColumnNames_; - - /// The remaining levels of expression nesting. It's initialized by - /// FLAGS_max_level_of_nesting and updated in generateExpression(). When - /// its value decreases to 0, we don't generate subexpressions anymore. - int32_t remainingLevelOfNesting_; - - } state; - friend class ExpressionFuzzerUnitTest; + State state_; // Maps from function name to a specific generator of argument types. std::unordered_map> argGenerators_; + + /// We allow the arg generation routine to be specialized for particular + /// functions. This map stores the mapping between function name and the + /// overridden method. + /// The overridden method can specify all or a subset of arguments of the + /// input function signature. For a given function signature, the overridden + /// method returns a vector of TypedExprPtr, with unspecified arguments being + /// nullptr at the corresponding index. ExpressionFuzzer then generates random + /// arguments for these unspecified ones with the types specified in the + /// function signature. (Functions of variable arity must determine the number + /// of arguments in the overridden method. Arguments at indices beyond the + /// argument size in the input function signature cannot be left unspecified.) 
+ std::unordered_map> + funcArgOverrides_; + + friend class ExpressionFuzzerUnitTest; }; } // namespace facebook::velox::fuzzer diff --git a/velox/expression/fuzzer/ExpressionFuzzerTest.cpp b/velox/expression/fuzzer/ExpressionFuzzerTest.cpp index f87abb912110..33c2e29452a9 100644 --- a/velox/expression/fuzzer/ExpressionFuzzerTest.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzerTest.cpp @@ -19,6 +19,8 @@ #include "velox/exec/fuzzer/PrestoQueryRunner.h" #include "velox/expression/fuzzer/ArgGenerator.h" +#include "velox/expression/fuzzer/ArgsOverrideFunctions.h" +#include "velox/expression/fuzzer/ExpressionFuzzer.h" #include "velox/expression/fuzzer/FuzzerRunner.h" #include "velox/expression/fuzzer/SpecialFormSignatureGenerator.h" #include "velox/functions/prestosql/fuzzer/DivideArgGenerator.h" @@ -52,7 +54,10 @@ DEFINE_uint32( using namespace facebook::velox::exec::test; using facebook::velox::exec::test::PrestoQueryRunner; using facebook::velox::fuzzer::ArgGenerator; +using facebook::velox::fuzzer::ArgValuesGenerator; +using facebook::velox::fuzzer::ExpressionFuzzer; using facebook::velox::fuzzer::FuzzerRunner; +using facebook::velox::fuzzer::JsonParseArgValuesGenerator; using facebook::velox::test::ReferenceQueryRunner; int main(int argc, char** argv) { @@ -122,6 +127,10 @@ int main(int argc, char** argv) { {"map_keys", std::make_shared()}, {"map_values", std::make_shared()}}; + std::unordered_map> + argsOverrideFuncs = { + {"json_parse", std::make_shared()}}; + std::shared_ptr rootPool{ facebook::velox::memory::memoryManager()->addRootPool()}; std::shared_ptr referenceQueryRunner{nullptr}; @@ -140,6 +149,7 @@ int main(int argc, char** argv) { {{"session_timezone", "America/Los_Angeles"}, {"adjust_timestamp_to_session_timezone", "true"}}, argGenerators, + argsOverrideFuncs, referenceQueryRunner, std::make_shared< facebook::velox::fuzzer::SpecialFormSignatureGenerator>()); diff --git a/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp 
b/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp index 80f07c3a1e2a..aa3a33b14d7e 100644 --- a/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp @@ -57,7 +57,9 @@ ExpressionFuzzerVerifier::ExpressionFuzzerVerifier( size_t initialSeed, const ExpressionFuzzerVerifier::Options& options, const std::unordered_map>& - argGenerators) + argGenerators, + const std::unordered_map>& + argsOverrideFuncs) : options_(options), queryCtx_(core::QueryCtx::create( nullptr, @@ -77,7 +79,8 @@ ExpressionFuzzerVerifier::ExpressionFuzzerVerifier( initialSeed, vectorFuzzer_, options_.expressionFuzzerOptions, - argGenerators), + argGenerators, + argsOverrideFuncs), referenceQueryRunner_{ options_.expressionFuzzerOptions.referenceQueryRunner} { filesystems::registerLocalFileSystem(); @@ -101,7 +104,8 @@ ExpressionFuzzerVerifier::ExpressionFuzzerVerifier( std::pair, InputRowMetadata> ExpressionFuzzerVerifier::generateInput( const RowTypePtr& rowType, - VectorFuzzer& vectorFuzzer) { + VectorFuzzer& vectorFuzzer, + const std::vector& inputGenerators) { // Randomly pick to generate one or two input rows. std::vector inputs; int numInputs = vectorFuzzer.coinToss(0.5) ? 1 : 2; @@ -123,6 +127,8 @@ ExpressionFuzzerVerifier::generateInput( std::vector children; children.reserve(rowType->size() + 1); for (auto i = 0; i < rowType->size(); ++i) { + const auto& inputGenerator = + inputGenerators.size() > i ? inputGenerators[i] : nullptr; if (std::binary_search( metadata.columnsToWrapInCommonDictionary.begin(), metadata.columnsToWrapInCommonDictionary.end(), @@ -130,12 +136,15 @@ ExpressionFuzzerVerifier::generateInput( // These will be wrapped in common dictionary later. 
if (vectorFuzzer.getOptions().allowConstantVector && vectorFuzzer.coinToss(0.2)) { - children.push_back(vectorFuzzer.fuzzConstant(rowType->childAt(i))); + children.push_back( + vectorFuzzer.fuzzConstant(rowType->childAt(i), inputGenerator)); } else { - children.push_back(vectorFuzzer.fuzzFlat(rowType->childAt(i))); + children.push_back( + vectorFuzzer.fuzzFlat(rowType->childAt(i), inputGenerator)); } } else { - children.push_back(vectorFuzzer.fuzz(rowType->childAt(i))); + children.push_back( + vectorFuzzer.fuzz(rowType->childAt(i), inputGenerator)); } } @@ -377,7 +386,7 @@ void ExpressionFuzzerVerifier::go() { // set. int numExpressionTrees = boost::random::uniform_int_distribution( 1, options_.maxExpressionTreesPerStep)(rng_); - auto [expressions, inputType, selectionStats] = + auto [expressions, inputType, inputGenerators, selectionStats] = expressionFuzzer_.fuzzExpressions(numExpressionTrees); // Project a row number column in the output to enable epsilon-comparison // for floating-point columns and make investigation of failures easier. 
@@ -391,7 +400,7 @@ void ExpressionFuzzerVerifier::go() { std::vector plans = std::move(expressions); auto [inputTestCases, inputRowMetadata] = - generateInput(inputType, *vectorFuzzer_); + generateInput(inputType, *vectorFuzzer_, inputGenerators); auto resultVectors = generateResultVectors(plans); std::vector results; diff --git a/velox/expression/fuzzer/ExpressionFuzzerVerifier.h b/velox/expression/fuzzer/ExpressionFuzzerVerifier.h index ec9b4f2f27c1..90200455def0 100644 --- a/velox/expression/fuzzer/ExpressionFuzzerVerifier.h +++ b/velox/expression/fuzzer/ExpressionFuzzerVerifier.h @@ -55,7 +55,10 @@ class ExpressionFuzzerVerifier { size_t initialSeed, const Options& options, const std::unordered_map>& - argGenerators); + argGenerators, + const std::unordered_map< + std::string, + std::shared_ptr>& argsOverrideFuncs); // This function starts the test that is performed by the // ExpressionFuzzerVerifier which is generating random expressions and @@ -173,7 +176,8 @@ class ExpressionFuzzerVerifier { // 4. Appends a row number column to the input row vector. std::pair, InputRowMetadata> generateInput( const RowTypePtr& rowType, - VectorFuzzer& vectorFuzzer); + VectorFuzzer& vectorFuzzer, + const std::vector& inputGenerators); /// Randomize initial result vector data to test for correct null and data /// setting in functions. 
diff --git a/velox/expression/fuzzer/FuzzerRunner.cpp b/velox/expression/fuzzer/FuzzerRunner.cpp index 3923c5dade28..69ab97f69758 100644 --- a/velox/expression/fuzzer/FuzzerRunner.cpp +++ b/velox/expression/fuzzer/FuzzerRunner.cpp @@ -15,6 +15,7 @@ */ #include "velox/expression/fuzzer/FuzzerRunner.h" + #include "velox/expression/fuzzer/ExpressionFuzzer.h" DEFINE_int32(steps, 10, "Number of expressions to generate and execute."); @@ -236,6 +237,8 @@ int FuzzerRunner::run( const std::unordered_map& queryConfigs, const std::unordered_map>& argGenerators, + const std::unordered_map>& + argsOverrideFuncs, std::shared_ptr referenceQueryRunner, const std::shared_ptr& specialFormSignatureGenerator) { @@ -245,6 +248,7 @@ int FuzzerRunner::run( exprTransformers, queryConfigs, argGenerators, + argsOverrideFuncs, referenceQueryRunner, specialFormSignatureGenerator); return RUN_ALL_TESTS(); @@ -259,6 +263,8 @@ void FuzzerRunner::runFromGtest( const std::unordered_map& queryConfigs, const std::unordered_map>& argGenerators, + const std::unordered_map>& + argsOverrideFuncs, std::shared_ptr referenceQueryRunner, const std::shared_ptr& specialFormSignatureGenerator) { @@ -271,6 +277,8 @@ void FuzzerRunner::runFromGtest( // Insert generated signatures of special forms into the signature map. 
specialFormSignatureGenerator->appendSpecialForms( signatures, options.expressionFuzzerOptions.specialForms); - ExpressionFuzzerVerifier(signatures, seed, options, argGenerators).go(); + ExpressionFuzzerVerifier( + signatures, seed, options, argGenerators, argsOverrideFuncs) + .go(); } } // namespace facebook::velox::fuzzer diff --git a/velox/expression/fuzzer/FuzzerRunner.h b/velox/expression/fuzzer/FuzzerRunner.h index ad51618f8acd..f66f2c73380b 100644 --- a/velox/expression/fuzzer/FuzzerRunner.h +++ b/velox/expression/fuzzer/FuzzerRunner.h @@ -20,7 +20,6 @@ #include #include #include -#include #include "velox/exec/fuzzer/ExprTransformer.h" #include "velox/exec/fuzzer/ReferenceQueryRunner.h" @@ -44,6 +43,9 @@ class FuzzerRunner { const std::unordered_map& queryConfigs, const std::unordered_map>& argGenerators, + const std::unordered_map< + std::string, + std::shared_ptr>& argsOverrideFuncs, std::shared_ptr referenceQueryRunner, const std::shared_ptr& signatureGenerator); @@ -56,6 +58,9 @@ class FuzzerRunner { const std::unordered_map& queryConfigs, const std::unordered_map>& argGenerators, + const std::unordered_map< + std::string, + std::shared_ptr>& argsOverrideFuncs, std::shared_ptr referenceQueryRunner, const std::shared_ptr& signatureGenerator); }; diff --git a/velox/expression/fuzzer/FuzzerToolkit.cpp b/velox/expression/fuzzer/FuzzerToolkit.cpp index 421c96bf6951..73526e140528 100644 --- a/velox/expression/fuzzer/FuzzerToolkit.cpp +++ b/velox/expression/fuzzer/FuzzerToolkit.cpp @@ -195,4 +195,44 @@ InputRowMetadata InputRowMetadata::restoreFromFile( return ret; } +void ExprBank::insert(const core::TypedExprPtr& expression) { + auto typeString = expression->type()->toString(); + if (typeToExprsByLevel_.find(typeString) == typeToExprsByLevel_.end()) { + typeToExprsByLevel_.insert( + {typeString, ExprsIndexedByLevel(maxLevelOfNesting_ + 1)}); + } + auto& expressionsByLevel = typeToExprsByLevel_[typeString]; + int nestingLevel = getNestedLevel(expression); + 
VELOX_CHECK_LE(nestingLevel, maxLevelOfNesting_); + expressionsByLevel[nestingLevel].push_back(expression); +} + +core::TypedExprPtr ExprBank::getRandomExpression( + const facebook::velox::TypePtr& returnType, + int uptoLevelOfNesting) { + VELOX_CHECK_LE(uptoLevelOfNesting, maxLevelOfNesting_); + auto typeString = returnType->toString(); + if (typeToExprsByLevel_.find(typeString) == typeToExprsByLevel_.end()) { + return nullptr; + } + auto& expressionsByLevel = typeToExprsByLevel_[typeString]; + int totalToConsider = 0; + for (int i = 0; i <= uptoLevelOfNesting; i++) { + totalToConsider += expressionsByLevel[i].size(); + } + if (totalToConsider > 0) { + int choice = boost::random::uniform_int_distribution( + 0, totalToConsider - 1)(rng_); + for (int i = 0; i <= uptoLevelOfNesting; i++) { + if (choice >= expressionsByLevel[i].size()) { + choice -= expressionsByLevel[i].size(); + continue; + } + return expressionsByLevel[i][choice]; + } + VELOX_CHECK(false, "Should have found an expression."); + } + return nullptr; +} + } // namespace facebook::velox::fuzzer diff --git a/velox/expression/fuzzer/FuzzerToolkit.h b/velox/expression/fuzzer/FuzzerToolkit.h index a6c326250165..95a80b58d018 100644 --- a/velox/expression/fuzzer/FuzzerToolkit.h +++ b/velox/expression/fuzzer/FuzzerToolkit.h @@ -15,8 +15,10 @@ */ #pragma once +#include "velox/core/ITypedExpr.h" #include "velox/expression/FunctionSignature.h" #include "velox/vector/ComplexVector.h" +#include "velox/vector/fuzzer/VectorFuzzer.h" namespace facebook::velox::fuzzer { @@ -143,4 +145,106 @@ struct InputRowMetadata { const char* filePath, memory::MemoryPool* pool); }; + +/// Used to enable re-use of sub-expressions in expression fuzzer by exposing an +/// API that allows for randomly picking an expression that has a specific +/// return type and a nesting level less than or equal to a specified limit. It +/// ensures that all expressions that are valid candidates have an equal +/// probability of selection. 
+class ExprBank { + public: + ExprBank(FuzzerGenerator& rng, int maxLevelOfNesting) + : rng_(rng), maxLevelOfNesting_(maxLevelOfNesting) {} + + /// Adds an expression to the bank. + void insert(const core::TypedExprPtr& expression); + + /// Returns a randomly selected expression of the requested 'returnType' + /// which is guaranteed to have a nesting level less than or equal to + /// 'uptoLevelOfNesting'. Returns a nullptr if no such function can be + /// found. + core::TypedExprPtr getRandomExpression( + const TypePtr& returnType, + int uptoLevelOfNesting); + + /// Removes all the expressions from the bank. Should be called after + /// every fuzzer iteration. + void reset() { + typeToExprsByLevel_.clear(); + } + + private: + int getNestedLevel(const core::TypedExprPtr& expression) { + int level = 0; + for (auto& input : expression->inputs()) { + level = std::max(level, getNestedLevel(input) + 1); + } + return level; + } + + /// Reference to the random generator of the expression fuzzer. + FuzzerGenerator& rng_; + + /// Only expression having less than or equal to this level of nesting + /// will be generated. + int maxLevelOfNesting_; + + /// Represents a vector where each index contains a list of expressions + /// such that the depth of each expression tree is equal to that index. + using ExprsIndexedByLevel = std::vector>; + + /// Maps a 'Type' serialized as a string to an object of type + /// ExprsIndexedByLevel + std::unordered_map typeToExprsByLevel_; +}; + +struct ExpressionFuzzerState { + void reset() { + inputRowTypes_.clear(); + inputRowNames_.clear(); + typeToColumnNames_.clear(); + expressionBank_.reset(); + expressionStats_.clear(); + customInputGenerators_.clear(); + } + + ExpressionFuzzerState(FuzzerGenerator& rng, int maxLevelOfNesting) + : expressionBank_(rng, maxLevelOfNesting), + remainingLevelOfNesting_(maxLevelOfNesting) {} + + /// Used to track all generated expressions within a single iteration and + /// support expression re-use. 
+ ExprBank expressionBank_; + + /// Contains the types and names of the input vector that the generated + /// expressions consume. + std::vector inputRowTypes_; + std::vector inputRowNames_; + /// Contains the custom input generators for the input vectors. + std::vector customInputGenerators_; + + // Count how many times each function has been selected. + std::unordered_map expressionStats_; + + /// Maps a 'Type' serialized as a string to the column names that have + /// already been generated. Used to easily look up columns that can be + /// re-used when a specific type is required as input to a callable. + std::unordered_map> typeToColumnNames_; + + /// The remaining levels of expression nesting. It's initialized by + /// FLAGS_max_level_of_nesting and updated in generateExpression(). When + /// its value decreases to 0, we don't generate subexpressions anymore. + int32_t remainingLevelOfNesting_; +}; + +class ArgValuesGenerator { + public: + virtual ~ArgValuesGenerator() = default; + + virtual std::vector generate( + const CallableSignature& signature, + const VectorFuzzer::Options& options, + FuzzerGenerator& rng, + ExpressionFuzzerState& state) = 0; +}; } // namespace facebook::velox::fuzzer diff --git a/velox/expression/fuzzer/SparkExpressionFuzzerTest.cpp b/velox/expression/fuzzer/SparkExpressionFuzzerTest.cpp index aa80e74d3e49..d9ce3a99ad37 100644 --- a/velox/expression/fuzzer/SparkExpressionFuzzerTest.cpp +++ b/velox/expression/fuzzer/SparkExpressionFuzzerTest.cpp @@ -107,6 +107,7 @@ int main(int argc, char** argv) { {{}}, queryConfigs, argGenerators, + {{}}, referenceQueryRunner, std::make_shared< facebook::velox::fuzzer::SparkSpecialFormSignatureGenerator>()); diff --git a/velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp b/velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp index b619cd984452..e8fdd336e825 100644 --- a/velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp +++ 
b/velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp @@ -143,7 +143,7 @@ TEST_F(ExpressionFuzzerUnitTest, exprBank) { 0, vectorfuzzer, makeOptionsWithMaxLevelNesting(maxLevelOfNesting)}; - ExpressionFuzzer::ExprBank exprBank(seed, maxLevelOfNesting); + ExprBank exprBank(seed, maxLevelOfNesting); for (int i = 0; i < 5000; ++i) { auto expression = fuzzer.fuzzExpression().expressions[0]; // Verify that if there is a single expression then it is returned @@ -171,7 +171,7 @@ TEST_F(ExpressionFuzzerUnitTest, exprBank) { 0, vectorfuzzer, makeOptionsWithMaxLevelNesting(maxLevelOfNesting)}; - ExpressionFuzzer::ExprBank exprBank(seed, maxLevelOfNesting); + ExprBank exprBank(seed, maxLevelOfNesting); for (int i = 0; i < 1000; ++i) { auto expression = fuzzer.fuzzExpression().expressions[0]; exprBank.insert(expression); diff --git a/velox/vector/fuzzer/ConstrainedVectorGenerator.cpp b/velox/vector/fuzzer/ConstrainedVectorGenerator.cpp index 72db3075f3ad..835fc10634b1 100644 --- a/velox/vector/fuzzer/ConstrainedVectorGenerator.cpp +++ b/velox/vector/fuzzer/ConstrainedVectorGenerator.cpp @@ -25,7 +25,7 @@ using exec::VectorWriter; // static VectorPtr ConstrainedVectorGenerator::generateConstant( - const std::shared_ptr& customGenerator, + const AbstractInputGeneratorPtr& customGenerator, vector_size_t size, memory::MemoryPool* pool) { VELOX_CHECK_NOT_NULL(customGenerator); @@ -119,7 +119,7 @@ void writeOne(const variant& v, GenericWriter& writer) { // static VectorPtr ConstrainedVectorGenerator::generateFlat( - const std::shared_ptr& customGenerator, + const AbstractInputGeneratorPtr& customGenerator, vector_size_t size, memory::MemoryPool* pool) { VELOX_CHECK_NOT_NULL(customGenerator); diff --git a/velox/vector/fuzzer/ConstrainedVectorGenerator.h b/velox/vector/fuzzer/ConstrainedVectorGenerator.h index f543624739ed..78e6f3ca7b72 100644 --- a/velox/vector/fuzzer/ConstrainedVectorGenerator.h +++ b/velox/vector/fuzzer/ConstrainedVectorGenerator.h @@ -26,12 +26,12 @@ class 
ConstrainedVectorGenerator { ConstrainedVectorGenerator() = delete; static VectorPtr generateConstant( - const std::shared_ptr& customGenerator, + const AbstractInputGeneratorPtr& customGenerator, vector_size_t size, memory::MemoryPool* pool); static VectorPtr generateFlat( - const std::shared_ptr& customGenerator, + const AbstractInputGeneratorPtr& customGenerator, vector_size_t size, memory::MemoryPool* pool); }; diff --git a/velox/vector/fuzzer/tests/ConstrainedVectorGeneratorTest.cpp b/velox/vector/fuzzer/tests/ConstrainedVectorGeneratorTest.cpp index ef6a59d091fa..a7cda0bee230 100644 --- a/velox/vector/fuzzer/tests/ConstrainedVectorGeneratorTest.cpp +++ b/velox/vector/fuzzer/tests/ConstrainedVectorGeneratorTest.cpp @@ -35,7 +35,7 @@ class ConstrainedVectorGeneratorTest : public testing::Test, const variant& excludedValue) { using T = typename TypeTraits::NativeType; const uint32_t kSize = 1000; - std::shared_ptr generator = + AbstractInputGeneratorPtr generator = std::make_shared( 0, type, @@ -69,7 +69,7 @@ class ConstrainedVectorGeneratorTest : public testing::Test, void testGenerateVectorsComplex(const TypePtr& type) { using T = typename TypeTraits::ImplType; const uint32_t kSize = 1000; - std::shared_ptr generator = + AbstractInputGeneratorPtr generator = std::make_shared>(0, type, 0.5); auto vector = ConstrainedVectorGenerator::generateFlat(generator, kSize, pool());