Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SparkSql function to_pretty_string #10359

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions velox/docs/functions/spark/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,16 @@ String Functions
SELECT substring_index('aaaaa', 'aa', 5); -- "aaaaa"
SELECT substring_index('aaaaa', 'aa', -5); -- "aaaaa"

.. spark:function:: to_pretty_string(x) -> varchar

Returns a pretty string representation of ``x``. All scalar types are supported.
Timestamp input is adjusted to the session time zone if one is set through the ``session_timezone`` config.
The result differs from that of casting ``x`` to a string in the following ways.

- It prints null input as "NULL" rather than producing null output.

- It prints binary values as a hex string representation rather than as UTF-8 text.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It prints binary value as hex string representation rather than UTF-8.


.. spark:function:: translate(string, match, replace) -> varchar

Returns a new translated string. It translates the character in ``string`` by a
Expand Down
7 changes: 7 additions & 0 deletions velox/functions/lib/RegistrationHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ void registerUnaryFloatingPoint(const std::vector<std::string>& aliases) {
registerFunction<T, float, float>(aliases);
}

/// Registers the unary function template 'T' under 'aliases' once per
/// floating point argument type (double and float), each returning 'TReturn'.
template <template <class> class T, class TReturn>
void registerUnaryFloatingPointWithReturn(
    const std::vector<std::string>& aliases) {
  // One registration per supported floating point input type.
  registerFunction<T, TReturn, double>(aliases);
  registerFunction<T, TReturn, float>(aliases);
}

template <template <class> class T>
void registerUnaryNumeric(const std::vector<std::string>& aliases) {
registerUnaryIntegral<T>(aliases);
Expand Down
24 changes: 24 additions & 0 deletions velox/functions/sparksql/Register.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
#include "velox/functions/sparksql/Split.h"
#include "velox/functions/sparksql/String.h"
#include "velox/functions/sparksql/StringToMap.h"
#include "velox/functions/sparksql/ToPrettyString.h"
#include "velox/functions/sparksql/UnscaledValueFunction.h"
#include "velox/functions/sparksql/Uuid.h"
#include "velox/functions/sparksql/specialforms/DecimalRound.h"
Expand Down Expand Up @@ -169,6 +170,27 @@ inline void registerArrayMinMaxFunctions(const std::string& prefix) {
registerArrayMinMaxFunctions<Timestamp>(prefix);
registerArrayMinMaxFunctions<Date>(prefix);
}

// Registers all overloads of to_pretty_string under the name
// 'prefix' + "to_pretty_string". Varbinary, timestamp and decimal inputs use
// dedicated implementations; the remaining inputs share
// ToPrettyStringFunction.
void registerToPrettyStringFunctions(const std::string& prefix) {
  const std::vector<std::string> names{prefix + "to_pretty_string"};

  // Numeric inputs.
  registerUnaryIntegralWithTReturn<ToPrettyStringFunction, Varchar>(names);
  registerUnaryFloatingPointWithReturn<ToPrettyStringFunction, Varchar>(names);

  // Boolean, string and binary inputs.
  registerFunction<ToPrettyStringFunction, Varchar, bool>(names);
  registerFunction<ToPrettyStringFunction, Varchar, Varchar>(names);
  registerFunction<ToPrettyStringVarbinaryFunction, Varchar, Varbinary>(names);

  // Date and timestamp inputs.
  registerFunction<ToPrettyStringFunction, Varchar, Date>(names);
  registerFunction<ToPrettyStringTimestampFunction, Varchar, Timestamp>(names);

  // Unknown-typed input.
  registerFunction<ToPrettyStringFunction, Varchar, UnknownValue>(names);

  // Short and long decimal inputs.
  registerFunction<
      ToPrettyStringDecimalFunction,
      Varchar,
      ShortDecimal<P1, S1>>(names);
  registerFunction<
      ToPrettyStringDecimalFunction,
      Varchar,
      LongDecimal<P1, S1>>(names);
}
} // namespace

void registerFunctions(const std::string& prefix) {
Expand Down Expand Up @@ -223,6 +245,8 @@ void registerFunctions(const std::string& prefix) {
Varchar,
Varchar>({prefix + "str_to_map"});

registerToPrettyStringFunctions(prefix);
rui-mo marked this conversation as resolved.
Show resolved Hide resolved

registerFunction<sparksql::LeftFunction, Varchar, Varchar, int32_t>(
{prefix + "left"});

Expand Down
205 changes: 205 additions & 0 deletions velox/functions/sparksql/ToPrettyString.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include "velox/expression/CastExpr-inl.h"
#include "velox/functions/Udf.h"
#include "velox/type/Conversions.h"

namespace facebook::velox::functions::sparksql {
namespace detail {
/// Text produced by the to_pretty_string functions for a null input.
/// Declared 'inline' rather than 'static' because this header-defined constant
/// is referenced from templates defined in the same header; an inline variable
/// is one entity shared by every translation unit that includes the header.
inline const StringView kNull = "NULL";
} // namespace detail

/// to_pretty_string(x) -> varchar
/// Returns pretty string for int8, int16, int32, int64, bool, Date, Varchar. It
/// has one difference with casting value to string:
/// 1) It prints null input as "NULL" rather than producing null output.
template <typename TExec>
struct ToPrettyStringFunction {
rui-mo marked this conversation as resolved.
Show resolved Hide resolved
VELOX_DEFINE_FUNCTION_TYPES(TExec);

// Results refer to strings in the first argument.
static constexpr int32_t reuse_strings_from_arg = 0;
rui-mo marked this conversation as resolved.
Show resolved Hide resolved

template <typename A>
void initialize(
const std::vector<TypePtr>& inputTypes,
const core::QueryConfig& /*config*/,
A* /*a*/) {
inputType_ = inputTypes[0];
}

template <typename TInput>
Status callNullable(out_type<Varchar>& result, const TInput* input) {
if (input) {
if constexpr (std::is_same_v<TInput, StringView>) {
result.setNoCopy(*input);
return Status::OK();
}
if constexpr (std::is_same_v<TInput, int32_t>) {
if (inputType_->isDate()) {
try {
auto output = DATE()->toString(*input);
result.append(output);
} catch (const std::exception& e) {
return Status::Invalid(e.what());
}
return Status::OK();
}
}
const auto castResult =
util::Converter<TypeKind::VARCHAR, void, util::SparkCastPolicy>::
tryCast(*input);
VELOX_DCHECK(!castResult.hasError());
result.copy_from(castResult.value());
} else {
result.setNoCopy(detail::kNull);
}
return Status::OK();
}

private:
TypePtr inputType_;
};

/// Returns pretty string for varbinary. It has several differences with
/// cast(varbinary as string):
/// 1) It prints null input as "NULL" rather than producing null output.
/// 2) It prints binary value as hex string representation rather than UTF-8.
/// The pretty string is composed of the two-digit upper-case hex value of each
/// byte, separated by single spaces and enclosed in square brackets. E.g., the
/// result of to_pretty_string("123") is "[31 32 33]" and the result for an
/// empty binary is "[]".
template <typename TExec>
struct ToPrettyStringVarbinaryFunction {
  VELOX_DEFINE_FUNCTION_TYPES(TExec);

  /// Writes the pretty string of 'input' into 'result'.
  /// @param result Output string writer.
  /// @param input Pointer to the binary value; nullptr for a null input.
  template <typename TInput>
  void callNullable(out_type<Varchar>& result, const TInput* input) {
    if (input == nullptr) {
      result.setNoCopy(detail::kNull);
      return;
    }

    static constexpr char kHexDigits[] = "0123456789ABCDEF";

    const size_t numBytes = input->size();
    // Two brackets, two hex digits per byte and one space between each pair of
    // adjacent bytes: 2 + 2 * n + (n - 1) = 1 + 3 * n for n > 0. An empty
    // input still needs room for both brackets.
    result.resize(numBytes == 0 ? 2 : 1 + 3 * numBytes);

    const char* bytes = input->data();
    char* pos = result.data();
    *pos++ = '[';
    for (size_t i = 0; i < numBytes; ++i) {
      if (i > 0) {
        *pos++ = ' ';
      }
      // Go through an unsigned byte so that values >= 0x80 are not
      // sign-extended, which would produce more than two hex digits.
      const auto byte = static_cast<uint8_t>(bytes[i]);
      *pos++ = kHexDigits[byte >> 4];
      *pos++ = kHexDigits[byte & 0x0F];
    }
    *pos = ']';
  }
};

/// Returns pretty string for Timestamp. The only difference from
/// cast(timestamp as string) is:
/// 1) A null input is printed as "NULL" instead of producing a null output.
template <typename TExec>
struct ToPrettyStringTimestampFunction {
  VELOX_DEFINE_FUNCTION_TYPES(TExec);

  /// Resolves the session time zone, when one is configured, and caches the
  /// maximum output length for the formatting options in use.
  void initialize(
      const std::vector<TypePtr>& /*inputTypes*/,
      const core::QueryConfig& config,
      const arg_type<Timestamp>* /*timestamp*/) {
    auto sessionTimezone = config.sessionTimezone();
    if (!sessionTimezone.empty()) {
      options_.timeZone = tz::locateZone(sessionTimezone);
    }
    timestampRowSize_ = getMaxStringLength(options_);
  }

  /// Writes the pretty string of 'timestamp' into 'result'.
  /// @param result Output string writer.
  /// @param timestamp Pointer to the value; nullptr for a null input.
  /// @return Status::Invalid carrying the exception message if the time zone
  /// adjustment or the formatting throws; Status::OK otherwise.
  Status callNullable(
      out_type<Varchar>& result,
      const arg_type<Timestamp>* timestamp) {
    if (timestamp == nullptr) {
      result.setNoCopy(detail::kNull);
      return Status::OK();
    }

    // Work on a copy: the time zone adjustment mutates the value.
    Timestamp adjusted(*timestamp);
    // NOTE(review): a reviewer asked whether the date and timestamp APIs
    // should return a status instead of being wrapped in try-catch.
    try {
      if (options_.timeZone) {
        adjusted.toTimezone(*(options_.timeZone));
      }
      // Format directly into the output buffer, then trim to the real size.
      result.reserve(timestampRowSize_);
      const auto written =
          Timestamp::tsToStringView(adjusted, options_, result.data());
      result.resize(written.size());
    } catch (const std::exception& e) {
      return Status::Invalid(e.what());
    }
    return Status::OK();
  }

 private:
  // Formatting options; 'timeZone' is filled in by initialize() when a session
  // time zone is configured.
  TimestampToStringOptions options_ = {
      .precision = TimestampToStringOptions::Precision::kMicroseconds,
      .leadingPositiveSign = true,
      .skipTrailingZeros = true,
      .zeroPaddingYear = true,
      .dateTimeSeparator = ' ',
  };
  // Value of getMaxStringLength(options_), computed in initialize().
  std::string::size_type timestampRowSize_;
};

/// Returns pretty string for short decimal and long decimal. It has one
/// difference with cast(decimal as string):
/// 1) It prints null input as "NULL" rather than producing null output.
template <typename TExec>
struct ToPrettyStringDecimalFunction {
VELOX_DEFINE_FUNCTION_TYPES(TExec);

template <typename A>
void initialize(
const std::vector<TypePtr>& inputTypes,
const core::QueryConfig& /*config*/,
A* /*a*/) {
auto [precision, scale] = getDecimalPrecisionScale(*inputTypes[0]);
precision_ = precision;
scale_ = scale;
maxRowSize_ = velox::DecimalUtil::maxStringViewSize(precision, scale);
}

template <typename TInput>
void callNullable(out_type<Varchar>& result, const TInput* input) {
if (input) {
if (StringView::isInline(maxRowSize_)) {
DecimalUtil::castToString<TInput>(
*input, scale_, maxRowSize_, inlined_);
result.setNoCopy(inlined_);
} else {
result.reserve(maxRowSize_);
auto actualSize = DecimalUtil::castToString<TInput>(
*input, scale_, maxRowSize_, result.data());
result.resize(actualSize);
}
} else {
result.setNoCopy(detail::kNull);
}
}

private:
uint8_t precision_;
uint8_t scale_;
int32_t maxRowSize_;
char inlined_[StringView::kInlineSize];
};
} // namespace facebook::velox::functions::sparksql
1 change: 1 addition & 0 deletions velox/functions/sparksql/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ add_executable(
SplitTest.cpp
StringTest.cpp
StringToMapTest.cpp
ToPrettyStringTest.cpp
UnscaledValueFunctionTest.cpp
UuidTest.cpp
XxHash64Test.cpp)
Expand Down
Loading
Loading