Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[YQL-17103] Support StartsWith with pg types in extract_predicate library #854

Merged
merged 4 commits into from
Jan 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,18 @@

**Сигнатуры**
```
StartsWith(Utf8, Utf8)->Bool
StartsWith(Utf8[?], Utf8[?])->Bool?
StartsWith(String, String)->Bool
StartsWith(String[?], String[?])->Bool?
StartsWith(T str, U prefix)->Bool[?]

EndsWith(Utf8, Utf8)->Bool
EndsWith(Utf8[?], Utf8[?])->Bool?
EndsWith(String, String)->Bool
EndsWith(String[?], String[?])->Bool?
EndsWith(T str, U suffix)->Bool[?]
```

Обязательные аргументы:

* Исходная строка;
* Искомая подстрока.

Аргументы могут быть типов `String` или `Utf8` и могут быть опциональными.
Аргументы должны иметь тип `String`/`Utf8` (или опциональный `String`/`Utf8`) либо строковый PostgreSQL тип (`PgText`/`PgBytea`/`PgVarchar`).
Результатом функции является опциональный Bool, за исключением случая, когда оба аргумента неопциональные – в этом случае возвращается Bool.

**Примеры**
``` yql
Expand All @@ -35,3 +30,6 @@ SELECT StartsWith("abcd", NULL); -- null
``` yql
SELECT EndsWith(NULL, Utf8("")); -- null
```
``` yql
SELECT StartsWith("abc_efg"u, "abc"p) AND EndsWith("abc_efg", "efg"pv); -- true
```
10 changes: 10 additions & 0 deletions ydb/library/yql/core/common_opt/yql_co_simple1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4665,6 +4665,16 @@ void RegisterCoSimpleCallables1(TCallableOptimizerMap& map) {
map["IsDistinctFrom"] = std::bind(&OptimizeDistinctFrom<false>, _1, _2);

map["StartsWith"] = map["EndsWith"] = map["StringContains"] = [](const TExprNode::TPtr& node, TExprContext& ctx, TOptimizeContext& /*optCtx*/) {
if (node->Head().GetTypeAnn()->GetKind() == ETypeAnnotationKind::Pg || node->Tail().GetTypeAnn()->GetKind() == ETypeAnnotationKind::Pg) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why this overload is needed?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This overload helps to simplify logic of predicate analysis. This way we make StartsWith with pg types to look exactly the same as with plain YQL types (no intermediate FromPg is needed), so the code which looks for StartsWith over Member() call will work without pg-specific tweaks.

Also, accepting PG types here looks like a nice feature to me.

TExprNodeList converted;
for (auto& child : node->ChildrenList()) {
const bool isPg = child->GetTypeAnn()->GetKind() == ETypeAnnotationKind::Pg;
converted.emplace_back(ctx.WrapByCallableIf(isPg, "FromPg", std::move(child)));
}
YQL_CLOG(DEBUG, Core) << "Converting Pg strings to YQL strings in " << node->Content();
return ctx.ChangeChildren(*node, std::move(converted));
}

if (node->Tail().IsCallable("String") && node->Tail().Head().Content().empty()) {
YQL_CLOG(DEBUG, Core) << node->Content() << " with empty string in second argument";
if (node->GetTypeAnn()->GetKind() == ETypeAnnotationKind::Optional) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "extract_predicate_impl.h"

#include <ydb/library/yql/core/type_ann/type_ann_pg.h>
#include <ydb/library/yql/core/yql_expr_type_annotation.h>
#include <ydb/library/yql/core/yql_opt_utils.h>
#include <ydb/library/yql/core/yql_expr_constraint.h>
Expand Down Expand Up @@ -781,6 +782,17 @@ TExprNode::TPtr OptimizeNodeForRangeExtraction(const TExprNode::TPtr& node, cons
}
}

if (node->IsCallable("StartsWith")) {
if (node->Head().IsCallable("FromPg")) {
YQL_CLOG(DEBUG, Core) << "Get rid of FromPg() in " << node->Content() << " first argument";
return ctx.ChangeChild(*node, 0, node->Head().HeadPtr());
}
if (node->Tail().GetTypeAnn()->GetKind() == ETypeAnnotationKind::Pg) {
YQL_CLOG(DEBUG, Core) << "Convert second argument of " << node->Content() << " from PG type";
return ctx.ChangeChild(*node, 1, ctx.NewCallable(node->Tail().Pos(), "FromPg", {node->TailPtr()}));
}
}

return node;
}

Expand Down Expand Up @@ -911,13 +923,22 @@ TExprNode::TPtr BuildSingleComputeRange(const TStructExprType& rowType,

if (opNode->IsCallable("StartsWith")) {
YQL_ENSURE(keys.size() == 1);
return ctx.Builder(pos)
const bool keyIsPg = firstKeyType->GetKind() == ETypeAnnotationKind::Pg;
const TTypeAnnotationNode* rangeForType = firstKeyType;
if (keyIsPg) {
const TTypeAnnotationNode* yqlType = NTypeAnnImpl::FromPgImpl(pos, firstKeyType, ctx);
YQL_ENSURE(yqlType);
rangeForType = yqlType;
YQL_ENSURE(opNode->Tail().GetTypeAnn()->GetKind() != ETypeAnnotationKind::Pg);
}
auto rangeForNode = ctx.Builder(pos)
.Callable("RangeFor")
.Atom(0, hasNot ? "NotStartsWith" : "StartsWith", TNodeFlags::Default)
.Add(1, opNode->TailPtr())
.Add(2, ExpandType(pos, *firstKeyType, ctx))
.Add(2, ExpandType(pos, *rangeForType, ctx))
.Seal()
.Build();
return ctx.WrapByCallableIf(keyIsPg, "RangeToPg", std::move(rangeForNode));
}

if (opNode->IsCallable("SqlIn")) {
Expand Down
1 change: 1 addition & 0 deletions ydb/library/yql/core/extract_predicate/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ SRCS(

PEERDIR(
ydb/library/yql/core/services
ydb/library/yql/core/type_ann
)

YQL_LAST_ABI_VERSION()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7602,6 +7602,7 @@ struct TPeepHoleRules {
{"RangeEmpty", &ExpandRangeEmpty},
{"AsRange", &ExpandAsRange},
{"RangeFor", &ExpandRangeFor},
{"RangeToPg", &ExpandRangeToPg},
{"ToFlow", &DropToFlowDeps},
{"CheckedAdd", &ExpandCheckedAdd},
{"CheckedSub", &ExpandCheckedSub},
Expand Down
71 changes: 66 additions & 5 deletions ydb/library/yql/core/type_ann/type_ann_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3225,14 +3225,32 @@ namespace NTypeAnnImpl {
return IGraphTransformer::TStatus::Repeat;
}

bool isOptional1, isOptional2;
if (const TDataExprType *dataTypeOne, *dataTypeTwo;
!(EnsureDataOrOptionalOfData(input->Head(), isOptional1, dataTypeOne, ctx.Expr) && EnsureDataOrOptionalOfData(input->Tail(), isOptional2, dataTypeTwo, ctx.Expr)
&& EnsureStringOrUtf8Type(input->Head().Pos(), *dataTypeOne, ctx.Expr) && EnsureStringOrUtf8Type(input->Tail().Pos(), *dataTypeTwo, ctx.Expr))) {
if (!EnsureComputable(input->Head(), ctx.Expr) || !EnsureComputable(input->Tail(), ctx.Expr)) {
return IGraphTransformer::TStatus::Error;
}

if (isOptional1 || isOptional2)
bool hasOptionals = false;
for (auto& child : input->ChildrenList()) {
const TTypeAnnotationNode* type = child->GetTypeAnn();
if (type->GetKind() == ETypeAnnotationKind::Pg) {
type = FromPgImpl(child->Pos(), type, ctx.Expr);
if (!type) {
return IGraphTransformer::TStatus::Error;
}
}
bool isOptional = false;
const TDataExprType* dataType = nullptr;
if (!IsDataOrOptionalOfData(type, isOptional, dataType) ||
!(dataType->GetSlot() == EDataSlot::String || dataType->GetSlot() == EDataSlot::Utf8))
{
ctx.Expr.AddError(TIssue(ctx.Expr.GetPosition(child->Pos()), TStringBuilder()
<< "Expected (optional) string/utf8 or corresponding Pg type, but got: " << *child->GetTypeAnn()));
return IGraphTransformer::TStatus::Error;
}
hasOptionals = hasOptionals || isOptional;
}

if (hasOptionals)
input->SetTypeAnn(ctx.Expr.MakeType<TOptionalExprType>(ctx.Expr.MakeType<TDataExprType>(EDataSlot::Bool)));
else
input->SetTypeAnn(ctx.Expr.MakeType<TDataExprType>(EDataSlot::Bool));
Expand Down Expand Up @@ -11107,6 +11125,48 @@ template <NKikimr::NUdf::EDataSlot DataSlot>
return IGraphTransformer::TStatus::Ok;
}

// Type annotation for the RangeToPg callable.
//
// Expects exactly one argument: a list whose item type passes EnsureValidRange.
// The result type is a list of two-element tuples of a rebuilt boundary tuple, where
// every even-indexed boundary component is kept as is and every odd-indexed component
// Optional<T> is replaced with Optional<ToPgImpl(T)>.
//
// Returns TStatus::Error when a check fails or when ToPgImpl yields no type;
// otherwise sets the type annotation on `input` and returns TStatus::Ok.
IGraphTransformer::TStatus RangeToPgWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx) {
    Y_UNUSED(output);

    // The list check is evaluated only once the argument count is known to be right.
    if (!EnsureArgsCount(*input, 1, ctx.Expr) || !EnsureListType(input->Head(), ctx.Expr)) {
        return IGraphTransformer::TStatus::Error;
    }

    const auto* sourceRangeType = input->Head().GetTypeAnn()->Cast<TListExprType>()->GetItemType();
    if (!EnsureValidRange(input->Head().Pos(), sourceRangeType, ctx.Expr)) {
        return IGraphTransformer::TStatus::Error;
    }

    // Only the first boundary of the range is inspected; the same rebuilt boundary type
    // is used for both tuple slots of the resulting range.
    const auto& sourceComponents = sourceRangeType->Cast<TTupleExprType>()->GetItems().front()
        ->Cast<TTupleExprType>()->GetItems();

    TTypeAnnotationNode::TListType pgComponents;
    pgComponents.reserve(sourceComponents.size());
    for (size_t idx = 0; idx < sourceComponents.size(); ++idx) {
        const TTypeAnnotationNode* component = sourceComponents[idx];
        if (idx % 2 != 0) {
            // Odd positions hold Optional<key>: convert the key type and wrap it back into Optional.
            const auto* yqlKeyType = component->Cast<TOptionalExprType>()->GetItemType();
            const auto* pgKeyType = ToPgImpl(input->Head().Pos(), yqlKeyType, ctx.Expr);
            if (!pgKeyType) {
                return IGraphTransformer::TStatus::Error;
            }
            component = ctx.Expr.MakeType<TOptionalExprType>(pgKeyType);
        }
        pgComponents.push_back(component);
    }

    const TTypeAnnotationNode* pgBoundaryType = ctx.Expr.MakeType<TTupleExprType>(pgComponents);
    const TTypeAnnotationNode* pgRangeType =
        ctx.Expr.MakeType<TTupleExprType>(TTypeAnnotationNode::TListType{pgBoundaryType, pgBoundaryType});
    input->SetTypeAnn(ctx.Expr.MakeType<TListExprType>(pgRangeType));
    return IGraphTransformer::TStatus::Ok;
}

IGraphTransformer::TStatus RangeCreateWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx) {
Y_UNUSED(output);

Expand Down Expand Up @@ -12164,6 +12224,7 @@ template <NKikimr::NUdf::EDataSlot DataSlot>
ExtFunctions["OrderedSqlRename"] = &SqlRenameWrapper;

Functions["AsRange"] = &AsRangeWrapper;
Functions["RangeToPg"] = &RangeToPgWrapper;
Functions["RangeCreate"] = &RangeCreateWrapper;
Functions["RangeEmpty"] = &RangeEmptyWrapper;
Functions["RangeFor"] = &RangeForWrapper;
Expand Down
47 changes: 47 additions & 0 deletions ydb/library/yql/core/yql_opt_range.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -519,4 +519,51 @@ TExprNode::TPtr ExpandRangeFor(const TExprNode::TPtr& node, TExprContext& ctx) {
return result;
}

// Expands a RangeToPg callable into an expression composed of OrderedMap, StaticMap, Nth, Map and ToPg.
//
// The single input (node->Head()) is fed to OrderedMap; every item ("range") goes through StaticMap,
// whose lambda rebuilds each "boundary" tuple component by component:
//   * even-indexed components are copied unchanged via Nth;
//   * odd-indexed components are taken via Nth and passed through Map with a lambda applying ToPg.
//
// Fails via YQL_ENSURE if `node` is not a RangeToPg callable.
TExprNode::TPtr ExpandRangeToPg(const TExprNode::TPtr& node, TExprContext& ctx) {
YQL_ENSURE(node->IsCallable("RangeToPg"));
// Number of components in a boundary tuple, read from the input type annotation:
// list item type -> tuple -> first element (itself a tuple) -> its size.
const size_t numComponents = node->Head().GetTypeAnn()->Cast<TListExprType>()->GetItemType()->
Cast<TTupleExprType>()->GetItems().front()->Cast<TTupleExprType>()->GetSize();
return ctx.Builder(node->Pos())
.Callable("OrderedMap")
.Add(0, node->HeadPtr())
.Lambda(1)
.Param("range")
.Callable("StaticMap")
.Arg(0, "range")
.Lambda(1)
.Param("boundary")
.List()
.Do([&](TExprNodeBuilder& parent) -> TExprNodeBuilder& {
for (size_t i = 0; i < numComponents; ++i) {
if (i % 2 == 0) {
// Even position: pass the component through untouched.
parent
.Callable(i, "Nth")
.Arg(0, "boundary")
.Atom(1, i)
.Seal();
} else {
// Odd position: apply ToPg to the component's content through Map.
parent
.Callable(i, "Map")
.Callable(0, "Nth")
.Arg(0, "boundary")
.Atom(1, i)
.Seal()
.Lambda(1)
.Param("unwrapped")
.Callable("ToPg")
.Arg(0, "unwrapped")
.Seal()
.Seal()
.Seal();
}
}
return parent;
})
.Seal()
.Seal()
.Seal()
.Seal()
.Seal()
.Build();
}
}
1 change: 1 addition & 0 deletions ydb/library/yql/core/yql_opt_range.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ namespace NYql {
TExprNode::TPtr ExpandRangeEmpty(const TExprNode::TPtr& node, TExprContext& ctx);
TExprNode::TPtr ExpandAsRange(const TExprNode::TPtr& node, TExprContext& ctx);
TExprNode::TPtr ExpandRangeFor(const TExprNode::TPtr& node, TExprContext& ctx);
TExprNode::TPtr ExpandRangeToPg(const TExprNode::TPtr& node, TExprContext& ctx);

}

22 changes: 22 additions & 0 deletions ydb/library/yql/tests/sql/dq_file/part5/canondata/result.json
Original file line number Diff line number Diff line change
Expand Up @@ -2210,6 +2210,28 @@
}
],
"test.test[pg-select_win_count-default.txt-Results]": [],
"test.test[pg-str_lookup_pg-default.txt-Analyze]": [
{
"checksum": "a48ccc9922567dfee1170d2c2df45b6e",
"size": 2153,
"uri": "https://{canondata_backend}/1784826/cbc63541f63d78da712c6e11ae70c4ee10dfb428/resource.tar.gz#test.test_pg-str_lookup_pg-default.txt-Analyze_/plan.txt"
}
],
"test.test[pg-str_lookup_pg-default.txt-Debug]": [
{
"checksum": "851bbcc3bbf2c5f21c51a7d61851aba1",
"size": 1657,
"uri": "https://{canondata_backend}/1784826/cbc63541f63d78da712c6e11ae70c4ee10dfb428/resource.tar.gz#test.test_pg-str_lookup_pg-default.txt-Debug_/opt.yql_patched"
}
],
"test.test[pg-str_lookup_pg-default.txt-Plan]": [
{
"checksum": "a48ccc9922567dfee1170d2c2df45b6e",
"size": 2153,
"uri": "https://{canondata_backend}/1784826/cbc63541f63d78da712c6e11ae70c4ee10dfb428/resource.tar.gz#test.test_pg-str_lookup_pg-default.txt-Plan_/plan.txt"
}
],
"test.test[pg-str_lookup_pg-default.txt-Results]": [],
"test.test[pg-sublink_order_any_corr-default.txt-Analyze]": [
{
"checksum": "b4dd508a329723c74293d80f0278c705",
Expand Down
28 changes: 28 additions & 0 deletions ydb/library/yql/tests/sql/sql2yql/canondata/result.json
Original file line number Diff line number Diff line change
Expand Up @@ -3947,6 +3947,13 @@
"uri": "https://{canondata_backend}/1773845/fe2146df711e0729e3c3cc1bc9b2c5b1fdfcfea1/resource.tar.gz#test_sql2yql.test_compute_range-pg_sqlin_/sql.yql"
}
],
"test_sql2yql.test[compute_range-pg_startswith]": [
{
"checksum": "f2e42e95b7b84fd210244e0c61c3f614",
"size": 4450,
"uri": "https://{canondata_backend}/1031349/96841816c51116681477e138bb81b6493013c777/resource.tar.gz#test_sql2yql.test_compute_range-pg_startswith_/sql.yql"
}
],
"test_sql2yql.test[compute_range-preserve_rest_predicates_order]": [
{
"checksum": "4915841ad83886d7f63fe939e0848687",
Expand Down Expand Up @@ -12067,6 +12074,13 @@
"uri": "https://{canondata_backend}/1599023/af9c2f81df0601cf266a0926b5ce73b6101b9115/resource.tar.gz#test_sql2yql.test_pg-single_input_filter_over_join_/sql.yql"
}
],
"test_sql2yql.test[pg-str_lookup_pg]": [
{
"checksum": "15ae2647f3110534a4e0e10d89a19e35",
"size": 6373,
"uri": "https://{canondata_backend}/1775059/5625478e977a363be64a17bebddbd8ed18706eac/resource.tar.gz#test_sql2yql.test_pg-str_lookup_pg_/sql.yql"
}
],
"test_sql2yql.test[pg-struct_tuple_cast]": [
{
"checksum": "e99eaf940d72eb246c5fe60c7f2f687d",
Expand Down Expand Up @@ -21251,6 +21265,13 @@
"uri": "https://{canondata_backend}/1773845/fe2146df711e0729e3c3cc1bc9b2c5b1fdfcfea1/resource.tar.gz#test_sql_format.test_compute_range-pg_sqlin_/formatted.sql"
}
],
"test_sql_format.test[compute_range-pg_startswith]": [
{
"checksum": "b06b88f1965f643fea24cb7e5d8d0459",
"size": 955,
"uri": "https://{canondata_backend}/1031349/96841816c51116681477e138bb81b6493013c777/resource.tar.gz#test_sql_format.test_compute_range-pg_startswith_/formatted.sql"
}
],
"test_sql_format.test[compute_range-preserve_rest_predicates_order]": [
{
"checksum": "77cd36176a336f2a79ee10f5697b124f",
Expand Down Expand Up @@ -28055,6 +28076,13 @@
"uri": "https://{canondata_backend}/1880306/64654158d6bfb1289c66c626a8162239289559d0/resource.tar.gz#test_sql_format.test_pg-simple_ops_/formatted.sql"
}
],
"test_sql_format.test[pg-str_lookup_pg]": [
{
"checksum": "f1954f2bb0c2bf59abe9752284f424cc",
"size": 637,
"uri": "https://{canondata_backend}/1775059/5625478e977a363be64a17bebddbd8ed18706eac/resource.tar.gz#test_sql_format.test_pg-str_lookup_pg_/formatted.sql"
}
],
"test_sql_format.test[pg-struct_tuple_cast]": [
{
"checksum": "d77766b8458d94c8c4af56c3d439d2dd",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/* syntax version 1 */
/* postgres can not */
/* dq can not */
/* dqfile can not */
/* yt can not */
pragma warning("disable", "4510");
pragma warning("disable", "1108");

-- like 'aaaa'
select YQL::RangeComputeFor(
Struct<a:PgInt4,b:PgText>,
($row) -> (StartsWith(FromPg($row.b), 'aaaa') ?? false),
AsTuple(AsAtom("b"))
);

-- not like 'aaaa'
select YQL::RangeComputeFor(
Struct<a:PgInt4,b:PgText>,
($row) -> (not (StartsWith(FromPg($row.b), 'aaaa') ?? true)),
AsTuple(AsAtom("b"))
);


-- like <invalid utf8>
select YQL::RangeComputeFor(
Struct<a:PgInt4,b:PgText>,
($row) -> (StartsWith(FromPg($row.b), 'a\xf5') ?? false),
AsTuple(AsAtom("b"))
);

-- not like <invalid utf8>
select YQL::RangeComputeFor(
Struct<a:PgInt4,b:PgText>,
($row) -> (not (StartsWith(FromPg($row.b), 'a\xf5') ?? true)),
AsTuple(AsAtom("b"))
);
Loading
Loading