Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Kernel][Expressions] Add support for LIKE expression #3103

Merged
merged 10 commits into from
May 23, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@
* <li>Since version: 3.2.0</li>
* </ul>
* </li>
* <li>Name: <code>LIKE</code>
* <ul>
* <li>SQL semantic: <code>expr LIKE expr</code></li>
* <li>Since version: 3.3.0</li>
* </ul>
* </li>
* </ol>
*
* @since 3.0.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,15 @@ public static UnsupportedOperationException unsupportedExpressionException(
reason);
return new UnsupportedOperationException(message);
}

/**
 * Builds the exception reported when the pattern of a LIKE expression contains an invalid
 * escape sequence.
 *
 * @param pattern the LIKE pattern containing the invalid escape sequence
 * @param index character index within {@code pattern} of the offending escape
 * @return an {@link IllegalArgumentException} whose message includes the pattern and the index
 */
public static IllegalArgumentException invalidEscapeSequence(String pattern, int index) {
    final String message = format(
        "LIKE expression has invalid escape sequence '%s' at index %d",
        pattern,
        index);
    return new IllegalArgumentException(message);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.util.stream.Collectors;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static java.util.stream.Collectors.toList;

import io.delta.kernel.data.ColumnVector;
import io.delta.kernel.data.ColumnarBatch;
Expand All @@ -31,8 +32,6 @@
import static io.delta.kernel.internal.util.ExpressionUtils.getRight;
import static io.delta.kernel.internal.util.ExpressionUtils.getUnaryChild;
import static io.delta.kernel.internal.util.Preconditions.checkArgument;


import io.delta.kernel.defaults.internal.data.vector.DefaultBooleanVector;
import io.delta.kernel.defaults.internal.data.vector.DefaultConstantVector;
import static io.delta.kernel.defaults.internal.DefaultEngineErrors.unsupportedExpressionException;
Expand Down Expand Up @@ -280,6 +279,21 @@ ExpressionTransformResult visitCoalesce(ScalarExpression coalesce) {
);
}

/**
 * Transforms every child of the given {@code LIKE} predicate, then passes the transformed
 * expressions and their output types to {@link LikeExpressionEvaluator#validateAndTransform}.
 * The result is always typed as boolean.
 */
@Override
ExpressionTransformResult visitLike(final Predicate like) {
    final List<ExpressionTransformResult> transformedOperands =
        like.getChildren().stream().map(this::visit).collect(toList());

    final List<Expression> operandExpressions =
        transformedOperands.stream().map(operand -> operand.expression).collect(toList());

    final Predicate validatedLike = LikeExpressionEvaluator.validateAndTransform(
        like,
        operandExpressions,
        transformedOperands.stream().map(operand -> operand.outputType).collect(toList()));

    return new ExpressionTransformResult(validatedLike, BooleanType.BOOLEAN);
}

private Predicate validateIsPredicate(
Expression baseExpression,
ExpressionTransformResult result) {
Expand Down Expand Up @@ -560,6 +574,15 @@ ColumnVector visitCoalesce(ScalarExpression coalesce) {
);
}

/**
 * Evaluates every child of the given {@code LIKE} predicate and hands the resulting
 * vectors, in child order, to {@link LikeExpressionEvaluator#eval}.
 */
@Override
ColumnVector visitLike(final Predicate like) {
    final List<ColumnVector> evaluatedOperands =
        like.getChildren().stream().map(this::visit).collect(toList());
    return LikeExpressionEvaluator.eval(evaluatedOperands);
}

/**
* Utility method to evaluate inputs to the binary input expression. Also validates the
* evaluated expression result {@link ColumnVector}s are of the same size.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ abstract class ExpressionVisitor<R> {

/** Visits a scalar expression named {@code COALESCE}. */
abstract R visitCoalesce(ScalarExpression ifNull);

/**
 * Visits a {@code LIKE} expression. The dispatcher passes it as a {@link Predicate} built
 * from the scalar expression's name and children.
 */
abstract R visitLike(Predicate predicate);

final R visit(Expression expression) {
if (expression instanceof PartitionValueExpression) {
return visitPartitionValue((PartitionValueExpression) expression);
Expand Down Expand Up @@ -105,6 +107,8 @@ private R visitScalarExpression(ScalarExpression expression) {
return visitIsNull(new Predicate(name, children));
case "COALESCE":
return visitCoalesce(expression);
case "LIKE":
return visitLike(new Predicate(name, children));
default:
throw new UnsupportedOperationException(
String.format("Scalar expression `%s` is not supported.", name));
Expand All @@ -114,8 +118,8 @@ private R visitScalarExpression(ScalarExpression expression) {
private static Predicate elemAsPredicate(List<Expression> expressions, int index) {
if (expressions.size() <= index) {
throw new RuntimeException(
String.format("Trying to access invalid entry (%d) in list %s", index,
expressions.stream().map(Object::toString).collect(joining(","))));
String.format("Trying to access invalid entry (%d) in list %s", index,
expressions.stream().map(Object::toString).collect(joining(","))));
}
Expression elemExpression = expressions.get(index);
if (!(elemExpression instanceof Predicate)) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
/*
* Copyright (2023) The Delta Lake Project Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.delta.kernel.defaults.internal.expressions;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;

import io.delta.kernel.data.ColumnVector;
import io.delta.kernel.expressions.Expression;
import io.delta.kernel.expressions.Literal;
import io.delta.kernel.expressions.Predicate;
import io.delta.kernel.types.BooleanType;
import io.delta.kernel.types.DataType;
import io.delta.kernel.types.StringType;
import io.delta.kernel.internal.util.Utils;

import static io.delta.kernel.defaults.internal.DefaultEngineErrors.invalidEscapeSequence;
import static io.delta.kernel.defaults.internal.DefaultEngineErrors.unsupportedExpressionException;

/**
* Utility methods to evaluate {@code like} expression.
*/
public class LikeExpressionEvaluator {
private LikeExpressionEvaluator() {
}

static Predicate validateAndTransform(
Predicate like,
List<Expression> childrenExpressions,
List<DataType> childrenOutputTypes) {
int size = childrenExpressions.size();
if (size < 2 || size > 3) {
throw unsupportedExpressionException(like,
"Invalid number of inputs to LIKE expression. " +
"Example usage: LIKE(column, 'test%'), LIKE(column, 'test\\[%', '\\')");
}

Expression left = childrenExpressions.get(0);
DataType leftOutputType = childrenOutputTypes.get(0);
Expression right = childrenExpressions.get(1);
DataType rightOutputType = childrenOutputTypes.get(1);
Expression escapeCharExpr = size == 3 ? childrenExpressions.get(2) : null;
DataType escapeCharOutputType = size == 3 ? childrenOutputTypes.get(2) : null;

if (!(StringType.STRING.equivalent(leftOutputType)
&& StringType.STRING.equivalent(rightOutputType))) {
throw unsupportedExpressionException(like,
"LIKE is only supported for string type expressions");
}

if (escapeCharExpr != null &&
(!(escapeCharExpr instanceof Literal &&
StringType.STRING.equivalent(escapeCharOutputType)))) {
throw unsupportedExpressionException(like,
"LIKE expects escape token expression to be a literal of String type");
}

Literal literal = (Literal) escapeCharExpr;
if (literal != null &&
literal.getValue().toString().length() != 1) {
throw unsupportedExpressionException(like,
"LIKE expects escape token to be a single character");
}

List<Expression> children = new ArrayList<>(Arrays.asList(left, right));
if(Objects.nonNull(escapeCharExpr)) {
children.add(escapeCharExpr);
}
return new Predicate(like.getName(), children);
}

static ColumnVector eval(List<ColumnVector> children) {
final char DEFAULT_ESCAPE_CHAR = '\\';

return new ColumnVector() {
final ColumnVector escapeCharVector =
children.size() == 3 ?
children.get(2) :
null;
final ColumnVector left = children.get(0);
final ColumnVector right = children.get(1);

Character escapeChar = null;

public void initEscapeCharIfRequired() {
if (escapeChar == null) {
escapeChar =
escapeCharVector != null && !escapeCharVector.getString(0).isEmpty() ?
escapeCharVector.getString(0).charAt(0) :
DEFAULT_ESCAPE_CHAR;
}
}

@Override
public DataType getDataType() {
return BooleanType.BOOLEAN;
}

@Override
public int getSize() {
return left.getSize();
}

@Override
public void close() {
Utils.closeCloseables(left, right);
}

@Override
public boolean getBoolean(int rowId) {
initEscapeCharIfRequired();
return isLike(left.getString(rowId), right.getString(rowId), escapeChar);
}

@Override
public boolean isNullAt(int rowId) {
return left.isNullAt(rowId) || right.isNullAt(rowId);
}

public boolean isLike(String input, String pattern, char escape) {
if (!Objects.isNull(input) && !Objects.isNull(pattern)) {
String regex = escapeLikeRegex(pattern, escape);
return input.matches(regex);
}
return false;
}
};
}

/**
* utility method to convert a predicate pattern to a java regex
* @param pattern the pattern used in the expression
* @param escape escape character to use
* @return java regex
*/
private static String escapeLikeRegex(String pattern, char escape) {
final int len = pattern.length();
final StringBuilder javaPattern = new StringBuilder(len + len);
for (int i = 0; i < len; i++) {
char c = pattern.charAt(i);

if (c == escape) {
if (i == (pattern.length() - 1)) {
throw invalidEscapeSequence(pattern, i);
krishnanravi marked this conversation as resolved.
Show resolved Hide resolved
}
char nextChar = pattern.charAt(i + 1);
if ((nextChar == '_')
|| (nextChar == '%')
|| (nextChar == escape)) {
javaPattern.append(Pattern.quote(Character.toString(nextChar)));
i++;
} else {
throw invalidEscapeSequence(pattern, i);
}
} else if (c == '_') {
javaPattern.append('.');
} else if (c == '%') {
javaPattern.append(".*");
} else {
javaPattern.append(Pattern.quote(Character.toString(c)));
}

}
return "(?s)" + javaPattern;
}
}
Loading
Loading