Skip to content

Commit

Permalink
First cut at adding case insensitive flag for term, terms, termsInSet…
Browse files Browse the repository at this point in the history
…, prefix, wildcard queries

Closes #61546
  • Loading branch information
markharwood committed Sep 18, 2020
1 parent 2b0418d commit 685a419
Show file tree
Hide file tree
Showing 46 changed files with 902 additions and 134 deletions.
4 changes: 4 additions & 0 deletions docs/reference/query-dsl/prefix-query.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ provided `<field>`.
(Optional, string) Method used to rewrite the query. For valid values and more
information, see the <<query-dsl-multi-term-rewrite, `rewrite` parameter>>.

`case_insensitive`::
(Optional, boolean) allows ASCII case insensitive matching of the
value with the indexed field values when set to true. Setting to false is disallowed.

[[prefix-query-notes]]
==== Notes

Expand Down
8 changes: 6 additions & 2 deletions docs/reference/query-dsl/term-query.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ Boost values are relative to the default value of `1.0`. A boost value between
`0` and `1.0` decreases the relevance score. A value greater than `1.0`
increases the relevance score.

`case_insensitive`::
(Optional, boolean) allows ASCII case insensitive matching of the
value with the indexed field values when set to true. Setting to false is disallowed.

[[term-query-notes]]
==== Notes

Expand All @@ -84,7 +88,7 @@ The `term` query does *not* analyze the search term. The `term` query only
searches for the *exact* term you provide. This means the `term` query may
return poor or no results when searching `text` fields.

To see the difference in search results, try the following example.
To see the difference in search results, try the following example.

. Create an index with a `text` field called `full_text`.
+
Expand Down Expand Up @@ -213,4 +217,4 @@ in the results.
}
----
// TESTRESPONSE[s/"took" : 1/"took" : $body.took/]
--
--
6 changes: 5 additions & 1 deletion docs/reference/query-dsl/wildcard-query.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ This parameter supports two wildcard operators:

WARNING: Avoid beginning patterns with `*` or `?`. This can increase
the iterations needed to find matching terms and slow search performance.
--
--

`boost`::
(Optional, float) Floating point number used to decrease or increase the
Expand All @@ -69,6 +69,10 @@ increases the relevance score.
(Optional, string) Method used to rewrite the query. For valid values and more information, see the
<<query-dsl-multi-term-rewrite, `rewrite` parameter>>.

`case_insensitive`::
(Optional, boolean) allows case insensitive matching of the
pattern with the indexed field values when set to true. Setting to false is disallowed.

[[wildcard-query-notes]]
==== Notes
===== Allow expensive queries
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -281,11 +281,11 @@ public Query existsQuery(QueryShardContext context) {
}

@Override
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
if (prefixField == null || prefixField.termLengthWithinBounds(value.length()) == false) {
return super.prefixQuery(value, method, context);
return super.prefixQuery(value, method, caseInsensitive, context);
} else {
final Query query = prefixField.prefixQuery(value, method, context);
final Query query = prefixField.prefixQuery(value, method, caseInsensitive, context);
if (method == null
|| method == MultiTermQuery.CONSTANT_SCORE_REWRITE
|| method == MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE) {
Expand Down Expand Up @@ -365,8 +365,11 @@ boolean termLengthWithinBounds(int length) {
}

@Override
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
if (value.length() >= minChars) {
if(caseInsensitive) {
return super.termQueryCaseInsensitive(value, context);
}
return super.termQuery(value, context);
}
List<Automaton> automata = new ArrayList<>();
Expand Down Expand Up @@ -507,11 +510,11 @@ public Query existsQuery(QueryShardContext context) {
}

@Override
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
if (prefixFieldType == null || prefixFieldType.termLengthWithinBounds(value.length()) == false) {
return super.prefixQuery(value, method, context);
return super.prefixQuery(value, method, caseInsensitive, context);
} else {
final Query query = prefixFieldType.prefixQuery(value, method, context);
final Query query = prefixFieldType.prefixQuery(value, method, caseInsensitive, context);
if (method == null
|| method == MultiTermQuery.CONSTANT_SCORE_REWRITE
|| method == MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,15 @@ public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int
}

@Override
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method,
boolean caseInsensitive, QueryShardContext context) {
throw new UnsupportedOperationException("[prefix] queries are not supported on [" + CONTENT_TYPE + "] fields.");
}

@Override
public Query wildcardQuery(String value,
@Nullable MultiTermQuery.RewriteMethod method,
boolean caseInsensitive,
QueryShardContext context) {
throw new UnsupportedOperationException("[wildcard] queries are not supported on [" + CONTENT_TYPE + "] fields.");
}
Expand Down
14 changes: 14 additions & 0 deletions server/src/main/java/org/elasticsearch/common/Strings.java
Original file line number Diff line number Diff line change
Expand Up @@ -879,4 +879,18 @@ public static String padStart(String s, int minimumLength, char c) {
return sb.toString();
}
}

public static String toLowercaseAscii(String in) {
StringBuilder out = new StringBuilder();
Iterator<Integer> iter = in.codePoints().iterator();
while (iter.hasNext()) {
int codepoint = iter.next();
if (codepoint > 128) {
out.appendCodePoint(codepoint);
} else {
out.appendCodePoint(Character.toLowerCase(codepoint));
}
}
return out.toString();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.common.lucene.search;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
* Helper functions for creating various forms of {@link AutomatonQuery}
*/
public class AutomatonQueries {



/** Build an automaton query accepting all terms with the specified prefix, ASCII case insensitive. */
public static Automaton caseInsensitivePrefix(String s) {
List<Automaton> list = new ArrayList<>();
Iterator<Integer> iter = s.codePoints().iterator();
while (iter.hasNext()) {
list.add(toCaseInsensitiveChar(iter.next(), Integer.MAX_VALUE));
}
list.add(Automata.makeAnyString());

Automaton a = Operations.concatenate(list);
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);
return a;
}


/** Build an automaton query accepting all terms with the specified prefix, ASCII case insensitive. */
public static AutomatonQuery caseInsensitivePrefixQuery(Term prefix) {
return new AutomatonQuery(prefix, caseInsensitivePrefix(prefix.text()));
}

/** Build an automaton accepting all terms ASCII case insensitive. */
public static AutomatonQuery caseInsensitiveTermQuery(Term term) {
BytesRef prefix = term.bytes();
return new AutomatonQuery(term, toCaseInsensitiveString(prefix,Integer.MAX_VALUE));
}


/** Build an automaton matching a wildcard pattern, ASCII case insensitive. */
public static AutomatonQuery caseInsensitiveWildcardQuery(Term wildcardquery) {
return new AutomatonQuery(wildcardquery, toCaseInsensitiveWildcardAutomaton(wildcardquery,Integer.MAX_VALUE));
}


/** String equality with support for wildcards */
public static final char WILDCARD_STRING = '*';

/** Char equality with support for wildcards */
public static final char WILDCARD_CHAR = '?';

/** Escape character */
public static final char WILDCARD_ESCAPE = '\\';
/**
* Convert Lucene wildcard syntax into an automaton.
*/
@SuppressWarnings("fallthrough")
public static Automaton toCaseInsensitiveWildcardAutomaton(Term wildcardquery, int maxDeterminizedStates) {
List<Automaton> automata = new ArrayList<>();

String wildcardText = wildcardquery.text();

for (int i = 0; i < wildcardText.length();) {
final int c = wildcardText.codePointAt(i);
int length = Character.charCount(c);
switch(c) {
case WILDCARD_STRING:
automata.add(Automata.makeAnyString());
break;
case WILDCARD_CHAR:
automata.add(Automata.makeAnyChar());
break;
case WILDCARD_ESCAPE:
// add the next codepoint instead, if it exists
if (i + length < wildcardText.length()) {
final int nextChar = wildcardText.codePointAt(i + length);
length += Character.charCount(nextChar);
automata.add(Automata.makeChar(nextChar));
break;
} // else fallthru, lenient parsing with a trailing \
default:
automata.add(toCaseInsensitiveChar(c, maxDeterminizedStates));
}
i += length;
}

return Operations.concatenate(automata);
}

protected static Automaton toCaseInsensitiveString(BytesRef br, int maxDeterminizedStates) {
return toCaseInsensitiveString(br.utf8ToString(), maxDeterminizedStates);
}

public static Automaton toCaseInsensitiveString(String s, int maxDeterminizedStates) {
List<Automaton> list = new ArrayList<>();
Iterator<Integer> iter = s.codePoints().iterator();
while (iter.hasNext()) {
list.add(toCaseInsensitiveChar(iter.next(), maxDeterminizedStates));
}

Automaton a = Operations.concatenate(list);
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
return a;


}

protected static Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminizedStates) {
Automaton case1 = Automata.makeChar(codepoint);
// For now we only work with ASCII characters
if (codepoint > 128) {
return case1;
}
int altCase = Character.isLowerCase(codepoint) ? Character.toUpperCase(codepoint) : Character.toLowerCase(codepoint);
Automaton result;
if (altCase != codepoint) {
result = Operations.union(case1, Automata.makeChar(altCase));
result = MinimizationOperations.minimize(result, maxDeterminizedStates);
} else {
result = case1;
}
return result;
}
}
34 changes: 29 additions & 5 deletions server/src/main/java/org/elasticsearch/common/regex/Regex.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,39 @@ public static Automaton simpleMatchToAutomaton(String... patterns) {
* Match a String against the given pattern, supporting the following simple
* pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an
* arbitrary number of pattern parts), as well as direct equality.
* Matching is case sensitive.
*
* @param pattern the pattern to match against
* @param str the String to match
* @return whether the String matches the given pattern
*/
public static boolean simpleMatch(String pattern, String str) {
return simpleMatch(pattern, str, false);
}


/**
* Match a String against the given pattern, supporting the following simple
* pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an
* arbitrary number of pattern parts), as well as direct equality.
*
* @param pattern the pattern to match against
* @param str the String to match
* @param caseInsensitive true if ASCII case differences should be ignored
* @return whether the String matches the given pattern
*/
public static boolean simpleMatch(String pattern, String str, boolean caseInsensitive) {
if (pattern == null || str == null) {
return false;
}
if (caseInsensitive) {
pattern = Strings.toLowercaseAscii(pattern);
str = Strings.toLowercaseAscii(str);
}
return simpleMatchWithNormalizedStrings(pattern, str);
}

private static boolean simpleMatchWithNormalizedStrings(String pattern, String str) {
final int firstIndex = pattern.indexOf('*');
if (firstIndex == -1) {
return pattern.equals(str);
Expand All @@ -102,12 +126,12 @@ public static boolean simpleMatch(String pattern, String str) {
return str.regionMatches(str.length() - pattern.length() + 1, pattern, 1, pattern.length() - 1);
} else if (nextIndex == 1) {
// Double wildcard "**" - skipping the first "*"
return simpleMatch(pattern.substring(1), str);
return simpleMatchWithNormalizedStrings(pattern.substring(1), str);
}
final String part = pattern.substring(1, nextIndex);
int partIndex = str.indexOf(part);
while (partIndex != -1) {
if (simpleMatch(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) {
if (simpleMatchWithNormalizedStrings(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) {
return true;
}
partIndex = str.indexOf(part, partIndex + 1);
Expand All @@ -116,9 +140,9 @@ public static boolean simpleMatch(String pattern, String str) {
}
return str.regionMatches(0, pattern, 0, firstIndex)
&& (firstIndex == pattern.length() - 1 // only wildcard in pattern is at the end, so no need to look at the rest of the string
|| simpleMatch(pattern.substring(firstIndex), str.substring(firstIndex)));
}

|| simpleMatchWithNormalizedStrings(pattern.substring(firstIndex), str.substring(firstIndex)));
}
/**
* Match a String against the given patterns, supporting the following simple
* pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an
Expand Down
Loading

0 comments on commit 685a419

Please sign in to comment.