Skip to content

Commit

Permalink
Search - add range query support to wildcard field (elastic#57881)
Browse files Browse the repository at this point in the history
Add range query support to wildcard field

Closes elastic#57816
  • Loading branch information
markharwood committed Jun 11, 2020
1 parent 54d4f2a commit 70acd90
Show file tree
Hide file tree
Showing 2 changed files with 250 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,19 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.RegExp.Kind;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.geo.ShapeRelation;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.time.DateMathParser;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
Expand Down Expand Up @@ -70,6 +74,7 @@

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
Expand Down Expand Up @@ -613,6 +618,12 @@ static Query simplify(Query input) {
/**
 * Reports whether the supplied query is one of the two "match everything" query types
 * recognised here: {@code MatchAllDocsQuery} or {@code MatchAllButRequireVerificationQuery}.
 *
 * @param q the query to inspect
 * @return {@code true} if {@code q} is an instance of either type, {@code false} otherwise
 */
static boolean isMatchAll(Query q) {
    if (q instanceof MatchAllDocsQuery) {
        return true;
    }
    return q instanceof MatchAllButRequireVerificationQuery;
}

/**
 * Returns the first ngram token collected by {@code getNgramTokens} for the given fragment.
 * Tokens are gathered into an insertion-ordered set, so "first" means the first token added.
 *
 * @param fragment the text to break into ngram tokens
 * @return the first token in the collected set
 * @throws java.util.NoSuchElementException if no token was collected for the fragment
 */
protected String firstNgramToken(String fragment) {
    Set<String> orderedTokens = new LinkedHashSet<>();
    getNgramTokens(orderedTokens, fragment);
    Iterator<String> cursor = orderedTokens.iterator();
    return cursor.next();
}

protected void getNgramTokens(Set<String> tokens, String fragment) {
if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
Expand Down Expand Up @@ -678,6 +689,90 @@ private void addClause(String token, BooleanQuery.Builder bqBuilder, Occur occur
}
}

/**
 * Builds a range query for this field as a conjunction of two required clauses:
 * an ngram-based acceleration query and an {@code AutomatonQueryOnBinaryDv} driven by the
 * automaton that {@code TermRangeQuery.toAutomaton} produces for the requested bounds.
 * <p>
 * The acceleration query is chosen as follows:
 * <ul>
 *   <li>When both bounds are present and their lower-cased, line-end-wrapped forms share a
 *       leading sequence, every usable ngram of that shared prefix becomes a required clause.</li>
 *   <li>Otherwise an inclusive {@code TermRangeQuery} over the first ngram of each bound is used.</li>
 * </ul>
 *
 * @param lowerTerm    lower bound, or {@code null} for an open lower end
 * @param upperTerm    upper bound, or {@code null} for an open upper end
 * @param includeLower whether the lower bound itself matches (applied to the automaton only)
 * @param includeUpper whether the upper bound itself matches (applied to the automaton only)
 * @param relation     not used by this implementation
 * @param timeZone     not used by this implementation
 * @param parser       not used by this implementation
 * @param context      shard context, consulted for the expensive-queries setting
 * @return a boolean query requiring both the acceleration query and the automaton query
 * @throws ElasticsearchException if expensive queries are disallowed
 */
@Override
public Query rangeQuery(
    Object lowerTerm,
    Object upperTerm,
    boolean includeLower,
    boolean includeUpper,
    ShapeRelation relation,
    ZoneId timeZone,
    DateMathParser parser,
    QueryShardContext context
) {
    if (context.allowExpensiveQueries() == false) {
        throw new ElasticsearchException("[range] queries on [wildcard] fields cannot be executed when '" +
            ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false.");
    }

    final BytesRef lowerBytes;
    if (lowerTerm == null) {
        lowerBytes = null;
    } else {
        lowerBytes = BytesRefs.toBytesRef(lowerTerm);
    }
    final BytesRef upperBytes;
    if (upperTerm == null) {
        upperBytes = null;
    } else {
        upperBytes = BytesRefs.toBytesRef(upperTerm);
    }

    Query ngramFilter = null;
    if (lowerTerm != null && upperTerm != null) {
        // Long common prefixes e.g. "C:/Program Files/a.txt" to "C:/Program Files/z.txt"
        // can be accelerated by requiring all the common leading ngrams e.g. c:/, /pr, rog, gra etc
        String lowerText = addLineEndChars(toLowerCase(lowerBytes.utf8ToString()));
        String upperText = addLineEndChars(toLowerCase(upperBytes.utf8ToString()));

        // Walk both strings one code point at a time until they diverge.
        StringBuilder sharedPrefix = new StringBuilder();
        int limit = Math.min(lowerText.length(), upperText.length());
        int pos = 0;
        while (pos < limit) {
            int codePoint = lowerText.codePointAt(pos);
            if (codePoint != upperText.codePointAt(pos)) {
                break;
            }
            sharedPrefix.appendCodePoint(codePoint);
            pos += Character.charCount(codePoint);
        }

        if (sharedPrefix.length() > 0) {
            Set<String> prefixNgrams = new HashSet<>();
            getNgramTokens(prefixNgrams, sharedPrefix.toString());
            BooleanQuery.Builder ngramClauses = new BooleanQuery.Builder();
            for (String ngram : prefixNgrams) {
                int ngramLength = ngram.codePointCount(0, ngram.length());
                // Tokens shorter than two code points and the bare end marker are skipped.
                boolean usable = ngramLength >= 2 && ngram.equals(WildcardFieldMapper.TOKEN_END_STRING) == false;
                if (usable) {
                    Term term = new Term(name(), ngram);
                    Query clause;
                    if (ngramLength == NGRAM_SIZE) {
                        clause = new TermQuery(term);
                    } else {
                        // Token is not a full-size ngram: match it as a prefix of indexed ngrams.
                        PrefixQuery shortNgramQuery = new PrefixQuery(term);
                        shortNgramQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
                        clause = shortNgramQuery;
                    }
                    ngramClauses.add(new BooleanClause(clause, Occur.MUST));
                }
            }
            BooleanQuery sharedPrefixQuery = ngramClauses.build();
            if (sharedPrefixQuery.clauses().isEmpty() == false) {
                ngramFilter = sharedPrefixQuery;
            }
        }
    }

    if (ngramFilter == null) {
        // Fallback - if there is no common prefix sequence then we look for the range of ngrams that appear
        // at the start of the string e.g. given 100 to 999 we would search for ngrams in the range
        //    TOKEN_START_OR_END_CHAR + "10" to
        //    TOKEN_START_OR_END_CHAR + "99"
        BytesRef firstLowerNgram = null;
        if (lowerBytes != null) {
            firstLowerNgram = new BytesRef(firstNgramToken(addLineEndChars(toLowerCase(lowerBytes.utf8ToString()))));
        }
        BytesRef firstUpperNgram = null;
        if (upperBytes != null) {
            firstUpperNgram = new BytesRef(firstNgramToken(addLineEndChars(toLowerCase(upperBytes.utf8ToString()))));
        }
        ngramFilter = new TermRangeQuery(name(), firstLowerNgram, firstUpperNgram, true, true);
    }

    // The automaton is only built when the supplier is invoked.
    Supplier<Automaton> automatonSupplier =
        () -> TermRangeQuery.toAutomaton(lowerBytes, upperBytes, includeLower, includeUpper);
    AutomatonQueryOnBinaryDv automatonQuery =
        new AutomatonQueryOnBinaryDv(name(), lowerBytes + "-" + upperBytes, automatonSupplier);

    return new BooleanQuery.Builder()
        .add(ngramFilter, Occur.MUST)
        .add(automatonQuery, Occur.MUST)
        .build();
}

@Override
public Query fuzzyQuery(
Object value,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
Expand Down Expand Up @@ -214,7 +215,7 @@ public void testSearchResultsVersusKeywordField() throws IOException {
Query wildcardFieldQuery = null;
Query keywordFieldQuery = null;
String pattern = null;
switch (randomInt(3)) {
switch (randomInt(4)) {
case 0:
pattern = getRandomWildcardPattern();
wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
Expand Down Expand Up @@ -259,6 +260,14 @@ public void testSearchResultsVersusKeywordField() throws IOException {
keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
transpositions, MOCK_QSC);
break;
case 4:
TermRangeQuery trq = getRandomRange(values);
wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
trq.includesUpper(), null, null, null, MOCK_QSC);
keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
trq.includesUpper(), null, null, null, MOCK_QSC);
break;

}
TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE);
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE);
Expand Down Expand Up @@ -294,6 +303,76 @@ public void testSearchResultsVersusKeywordField() throws IOException {
dir.close();
}

/**
 * Indexes a single string value: populates a fresh Lucene {@code Document} and a fresh
 * {@code ParseContext.Document} through {@code addFields}, then passes both to the
 * three-argument {@code indexDoc} overload together with the writer.
 *
 * @param iw    the writer that receives the document
 * @param value the field value to index
 * @throws IOException declared for the delegated indexing calls
 */
private void indexDoc(RandomIndexWriter iw, String value) throws IOException {
    final ParseContext.Document parsedFields = new ParseContext.Document();
    final Document luceneDoc = new Document();
    addFields(parsedFields, luceneDoc, value);
    indexDoc(parsedFields, luceneDoc, iw);
}

/**
 * Indexes a small fixed set of values and checks that, for a list of hand-picked ranges
 * (with randomly chosen bound inclusivity), the wildcard field's range query returns
 * exactly the same documents as the keyword field's range query.
 */
public void testRangeQueryVersusKeywordField() throws IOException {
    Directory directory = newDirectory();
    IndexWriterConfig writerConfig = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
    writerConfig.setMergePolicy(newTieredMergePolicy(random()));
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, writerConfig);

    String[] indexedValues = {
        // Values for the acceleration strategy based on a long common prefix
        "C:\\Program Files\\a.txt",
        "C:\\Program Files\\n.txt",
        "C:\\Program Files\\z.txt",
        // Values for the acceleration strategy based on no common prefix
        "a.txt",
        "n.txt",
        "z.txt"
    };
    for (String indexedValue : indexedValues) {
        indexDoc(writer, indexedValue);
    }

    writer.forceMerge(1);
    DirectoryReader indexReader = writer.getReader();
    IndexSearcher indexSearcher = newSearcher(indexReader);
    writer.close();

    // Each entry is {lower, upper}; null means the range is open at that end.
    String[][] ranges = {
        { "C:\\Program Files\\a", "C:\\Program Files\\z" },
        { "C:\\Program Files\\a", "C:\\Program Files\\n" },
        { null, "C:\\Program Files\\z" },
        { "C:\\Program Files\\a", null },

        { "a.txt", "z.txt" },
        { "a.txt", "n.txt" },
        { null, "z.txt" },
        { "a.txt", null }
    };

    for (String[] range : ranges) {
        BytesRef lowerBound = null;
        if (range[0] != null) {
            lowerBound = new BytesRef(range[0]);
        }
        BytesRef upperBound = null;
        if (range[1] != null) {
            upperBound = new BytesRef(range[1]);
        }
        TermRangeQuery template = new TermRangeQuery(WILDCARD_FIELD_NAME, lowerBound, upperBound, randomBoolean(), randomBoolean());

        Query wildcardQuery = wildcardFieldType.fieldType().rangeQuery(
            template.getLowerTerm(), template.getUpperTerm(), template.includesLower(), template.includesUpper(),
            null, null, null, MOCK_QSC);
        Query keywordQuery = keywordFieldType.fieldType().rangeQuery(
            template.getLowerTerm(), template.getUpperTerm(), template.includesLower(), template.includesUpper(),
            null, null, null, MOCK_QSC);

        TopDocs keywordHits = indexSearcher.search(keywordQuery, 10, Sort.RELEVANCE);
        TopDocs wildcardHits = indexSearcher.search(wildcardQuery, 10, Sort.RELEVANCE);
        assertThat(wildcardHits.totalHits.value, equalTo(keywordHits.totalHits.value));

        // Every wildcard hit must cancel out exactly one keyword hit, leaving nothing behind.
        HashSet<Integer> unmatchedKeywordDocs = new HashSet<>();
        for (ScoreDoc keywordHit : keywordHits.scoreDocs) {
            unmatchedKeywordDocs.add(keywordHit.doc);
        }
        for (ScoreDoc wildcardHit : wildcardHits.scoreDocs) {
            assertTrue(unmatchedKeywordDocs.remove(wildcardHit.doc));
        }
        assertThat(unmatchedKeywordDocs.size(), equalTo(0));
    }

    indexReader.close();
    directory.close();
}


public void testRegexAcceleration() throws IOException, ParseException {
// All these expressions should rewrite to a match all with no verification step required at all
String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"};
Expand Down Expand Up @@ -485,6 +564,54 @@ public void testFuzzyAcceleration() throws IOException, ParseException {
}
}


/**
 * Test fixture describing one range-acceleration case: the range bounds plus the
 * space-separated ngram terms expected in the acceleration query. In {@code ngrams},
 * an underscore stands in for {@code WildcardFieldMapper.TOKEN_START_STRING}.
 */
static class RangeTest {
    final String lower;
    final String upper;
    final String ngrams;

    /**
     * @param lower  lower bound of the range
     * @param upper  upper bound of the range
     * @param ngrams space-separated expected ngram terms, or {@code null} for none
     */
    RangeTest(
        String lower,
        String upper,
        String ngrams
    ) {
        this.lower = lower;
        this.upper = upper;
        this.ngrams = ngrams;
    }

    /** Builds the wildcard field's range query for these bounds, with both bounds inclusive. */
    Query getRangeQuery() {
        return wildcardFieldType.fieldType().rangeQuery(lower, upper, true, true, null, null, null, MOCK_QSC);
    }

    /**
     * Builds the expected acceleration query: one required {@code TermQuery} per ngram listed
     * in {@code ngrams}. Returns an empty boolean query when {@code ngrams} is {@code null}.
     */
    Query getExpectedApproxQuery() throws ParseException {
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        if (ngrams != null) {
            for (String token : ngrams.split(" ")) {
                // Literal substitution: String.replace avoids treating either argument as a
                // regex or replacement pattern, which replaceAll would do.
                String term = token.replace("_", WildcardFieldMapper.TOKEN_START_STRING);
                bq.add(new TermQuery(new Term(WILDCARD_FIELD_NAME, term)), Occur.MUST);
            }
        }
        return bq.build();
    }
}

/**
 * Checks that range queries over bounds sharing a common prefix produce the expected
 * ngram acceleration query, using the {@code RangeTest} fixtures below.
 */
public void testRangeAcceleration() throws IOException, ParseException {
    RangeTest[] cases = new RangeTest[] {
        new RangeTest("c:/a.txt", "c:/z.txt", "_c: c:/"),
        new RangeTest("C:/ProgramFiles/a.txt", "C:/ProgramFiles/z.txt", "_c: :/p pro ogr ram mfi ile es/"),
    };
    for (int i = 0; i < cases.length; i++) {
        RangeTest rangeCase = cases[i];
        Query actualQuery = rangeCase.getRangeQuery();
        String label = rangeCase.lower + "-" + rangeCase.upper;
        Query expectedApproximation = rangeCase.getExpectedApproxQuery();
        testExpectedAccelerationQuery(label, actualQuery, expectedApproximation);
    }
}

void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException {

QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer());
Expand Down Expand Up @@ -530,6 +657,33 @@ private String getRandomFuzzyPattern(HashSet<String> values, int edits, int pref
}
return randomValue;
}

/**
 * Produces a random range anchored on one of the indexed values. The lower bound is the
 * chosen value itself; the upper bound is the same value with every {@code 'a'} inside a
 * randomly chosen slice (at most 10 chars long) replaced by {@code 'z'}. Bound inclusivity
 * is random.
 *
 * @param values the indexed document values to pick from
 * @return a {@code TermRangeQuery} on the wildcard field carrying the generated bounds
 */
private TermRangeQuery getRandomRange(HashSet<String> values) {
    // Pick one of the indexed document values to focus our queries on.
    String[] candidates = values.toArray(new String[0]);
    String lowerValue = candidates[randomIntBetween(0, values.size() - 1)];

    // Pick the slice of the string to change.
    int sliceStart = randomIntBetween(0, lowerValue.length() - 1);
    int sliceLength = randomIntBetween(1, Math.min(10, lowerValue.length() - sliceStart));
    int sliceEnd = sliceStart + sliceLength;

    // Head and tail are copied unchanged (either may be empty); only the slice is modified.
    String head = lowerValue.substring(0, sliceStart);
    String modifiedSlice = lowerValue.substring(sliceStart, sliceEnd).replace('a', 'z');
    String tail = lowerValue.substring(sliceEnd);
    String upperValue = head + modifiedSlice + tail;

    return new TermRangeQuery(WILDCARD_FIELD_NAME, new BytesRef(lowerValue), new BytesRef(upperValue),
        randomBoolean(), randomBoolean());
}


private String getRandomRegexPattern(HashSet<String> values) {
// Pick one of the indexed document values to focus our queries on.
Expand Down

0 comments on commit 70acd90

Please sign in to comment.