From 32730cfdc596072e2a1b55dd80698f26847a3c5f Mon Sep 17 00:00:00 2001
From: Alan Woodward
Date: Fri, 3 Jan 2020 09:55:53 +0000
Subject: [PATCH] Add fuzzy intervals source (#49762)

This intervals source will return terms that are similar to an input term, up
to an edit distance defined by fuzziness, similar to FuzzyQuery.

Closes #49595
---
 .../query-dsl/intervals-query.asciidoc        |  39 ++++-
 .../test/search/230_interval_query.yml        |  21 +++
 .../org/apache/lucene/queries/XIntervals.java |   8 +
 .../index/query/IntervalsSourceProvider.java  | 160 +++++++++++++++++-
 .../elasticsearch/search/SearchModule.java    |   2 +
 .../query/IntervalQueryBuilderTests.java      |  58 +++++++
 6 files changed, 286 insertions(+), 2 deletions(-)

diff --git a/docs/reference/query-dsl/intervals-query.asciidoc b/docs/reference/query-dsl/intervals-query.asciidoc
index 9f9280c80a484..7fc9d60b26397 100644
--- a/docs/reference/query-dsl/intervals-query.asciidoc
+++ b/docs/reference/query-dsl/intervals-query.asciidoc
@@ -73,6 +73,7 @@ Valid rules include:
 * <<intervals-match,`match`>>
 * <<intervals-prefix,`prefix`>>
 * <<intervals-wildcard,`wildcard`>>
+* <<intervals-fuzzy,`fuzzy`>>
 * <<intervals-all_of,`all_of`>>
 * <<intervals-any_of,`any_of`>>
 --
@@ -97,7 +98,7 @@ set to `0`, the terms must appear next to each other.
 --
 
 `ordered`::
-(Optional, boolean) 
+(Optional, boolean)
 If `true`, matching terms must appear in their specified order. Defaults to
 `false`.
 
@@ -177,6 +178,42 @@ The `pattern` is normalized using the search analyzer from this field, unless
 `analyzer` is specified separately.
 --
 
+[[intervals-fuzzy]]
+==== `fuzzy` rule parameters
+
+The `fuzzy` rule matches terms that are similar to the provided term, within an
+edit distance defined by <<fuzziness>>. If the fuzzy expansion matches more than
+128 terms, {es} returns an error.
+
+`term`::
+(Required, string) The term to match.
+
+`prefix_length`::
+(Optional, integer) Number of beginning characters left unchanged when creating
+expansions. Defaults to `0`.
+
+`transpositions`::
+(Optional, boolean) Indicates whether edits include transpositions of two
+adjacent characters (ab → ba). Defaults to `true`.
+
+`fuzziness`::
+(Optional, string) Maximum edit distance allowed for matching. See <<fuzziness>>
+for valid values and more information. Defaults to `auto`.
+
+`analyzer`::
+(Optional, string) <<analysis,analyzer>> used to normalize the `term`.
+Defaults to the top-level `<field>`'s analyzer.
+
+`use_field`::
++
+--
+(Optional, string) If specified, match intervals from this field rather than the
+top-level `<field>`.
+
+The `term` is normalized using the search analyzer from this field, unless
+`analyzer` is specified separately.
+--
+
 [[intervals-all_of]]
 ==== `all_of` rule parameters
 
diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search/230_interval_query.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search/230_interval_query.yml
index 82aa0883008a8..654ef2a2e173f 100644
--- a/rest-api-spec/src/main/resources/rest-api-spec/test/search/230_interval_query.yml
+++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search/230_interval_query.yml
@@ -424,3 +424,24 @@ setup:
                   pattern: out?ide
   - match: { hits.total.value: 3 }
 
+---
+"Test fuzzy match":
+  - skip:
+      version: " - 8.0.0"
+      reason: "TODO: change to 7.6 in backport"
+  - do:
+      search:
+        index: test
+        body:
+          query:
+            intervals:
+              text:
+                all_of:
+                  intervals:
+                    - fuzzy:
+                        term: cald
+                    - prefix:
+                        prefix: out
+  - match: { hits.total.value: 3 }
+
+
diff --git a/server/src/main/java/org/apache/lucene/queries/XIntervals.java b/server/src/main/java/org/apache/lucene/queries/XIntervals.java
index b389a29c21115..1d77094bd1427 100644
--- a/server/src/main/java/org/apache/lucene/queries/XIntervals.java
+++ b/server/src/main/java/org/apache/lucene/queries/XIntervals.java
@@ -67,6 +67,14 @@ public static IntervalsSource prefix(BytesRef prefix) {
         return new MultiTermIntervalsSource(ca, 128, prefix.utf8ToString());
     }
 
+    /**
+     * Wraps an already compiled automaton in a {@link MultiTermIntervalsSource}, using the
+     * same constant (128) as {@link #prefix(BytesRef)} and the given {@code label}.
+     */
+    public static IntervalsSource multiterm(CompiledAutomaton ca, String label) {
+        return new MultiTermIntervalsSource(ca, 128, label);
+    }
+
     static class MultiTermIntervalsSource extends IntervalsSource {
 
         private final CompiledAutomaton automaton;
diff --git a/server/src/main/java/org/elasticsearch/index/query/IntervalsSourceProvider.java b/server/src/main/java/org/elasticsearch/index/query/IntervalsSourceProvider.java
index 4918d7c7c7f3f..dbd8f339ca66f 100644
--- a/server/src/main/java/org/elasticsearch/index/query/IntervalsSourceProvider.java
+++ b/server/src/main/java/org/elasticsearch/index/query/IntervalsSourceProvider.java
@@ -20,12 +20,15 @@
 package org.elasticsearch.index.query;
 
 import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.XIntervals;
 import org.apache.lucene.queries.intervals.FilteredIntervalsSource;
 import org.apache.lucene.queries.intervals.IntervalIterator;
 import org.apache.lucene.queries.intervals.Intervals;
 import org.apache.lucene.queries.intervals.IntervalsSource;
+import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.ParsingException;
@@ -33,7 +36,9 @@
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
+import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.common.xcontent.ConstructingObjectParser;
+import org.elasticsearch.common.xcontent.ObjectParser;
 import org.elasticsearch.common.xcontent.ToXContentFragment;
 import org.elasticsearch.common.xcontent.ToXContentObject;
 import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -85,6 +90,8 @@ public static IntervalsSourceProvider fromXContent(XContentParser parser) throws
                 return Prefix.fromXContent(parser);
             case "wildcard":
                 return Wildcard.fromXContent(parser);
+            case "fuzzy":
+                return Fuzzy.fromXContent(parser);
         }
         throw new ParsingException(parser.getTokenLocation(),
-            "Unknown interval type [" + parser.currentName() + "], expecting one of [match, any_of, all_of, prefix, wildcard]");
+            "Unknown interval type [" + parser.currentName() + "], expecting one of [match, any_of, all_of, prefix, wildcard, fuzzy]");
@@ -691,6 +698,157 @@ String getUseField() {
         }
     }
 
+    /**
+     * Provider for the {@code fuzzy} rule: builds an intervals source from the automaton of a
+     * {@link FuzzyQuery} on the normalized {@code term}.
+     */
+    public static class Fuzzy extends IntervalsSourceProvider {
+
+        public static final String NAME = "fuzzy";
+
+        private final String term;
+        private final int prefixLength;
+        private final boolean transpositions;
+        private final Fuzziness fuzziness;
+        private final String analyzer;
+        private final String useField;
+
+        public Fuzzy(String term, int prefixLength, boolean transpositions, Fuzziness fuzziness, String analyzer, String useField) {
+            this.term = term;
+            this.prefixLength = prefixLength;
+            this.transpositions = transpositions;
+            this.fuzziness = fuzziness;
+            this.analyzer = analyzer;
+            this.useField = useField;
+        }
+
+        public Fuzzy(StreamInput in) throws IOException {
+            this.term = in.readString();
+            this.prefixLength = in.readVInt();
+            this.transpositions = in.readBoolean();
+            this.fuzziness = new Fuzziness(in);
+            this.analyzer = in.readOptionalString();
+            this.useField = in.readOptionalString();
+        }
+
+        /**
+         * Normalizes {@code term}, compiles the matching {@link FuzzyQuery} automaton and wraps it
+         * via {@link XIntervals#multiterm}. With {@code use_field} set, that field's type replaces
+         * {@code fieldType} and the source is pinned to it through {@link Intervals#fixField}.
+         */
+        @Override
+        public IntervalsSource getSource(QueryShardContext context, MappedFieldType fieldType) {
+            NamedAnalyzer analyzer = fieldType.searchAnalyzer();
+            if (this.analyzer != null) {
+                analyzer = context.getMapperService().getIndexAnalyzers().get(this.analyzer);
+            }
+            IntervalsSource source;
+            if (useField != null) {
+                fieldType = context.fieldMapper(useField);
+                assert fieldType != null;
+                if (this.analyzer == null) {
+                    analyzer = fieldType.searchAnalyzer();
+                }
+            }
+            // fieldType is the use_field type at this point when use_field is set, so one check suffices
+            checkPositions(fieldType);
+            BytesRef normalizedTerm = analyzer.normalize(fieldType.name(), term);
+            FuzzyQuery fq = new FuzzyQuery(new Term(fieldType.name(), normalizedTerm),
+                fuzziness.asDistance(term), prefixLength, 128, transpositions);
+            CompiledAutomaton ca = new CompiledAutomaton(fq.toAutomaton());
+            source = XIntervals.multiterm(ca, term);
+            if (useField != null) {
+                source = Intervals.fixField(useField, source);
+            }
+            return source;
+        }
+
+        private void checkPositions(MappedFieldType type) {
+            if (type.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
+                throw new IllegalArgumentException("Cannot create intervals over field [" + type.name() + "] with no positions indexed");
+            }
+        }
+
+        @Override
+        public void extractFields(Set<String> fields) {
+            if (useField != null) {
+                fields.add(useField);
+            }
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+            Fuzzy fuzzy = (Fuzzy) o;
+            return prefixLength == fuzzy.prefixLength &&
+                transpositions == fuzzy.transpositions &&
+                Objects.equals(term, fuzzy.term) &&
+                Objects.equals(fuzziness, fuzzy.fuzziness) &&
+                Objects.equals(analyzer, fuzzy.analyzer) &&
+                Objects.equals(useField, fuzzy.useField);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(term, prefixLength, transpositions, fuzziness, analyzer, useField);
+        }
+
+        @Override
+        public String getWriteableName() {
+            return NAME;
+        }
+
+        @Override
+        public void writeTo(StreamOutput out) throws IOException {
+            out.writeString(term);
+            out.writeVInt(prefixLength);
+            out.writeBoolean(transpositions);
+            fuzziness.writeTo(out);
+            out.writeOptionalString(analyzer);
+            out.writeOptionalString(useField);
+        }
+
+        @Override
+        public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+            builder.startObject(NAME);
+            builder.field("term", term);
+            builder.field("prefix_length", prefixLength);
+            builder.field("transpositions", transpositions);
+            fuzziness.toXContent(builder, params);
+            if (analyzer != null) {
+                builder.field("analyzer", analyzer);
+            }
+            if (useField != null) {
+                builder.field("use_field", useField);
+            }
+            builder.endObject();
+            return builder;
+        }
+
+        private static final ConstructingObjectParser<Fuzzy, Void> PARSER = new ConstructingObjectParser<>(NAME, args -> {
+            String term = (String) args[0];
+            int prefixLength = (args[1] == null) ? FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH : (int) args[1];
+            boolean transpositions = (args[2] == null) ? FuzzyQueryBuilder.DEFAULT_TRANSPOSITIONS : (boolean) args[2];
+            Fuzziness fuzziness = (args[3] == null) ? FuzzyQueryBuilder.DEFAULT_FUZZINESS : (Fuzziness) args[3];
+            String analyzer = (String) args[4];
+            String useField = (String) args[5];
+            return new Fuzzy(term, prefixLength, transpositions, fuzziness, analyzer, useField);
+        });
+        static {
+            PARSER.declareString(constructorArg(), new ParseField("term"));
+            PARSER.declareInt(optionalConstructorArg(), new ParseField("prefix_length"));
+            PARSER.declareBoolean(optionalConstructorArg(), new ParseField("transpositions"));
+            PARSER.declareField(optionalConstructorArg(), (p, c) -> Fuzziness.parse(p), Fuzziness.FIELD, ObjectParser.ValueType.VALUE);
+            PARSER.declareString(optionalConstructorArg(), new ParseField("analyzer"));
+            PARSER.declareString(optionalConstructorArg(), new ParseField("use_field"));
+        }
+
+        public static Fuzzy fromXContent(XContentParser parser) throws IOException {
+            return PARSER.parse(parser, null);
+        }
+    }
+
     static class ScriptFilterSource extends FilteredIntervalsSource {
 
         final IntervalFilterScript script;
diff --git a/server/src/main/java/org/elasticsearch/search/SearchModule.java b/server/src/main/java/org/elasticsearch/search/SearchModule.java
index cdfd28760869c..ca64b749a5809 100644
--- a/server/src/main/java/org/elasticsearch/search/SearchModule.java
+++ b/server/src/main/java/org/elasticsearch/search/SearchModule.java
@@ -804,6 +804,8 @@ private void registerIntervalsSourceProviders() {
             IntervalsSourceProvider.Prefix.NAME, IntervalsSourceProvider.Prefix::new));
         namedWriteables.add(new NamedWriteableRegistry.Entry(IntervalsSourceProvider.class,
             IntervalsSourceProvider.Wildcard.NAME, IntervalsSourceProvider.Wildcard::new));
+        namedWriteables.add(new NamedWriteableRegistry.Entry(IntervalsSourceProvider.class,
+            IntervalsSourceProvider.Fuzzy.NAME, IntervalsSourceProvider.Fuzzy::new));
     }
 
     private void registerQuery(QuerySpec spec) {
diff --git a/server/src/test/java/org/elasticsearch/index/query/IntervalQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/IntervalQueryBuilderTests.java
index 763d10ddf30e8..ed7caeb0473de 100644
--- a/server/src/test/java/org/elasticsearch/index/query/IntervalQueryBuilderTests.java
+++ b/server/src/test/java/org/elasticsearch/index/query/IntervalQueryBuilderTests.java
@@ -19,17 +19,22 @@
 
 package org.elasticsearch.index.query;
 
+import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.XIntervals;
 import org.apache.lucene.queries.intervals.IntervalQuery;
 import org.apache.lucene.queries.intervals.Intervals;
+import org.apache.lucene.queries.intervals.IntervalsSource;
 import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.elasticsearch.common.ParsingException;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.compress.CompressedXContent;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.index.mapper.MapperService;
@@ -529,4 +534,57 @@ public void testWildcard() throws IOException {
         assertEquals(expected, builder.toQuery(createShardContext()));
     }
 
+    private static IntervalsSource buildFuzzySource(String term, String label, int prefixLength, boolean transpositions, int editDistance) {
+        FuzzyQuery fq = new FuzzyQuery(new Term("field", term), editDistance, prefixLength, 128, transpositions);
+        return XIntervals.multiterm(new CompiledAutomaton(fq.toAutomaton()), label);
+    }
+
+    public void testFuzzy() throws IOException {
+
+        String json = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
+            "\"fuzzy\" : { \"term\" : \"Term\" } } } }";
+        IntervalQueryBuilder builder = (IntervalQueryBuilder) parseQuery(json);
+
+        Query expected = new IntervalQuery(STRING_FIELD_NAME,
+            buildFuzzySource("term", "Term", FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH, true, Fuzziness.AUTO.asDistance("term")));
+        assertEquals(expected, builder.toQuery(createShardContext()));
+
+        String json_with_prefix = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
+            "\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2 } } } }";
+        builder = (IntervalQueryBuilder) parseQuery(json_with_prefix);
+        expected = new IntervalQuery(STRING_FIELD_NAME,
+            buildFuzzySource("term", "Term", 2, true, Fuzziness.AUTO.asDistance("term")));
+        assertEquals(expected, builder.toQuery(createShardContext()));
+
+        String json_with_fuzziness = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
+            "\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"fuzziness\" : \"1\" } } } }";
+        builder = (IntervalQueryBuilder) parseQuery(json_with_fuzziness);
+        expected = new IntervalQuery(STRING_FIELD_NAME,
+            buildFuzzySource("term", "Term", 2, true, Fuzziness.ONE.asDistance("term")));
+        assertEquals(expected, builder.toQuery(createShardContext()));
+
+        String json_no_transpositions = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
+            "\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"transpositions\" : false } } } }";
+        builder = (IntervalQueryBuilder) parseQuery(json_no_transpositions);
+        expected = new IntervalQuery(STRING_FIELD_NAME,
+            buildFuzzySource("term", "Term", 2, false, Fuzziness.AUTO.asDistance("term")));
+        assertEquals(expected, builder.toQuery(createShardContext()));
+
+        String json_with_analyzer = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
+            "\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"analyzer\" : \"keyword\" } } } }";
+        builder = (IntervalQueryBuilder) parseQuery(json_with_analyzer);
+        expected = new IntervalQuery(STRING_FIELD_NAME,
+            buildFuzzySource("Term", "Term", 2, true, Fuzziness.AUTO.asDistance("term")));
+        assertEquals(expected, builder.toQuery(createShardContext()));
+
+        String json_with_fixfield = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
+            "\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"fuzziness\" : \"1\", " +
+            "\"use_field\" : \"" + MASKED_FIELD + "\" } } } }";
+        builder = (IntervalQueryBuilder) parseQuery(json_with_fixfield);
+        expected = new IntervalQuery(STRING_FIELD_NAME, Intervals.fixField(MASKED_FIELD,
+            buildFuzzySource("term", "Term", 2, true, Fuzziness.ONE.asDistance("term"))));
+        assertEquals(expected, builder.toQuery(createShardContext()));
+
+    }
+
 }