Skip to content

Commit

Permalink
Fix synonym phrase query expansion for cross_fields parsing (#28045)
Browse files Browse the repository at this point in the history
* Fix synonym phrase query expansion for cross_fields parsing

The `cross_fields` mode of the query parser ignores phrase queries generated by multi-word synonyms.
In such cases only the first field of each analyzer group is kept. This change fixes the issue
by expanding the phrase query of each analyzer group to **all** of its fields using a disjunction max query.
  • Loading branch information
jimczi committed Jan 15, 2018
1 parent 9334fa0 commit b93996f
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.MultiTermQuery;
Expand Down Expand Up @@ -351,7 +352,12 @@ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws
throw exc;
}
}
return super.analyzePhrase(field, stream, slop);
Query query = super.analyzePhrase(field, stream, slop);
if (query instanceof PhraseQuery) {
// synonyms that expand to multiple terms can return a phrase query.
return blendPhraseQuery((PhraseQuery) query, mapper);
}
return query;
}

/**
Expand Down Expand Up @@ -476,6 +482,14 @@ private Query boolToExtendedCommonTermsQuery(BooleanQuery bq, Occur highFreqOccu
}
}

/**
 * Hook invoked from {@code analyzePhrase} whenever the analysis produces a {@link PhraseQuery},
 * which can happen when a synonym expands to multiple terms.
 * This default implementation hands the query back untouched; a subclass may override it
 * to spread the phrase over several fields.
 *
 * @param query the phrase query produced by the analysis
 * @param fieldType the field type supplied by the caller (unused by this default implementation)
 * @return the query to use in place of {@code query}
 */
protected Query blendPhraseQuery(PhraseQuery query, MappedFieldType fieldType) {
    return query;
}

/**
 * Combines the given terms into a single {@link SynonymQuery}.
 *
 * @param terms the terms to combine
 * @param fieldType the field type supplied by the caller (unused by this default implementation)
 * @return a synonym query over {@code terms}
 */
protected Query blendTermsQuery(Term[] terms, MappedFieldType fieldType) {
    final Query synonyms = new SynonymQuery(terms);
    return synonyms;
}
Expand All @@ -498,5 +512,4 @@ protected Query blendTermQuery(Term term, MappedFieldType fieldType) {
}
return termQuery(fieldType, term.bytes(), lenient);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.lucene.search.Queries;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.query.AbstractQueryBuilder;
Expand Down Expand Up @@ -141,6 +141,10 @@ public Query blendTerms(Term[] terms, MappedFieldType fieldType) {
/**
 * Builds the query for a single value by delegating to the enclosing
 * {@code MultiMatchQuery}'s {@code termQuery}, forwarding the {@code lenient} flag.
 */
public Query termQuery(MappedFieldType fieldType, BytesRef value) {
    final Query delegated = MultiMatchQuery.this.termQuery(fieldType, value, lenient);
    return delegated;
}

/**
 * Default phrase handling: defers to the {@code blendPhraseQuery} implementation
 * of the enclosing {@code MultiMatchQuery}'s superclass.
 */
public Query blendPhrase(PhraseQuery query, MappedFieldType type) {
    final Query delegated = MultiMatchQuery.super.blendPhraseQuery(query, type);
    return delegated;
}
}

final class CrossFieldsQueryBuilder extends QueryBuilder {
Expand Down Expand Up @@ -224,6 +228,17 @@ public Query termQuery(MappedFieldType fieldType, BytesRef value) {
*/
return blendTerm(new Term(fieldType.name(), value.utf8ToString()), fieldType);
}

/**
 * Expands the phrase query to every blended field when blended fields are available;
 * otherwise falls back to the parent builder's behavior.
 */
@Override
public Query blendPhrase(PhraseQuery query, MappedFieldType type) {
    if (blendedFields != null) {
        // Phrase queries are built for multi-word synonyms when
        // QueryBuilder#autoGenerateSynonymsPhraseQuery is true.
        return MultiMatchQuery.blendPhrase(query, blendedFields);
    }
    return super.blendPhrase(query, type);
}
}

static Query blendTerm(QueryShardContext context, BytesRef value, Float commonTermsCutoff, float tieBreaker,
Expand Down Expand Up @@ -293,6 +308,28 @@ static Query blendTerms(QueryShardContext context, BytesRef[] values, Float comm
}
}

/**
 * Expand a {@link PhraseQuery} to multiple fields that share the same analyzer.
 * Returns a {@link DisjunctionMaxQuery} with a disjunction for each expanded field.
 *
 * <p>Each per-field phrase keeps the terms' bytes, their positions and the slop of the
 * original query; only the field name is replaced. A field whose boost differs from
 * {@link AbstractQueryBuilder#DEFAULT_BOOST} has its phrase wrapped in a {@link BoostQuery}.
 *
 * @param query the phrase query to expand
 * @param fields the fields (with their boosts) to expand the phrase to
 * @return a disjunction max query (tie breaker {@code 0.0f}) over the per-field phrase queries
 */
static Query blendPhrase(PhraseQuery query, FieldAndFieldType... fields) {
    // These do not depend on the target field, so read them once.
    final Term[] terms = query.getTerms();
    final int[] positions = query.getPositions();
    final int slop = query.getSlop();

    List<Query> disjunctions = new ArrayList<>(fields.length);
    for (FieldAndFieldType field : fields) {
        PhraseQuery.Builder builder = new PhraseQuery.Builder();
        // Carry over the slop, otherwise a sloppy phrase would silently become an exact one.
        builder.setSlop(slop);
        for (int i = 0; i < terms.length; i++) {
            builder.add(new Term(field.fieldType.name(), terms[i].bytes()), positions[i]);
        }
        Query q = builder.build();
        if (field.boost != AbstractQueryBuilder.DEFAULT_BOOST) {
            q = new BoostQuery(q, field.boost);
        }
        disjunctions.add(q);
    }
    return new DisjunctionMaxQuery(disjunctions, 0.0f);
}

@Override
protected Query blendTermQuery(Term term, MappedFieldType fieldType) {
if (queryBuilder == null) {
Expand All @@ -309,6 +346,14 @@ protected Query blendTermsQuery(Term[] terms, MappedFieldType fieldType) {
return queryBuilder.blendTerms(terms, fieldType);
}

/**
 * Routes phrase blending through the active {@code queryBuilder} when one is set,
 * and falls back to the superclass behavior otherwise.
 */
@Override
protected Query blendPhraseQuery(PhraseQuery query, MappedFieldType fieldType) {
    if (queryBuilder != null) {
        return queryBuilder.blendPhrase(query, fieldType);
    }
    return super.blendPhraseQuery(query, fieldType);
}

static final class FieldAndFieldType {
final MappedFieldType fieldType;
final float boost;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,16 @@

package org.elasticsearch.index.search;

import org.apache.lucene.analysis.MockSynonymAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.BlendedTermQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
Expand All @@ -43,7 +47,11 @@
import org.junit.Before;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
import static org.hamcrest.Matchers.equalTo;
Expand Down Expand Up @@ -215,4 +223,45 @@ public void testMultiMatchCrossFieldsWithSynonyms() throws IOException {
assertThat(parsedQuery, equalTo(expectedQuery));

}

/**
 * Parses "guinea pig" in cross_fields mode over two name fields with a synonym analyzer and
 * checks that the result contains a dismax of per-field "guinea pig" phrase queries next to
 * the blended "cavy" term query.
 */
public void testMultiMatchCrossFieldsWithSynonymsPhrase() throws IOException {
    QueryShardContext shardContext = indexService.newQueryShardContext(
        randomInt(20), null, () -> { throw new UnsupportedOperationException(); }, null);
    MultiMatchQuery multiMatch = new MultiMatchQuery(shardContext);
    multiMatch.setAnalyzer(new MockSynonymAnalyzer());

    Map<String, Float> fieldBoosts = new HashMap<>();
    fieldBoosts.put("name.first", 1.0f);
    fieldBoosts.put("name.last", 1.0f);
    Query actual = multiMatch.parse(MultiMatchQueryBuilder.Type.CROSS_FIELDS, fieldBoosts, "guinea pig", null);

    // Blended single-term part: "cavy" on both fields, each with boost 1.
    Term[] synonymTerms = new Term[] {
        new Term("name.first", "cavy"),
        new Term("name.last", "cavy")
    };
    float[] synonymBoosts = new float[] { 1.0f, 1.0f };

    // Phrase part: one "guinea pig" phrase per field, in field order.
    List<Query> perFieldPhrases = new ArrayList<>();
    for (String fieldName : Arrays.asList("name.first", "name.last")) {
        PhraseQuery.Builder phrase = new PhraseQuery.Builder();
        phrase.add(new Term(fieldName, "guinea"));
        phrase.add(new Term(fieldName, "pig"));
        perFieldPhrases.add(phrase.build());
    }

    BooleanQuery synonymAlternatives = new BooleanQuery.Builder()
        .add(new DisjunctionMaxQuery(perFieldPhrases, 0.0f), BooleanClause.Occur.SHOULD)
        .add(BlendedTermQuery.dismaxBlendedQuery(synonymTerms, synonymBoosts, 1.0f), BooleanClause.Occur.SHOULD)
        .build();
    BooleanQuery expected = new BooleanQuery.Builder()
        .add(synonymAlternatives, BooleanClause.Occur.SHOULD)
        .build();
    assertEquals(expected, actual);
}
}

0 comments on commit b93996f

Please sign in to comment.