From 74bbc02f71ec026eb4cee1c29dcad0bad1c664a6 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Thu, 19 Apr 2018 02:38:13 +0100 Subject: [PATCH 01/36] [SOLR-12238] Synonym Queries boost by payload + tests --- .../org/apache/lucene/util/QueryBuilder.java | 229 ++++++++++-------- .../graph/GraphTokenStreamFiniteStrings.java | 13 +- .../solr/parser/SolrQueryParserBase.java | 219 +++++++++++++++-- .../solr/collection1/conf/schema12.xml | 54 +++++ .../solr/collection1/conf/synonyms.txt | 8 +- .../solr/search/TestSolrQueryParser.java | 109 +++++++++ ...field-type-definitions-and-properties.adoc | 2 +- 7 files changed, 513 insertions(+), 121 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index 2cb066bd2085..a28af67ced71 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; @@ -54,7 +55,7 @@ * *

* This can also be used as a subclass for query parsers to make it easier - * to interact with the analysis chain. Factory methods such as {@code newTermQuery} + * to interact with the analysis chain. Factory methods such as {@code newTermQuery} * are provided so that the generated queries can be customized. */ public class QueryBuilder { @@ -67,8 +68,8 @@ public class QueryBuilder { public QueryBuilder(Analyzer analyzer) { this.analyzer = analyzer; } - - /** + + /** * Creates a boolean query from the query text. *

* This is equivalent to {@code createBooleanQuery(field, queryText, Occur.SHOULD)} @@ -80,14 +81,14 @@ public QueryBuilder(Analyzer analyzer) { public Query createBooleanQuery(String field, String queryText) { return createBooleanQuery(field, queryText, BooleanClause.Occur.SHOULD); } - - /** + + /** * Creates a boolean query from the query text. *

* @param field field name * @param queryText text to be passed to the analyzer * @param operator operator used for clauses between analyzer tokens. - * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis + * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis * of {@code queryText} */ public Query createBooleanQuery(String field, String queryText, BooleanClause.Occur operator) { @@ -96,8 +97,8 @@ public Query createBooleanQuery(String field, String queryText, BooleanClause.Oc } return createFieldQuery(analyzer, operator, field, queryText, false, 0); } - - /** + + /** * Creates a phrase query from the query text. *

* This is equivalent to {@code createPhraseQuery(field, queryText, 0)} @@ -109,8 +110,8 @@ public Query createBooleanQuery(String field, String queryText, BooleanClause.Oc public Query createPhraseQuery(String field, String queryText) { return createPhraseQuery(field, queryText, 0); } - - /** + + /** * Creates a phrase query from the query text. *

* @param field field name @@ -122,26 +123,26 @@ public Query createPhraseQuery(String field, String queryText) { public Query createPhraseQuery(String field, String queryText, int phraseSlop) { return createFieldQuery(analyzer, BooleanClause.Occur.MUST, field, queryText, true, phraseSlop); } - - /** + + /** * Creates a minimum-should-match query from the query text. *

* @param field field name * @param queryText text to be passed to the analyzer - * @param fraction of query terms {@code [0..1]} that should match - * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis + * @param fraction of query terms {@code [0..1]} that should match + * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis * of {@code queryText} */ public Query createMinShouldMatchQuery(String field, String queryText, float fraction) { if (Float.isNaN(fraction) || fraction < 0 || fraction > 1) { throw new IllegalArgumentException("fraction should be >= 0 and <= 1"); } - + // TODO: weird that BQ equals/rewrite/scorer doesn't handle this? if (fraction == 1) { return createBooleanQuery(field, queryText, BooleanClause.Occur.MUST); } - + Query query = createFieldQuery(analyzer, BooleanClause.Occur.SHOULD, field, queryText, false, 0); if (query instanceof BooleanQuery) { query = addMinShouldMatchToBoolean((BooleanQuery) query, fraction); @@ -162,21 +163,21 @@ private BooleanQuery addMinShouldMatchToBoolean(BooleanQuery query, float fracti return builder.build(); } - /** - * Returns the analyzer. + /** + * Returns the analyzer. * @see #setAnalyzer(Analyzer) */ public Analyzer getAnalyzer() { return analyzer; } - - /** + + /** * Sets the analyzer used to tokenize text. */ public void setAnalyzer(Analyzer analyzer) { this.analyzer = analyzer; } - + /** * Returns true if position increments are enabled. * @see #setEnablePositionIncrements(boolean) @@ -184,7 +185,7 @@ public void setAnalyzer(Analyzer analyzer) { public boolean getEnablePositionIncrements() { return enablePositionIncrements; } - + /** * Set to true to enable position increments in result query. *

@@ -272,18 +273,18 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato // Build an appropriate query based on the analysis chain. try (CachingTokenFilter stream = new CachingTokenFilter(source)) { - + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class); PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class); if (termAtt == null) { - return null; + return null; } - + // phase 1: read through the stream and assess the situation: // counting the number of tokens/positions and marking if we have any synonyms. - + int numTokens = 0; int positionCount = 0; boolean hasSynonyms = false; @@ -304,10 +305,10 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato isGraph = true; } } - + // phase 2: based on token count, presence of synonyms, and options // formulate a single term, boolean, or phrase. - + if (numTokens == 0) { return null; } else if (numTokens == 1) { @@ -348,7 +349,7 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato * Creates a span query from the tokenstream. In the case of a single token, a simple SpanTermQuery is * returned. When multiple tokens, an ordered SpanNearQuery with slop of 0 is returned. 
*/ - protected final SpanQuery createSpanQuery(TokenStream in, String field) throws IOException { + protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOException { TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class); if (termAtt == null) { return null; @@ -368,80 +369,80 @@ protected final SpanQuery createSpanQuery(TokenStream in, String field) throws I } } - /** - * Creates simple term query from the cached tokenstream contents + /** + * Creates simple term query from the cached tokenstream contents */ protected Query analyzeTerm(String field, TokenStream stream) throws IOException { TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); - + stream.reset(); if (!stream.incrementToken()) { throw new AssertionError(); } - + return newTermQuery(new Term(field, termAtt.getBytesRef())); } - - /** - * Creates simple boolean query from the cached tokenstream contents + + /** + * Creates simple boolean query from the cached tokenstream contents */ protected Query analyzeBoolean(String field, TokenStream stream) throws IOException { TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); - + stream.reset(); List terms = new ArrayList<>(); while (stream.incrementToken()) { terms.add(new Term(field, termAtt.getBytesRef())); } - - return newSynonymQuery(terms.toArray(new Term[terms.size()])); + + return newSynonymQuery(terms.toArray(new Term[terms.size()]),stream); } - protected void add(BooleanQuery.Builder q, List current, BooleanClause.Occur operator) { + protected void add(BooleanQuery.Builder q, List current, TokenStream sourceTokenStream, BooleanClause.Occur operator) { if (current.isEmpty()) { return; } if (current.size() == 1) { q.add(newTermQuery(current.get(0)), operator); } else { - q.add(newSynonymQuery(current.toArray(new Term[current.size()])), operator); + q.add(newSynonymQuery(current.toArray(new Term[current.size()]), sourceTokenStream), operator); 
} } - /** - * Creates complex boolean query from the cached tokenstream contents + /** + * Creates complex boolean query from the cached tokenstream contents */ protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException { BooleanQuery.Builder q = newBooleanQuery(); List currentQuery = new ArrayList<>(); - + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); - + stream.reset(); while (stream.incrementToken()) { if (posIncrAtt.getPositionIncrement() != 0) { - add(q, currentQuery, operator); + add(q, currentQuery, stream, operator); currentQuery.clear(); } currentQuery.add(new Term(field, termAtt.getBytesRef())); } - add(q, currentQuery, operator); - + add(q, currentQuery, stream, operator); + return q.build(); } - - /** - * Creates simple phrase query from the cached tokenstream contents + + /** + * Creates simple phrase query from the cached tokenstream contents */ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException { PhraseQuery.Builder builder = new PhraseQuery.Builder(); builder.setSlop(slop); - + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); - int position = -1; - + int position = -1; + stream.reset(); while (stream.incrementToken()) { if (enablePositionIncrements) { @@ -454,24 +455,24 @@ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws return builder.build(); } - - /** - * Creates complex phrase query from the cached tokenstream contents + + /** + * Creates complex phrase query from the cached tokenstream contents */ protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) throws IOException { MultiPhraseQuery.Builder mpqb = 
newMultiPhraseQueryBuilder(); mpqb.setSlop(slop); - + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); - int position = -1; - + int position = -1; + List multiTerms = new ArrayList<>(); stream.reset(); while (stream.incrementToken()) { int positionIncrement = posIncrAtt.getPositionIncrement(); - + if (positionIncrement > 0 && multiTerms.size() > 0) { if (enablePositionIncrements) { mpqb.add(multiTerms.toArray(new Term[0]), position); @@ -483,7 +484,7 @@ protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) t position += positionIncrement; multiTerms.add(new Term(field, termAtt.getBytesRef())); } - + if (enablePositionIncrements) { mpqb.add(multiTerms.toArray(new Term[0]), position); } else { @@ -509,38 +510,69 @@ protected Query analyzeGraphBoolean(String field, TokenStream source, BooleanCla end = articulationPoints[i]; } lastState = end; - final Query queryPos; + final Query queryClause; if (graph.hasSidePath(start)) { - final Iterator it = graph.getFiniteStrings(start, end); - Iterator queries = new Iterator() { + final Iterator sidePaths = graph.getFiniteStrings(start, end); + Iterator sidePathsQueries = new Iterator() { @Override public boolean hasNext() { - return it.hasNext(); + return sidePaths.hasNext(); } @Override public Query next() { - TokenStream ts = it.next(); - return createFieldQuery(ts, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0); + TokenStream sidePath = sidePaths.next(); + return createFieldQuery(sidePath, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0); } }; - queryPos = newGraphSynonymQuery(queries); + final Iterator sidePathsForPayloads= graph.getFiniteStrings(start, end); + Iterator sidePathsPayloads = new Iterator() { + @Override + public boolean hasNext() { + return sidePathsForPayloads.hasNext(); + } + + 
@Override + public BytesRef[] next() { + TokenStream sidePath = sidePathsForPayloads.next(); + return getPayloadsFromStream(sidePath); + } + }; + queryClause = newGraphSynonymQuery(sidePathsQueries, sidePathsPayloads); } else { Term[] terms = graph.getTerms(field, start); assert terms.length > 0; if (terms.length == 1) { - queryPos = newTermQuery(terms[0]); + queryClause = newTermQuery(terms[0]); } else { - queryPos = newSynonymQuery(terms); + queryClause = newSynonymQuery(terms, source); } } - if (queryPos != null) { - builder.add(queryPos, operator); + if (queryClause != null) { + builder.add(queryClause, operator); } } return builder.build(); } + protected BytesRef[] getPayloadsFromStream(TokenStream source) { + try (CachingTokenFilter stream = new CachingTokenFilter(source)) { + PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class); + stream.reset(); + List payloads = new ArrayList<>(); + while (stream.incrementToken()) { + if (payloadAtt != null) { + payloads.add(payloadAtt.getPayload()); + } + } + stream.end(); + stream.close(); + return payloads.toArray(new BytesRef[payloads.size()]); + } catch (IOException e) { + throw new RuntimeException("Error analyzing query text", e); + } + } + /** * Creates a span near (phrase) query from a graph token stream. The articulation points of the graph are visited in * order and the queries created at each point are merged in the returned near query. 
@@ -559,42 +591,33 @@ protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phr end = articulationPoints[i]; } lastState = end; - final SpanQuery queryPos; + final SpanQuery queryClause; if (graph.hasSidePath(start)) { - List queries = new ArrayList<>(); - Iterator it = graph.getFiniteStrings(start, end); - while (it.hasNext()) { - TokenStream ts = it.next(); - SpanQuery q = createSpanQuery(ts, field); - if (q != null) { - queries.add(q); - } - } - if (queries.size() > 0) { - queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0])); + Iterator sidePaths = graph.getFiniteStrings(start, end); + List sidePathQueries = newGraphSpanQueries(field, sidePaths); + if (sidePathQueries.size() > 0) { + queryClause = new SpanOrQuery(sidePathQueries.toArray(new SpanQuery[0])); } else { - queryPos = null; + queryClause = null; } } else { Term[] terms = graph.getTerms(field, start); assert terms.length > 0; if (terms.length == 1) { - queryPos = new SpanTermQuery(terms[0]); + queryClause = new SpanTermQuery(terms[0]); } else { SpanTermQuery[] orClauses = new SpanTermQuery[terms.length]; for (int idx = 0; idx < terms.length; idx++) { orClauses[idx] = new SpanTermQuery(terms[idx]); } - queryPos = new SpanOrQuery(orClauses); + queryClause = new SpanOrQuery(orClauses); } } - - if (queryPos != null) { - clauses.add(queryPos); + if (queryClause != null) { + clauses.add(queryClause); } } - if (clauses.isEmpty()) { return null; } else if (clauses.size() == 1) { @@ -604,6 +627,18 @@ protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phr } } + protected List newGraphSpanQueries(String field, Iterator sidePaths) throws IOException { + List queries = new ArrayList<>(); + while (sidePaths.hasNext()) { + TokenStream sidePath = sidePaths.next(); + SpanQuery sidePathQuery = createSpanQuery(sidePath, field); + if (sidePathQuery != null) { + queries.add(sidePathQuery); + } + } + return queries; + } + /** * Builds a new BooleanQuery instance. *

@@ -613,14 +648,14 @@ protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phr protected BooleanQuery.Builder newBooleanQuery() { return new BooleanQuery.Builder(); } - + /** * Builds a new SynonymQuery instance. *

* This is intended for subclasses that wish to customize the generated queries. * @return new Query instance */ - protected Query newSynonymQuery(Term terms[]) { + protected Query newSynonymQuery(Term[] terms, TokenStream sourceTokenStream) { return new SynonymQuery(terms); } @@ -630,7 +665,7 @@ protected Query newSynonymQuery(Term terms[]) { * This is intended for subclasses that wish to customize the generated queries. * @return new Query instance */ - protected Query newGraphSynonymQuery(Iterator queries) { + protected Query newGraphSynonymQuery(Iterator queries, Iterator sidePathsPayloads) { BooleanQuery.Builder builder = new BooleanQuery.Builder(); while (queries.hasNext()) { builder.add(queries.next(), BooleanClause.Occur.SHOULD); @@ -641,7 +676,7 @@ protected Query newGraphSynonymQuery(Iterator queries) { } return bq; } - + /** * Builds a new TermQuery instance. *

@@ -652,7 +687,7 @@ protected Query newGraphSynonymQuery(Iterator queries) { protected Query newTermQuery(Term term) { return new TermQuery(term); } - + /** * Builds a new MultiPhraseQuery instance. *

diff --git a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java index a7005012b730..3d5e00d6c118 100644 --- a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java +++ b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java @@ -29,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; @@ -49,12 +50,14 @@ */ public final class GraphTokenStreamFiniteStrings { private final Map idToTerm = new HashMap<>(); + private final Map idToPayload = new HashMap<>(); private final Map idToInc = new HashMap<>(); private final Automaton det; private final Transition transition = new Transition(); private class FiniteStringsTokenStream extends TokenStream { private final BytesTermAttribute termAtt = addAttribute(BytesTermAttribute.class); + private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class); private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); private final IntsRef ids; private final int end; @@ -73,6 +76,7 @@ public boolean incrementToken() throws IOException { clearAttributes(); int id = ids.ints[offset]; termAtt.setBytesRef(idToTerm.get(id)); + payloadAtt.setPayload(idToPayload.get(id)); int incr = 1; if (idToInc.containsKey(id)) { @@ -203,6 +207,7 @@ public int[] articulationPoints() { private Automaton build(final TokenStream in) throws IOException { Automaton.Builder builder = new Automaton.Builder(); final TermToBytesRefAttribute termBytesAtt = 
in.addAttribute(TermToBytesRefAttribute.class); + final PayloadAttribute payloadAtt = in.addAttribute(PayloadAttribute.class); final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class); @@ -229,7 +234,8 @@ private Automaton build(final TokenStream in) throws IOException { } BytesRef term = termBytesAtt.getBytesRef(); - int id = getTermID(currentIncr, prevIncr, term); + BytesRef payload = payloadAtt.getPayload(); + int id = getTermID(currentIncr, prevIncr, term, payload); builder.addTransition(pos, endPos, id); // only save last increment on non-zero increment in case we have multiple stacked tokens @@ -248,11 +254,14 @@ private Automaton build(final TokenStream in) throws IOException { /** * Gets an integer id for a given term and saves the position increment if needed. */ - private int getTermID(int incr, int prevIncr, BytesRef term) { + private int getTermID(int incr, int prevIncr, BytesRef term, BytesRef payload) { assert term != null; boolean isStackedGap = incr == 0 && prevIncr > 1; int id = idToTerm.size(); idToTerm.put(id, BytesRef.deepCopyOf(term)); + if (payload != null) { + idToPayload.put(id, BytesRef.deepCopyOf(payload)); + } // stacked token should have the same increment as original token at this position if (isStackedGap) { idToInc.put(id, prevIncr); diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 0068c9aaa0f0..591d35c828e0 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -16,17 +16,26 @@ */ package org.apache.solr.parser; +import java.io.IOException; import java.io.StringReader; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.ArrayList; import java.util.Collections; import 
java.util.EnumSet; import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.reverse.ReverseStringFilter; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.index.Term; import org.apache.lucene.search.AutomatonQuery; @@ -43,6 +52,11 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.RegexpQuery; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.spans.SpanBoostQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.QueryBuilder; import org.apache.lucene.util.Version; import org.apache.lucene.util.automaton.Automata; @@ -105,13 +119,26 @@ public static enum SynonymQueryStyle { * */ PICK_BEST, - /** each synonym scored indepedently, then added together (ie boolean query) - * so if "pants" has df 500, and "khakis" a df of 50, khakis matches are scored higher but - * summed with any "pants" matches - * appropriate when more specific synonyms should score higher, but we don't want to ignore - * less specific synonyms - * */ - AS_DISTINCT_TERMS + /** + * each synonym scored independently, then added together (i.e. a boolean query) + * so if "pants" has df 500, and "khakis" a df of 50, khakis matches are scored higher but + * summed with any "pants" matches + * appropriate when more specific synonyms should score higher, but we don't want to ignore + * less specific synonyms + */ + 
AS_DISTINCT_TERMS, + + /** + * this approach is an extension of the pick_best, + * it adds a boost to each synonym based on the payload associated to the term + */ + PICK_BEST_BOOST_BY_PAYLOAD, + + /** + * this approach is an extension of the as_distinct_terms, + * it adds a boost to each synonym based on the payload associated to the term + */ + AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD, } // make it possible to call setDefaultOperator() without accessing @@ -592,28 +619,180 @@ protected Query newRegexpQuery(Term regexp) { } @Override - protected Query newSynonymQuery(Term terms[]) { + protected Query newSynonymQuery(Term[] terms, TokenStream sourceTokenStream) { switch (synonymQueryStyle) { case PICK_BEST: - List currPosnClauses = new ArrayList(terms.length); - for (Term term : terms) { - currPosnClauses.add(newTermQuery(term)); - } - DisjunctionMaxQuery dm = new DisjunctionMaxQuery(currPosnClauses, 0.0f); - return dm; + return getDisjunctionSynonymQuery(terms, sourceTokenStream, false); + case PICK_BEST_BOOST_BY_PAYLOAD: + return getDisjunctionSynonymQuery(terms, sourceTokenStream, true); case AS_DISTINCT_TERMS: - BooleanQuery.Builder builder = new BooleanQuery.Builder(); - for (Term term : terms) { - builder.add(newTermQuery(term), BooleanClause.Occur.SHOULD); - } - return builder.build(); + return getBooleanSynonymQuery(terms, sourceTokenStream,false); + case AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD: + return getBooleanSynonymQuery(terms, sourceTokenStream,true); case AS_SAME_TERM: - return super.newSynonymQuery(terms); + return super.newSynonymQuery(terms, sourceTokenStream); default: throw new AssertionError("unrecognized synonymQueryStyle passed when creating newSynonymQuery"); } } + private Query getBooleanSynonymQuery(Term[] terms, TokenStream sourceTokenStream, boolean payloadBoost) { + BooleanQuery.Builder builder; + builder = new BooleanQuery.Builder(); + List synonymQueries = getSynonymQueries(terms, sourceTokenStream, payloadBoost); + for(Query 
synonymQuery:synonymQueries){ + builder.add(synonymQuery,BooleanClause.Occur.SHOULD); + } + return builder.build(); + } + + private Query getDisjunctionSynonymQuery(Term[] terms, TokenStream sourceTokenStream, boolean payloadBoost) { + List synonymQueries = getSynonymQueries(terms, sourceTokenStream, payloadBoost); + DisjunctionMaxQuery synonymQuery; + synonymQuery = new DisjunctionMaxQuery(synonymQueries, 0.0f); + return synonymQuery; + } + + private List getSynonymQueries(Term[] terms, TokenStream sourceTokenStream, boolean payloadBoost) { + List synonymQueries = new ArrayList<>(terms.length); + BytesRef[] currentStreamPayloads = null; + if (payloadBoost) { + currentStreamPayloads = super.getPayloadsFromStream(sourceTokenStream); + } + BytesRef termPayload = null; + for (int i = 0; i < terms.length; i++) { + Term currentTerm = terms[i]; + if (payloadBoost) { + termPayload = currentStreamPayloads[i]; + } + if (termPayload != null) { + float payloadValue = ByteBuffer.wrap(termPayload.bytes).order(ByteOrder.BIG_ENDIAN).getFloat(); + synonymQueries.add(new BoostQuery(newTermQuery(currentTerm), payloadValue)); + } else { + synonymQueries.add(newTermQuery(currentTerm)); + } + } + return synonymQueries; + } + + /** + * Builds a new GraphQuery for multi-terms synonyms. + *

+ * This is intended for subclasses that wish to customize the generated queries. + * + * @return new Query instance + */ + @Override + protected Query newGraphSynonymQuery(Iterator sidePaths, Iterator sidePathsPayloads) { + switch (synonymQueryStyle) { + case PICK_BEST_BOOST_BY_PAYLOAD: + List boostedSidePaths = boostQueriesByPayload(sidePaths, sidePathsPayloads); + DisjunctionMaxQuery graphSynonymQuery = new DisjunctionMaxQuery(boostedSidePaths, 0.0f); + return graphSynonymQuery; + case AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD: + boostedSidePaths = boostQueriesByPayload(sidePaths, sidePathsPayloads); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (Query boostedSidePath : boostedSidePaths) { + builder.add(boostedSidePath, BooleanClause.Occur.SHOULD); + } + BooleanQuery graphBooleanSynonymQuery = builder.build(); + if (graphBooleanSynonymQuery.clauses().size() == 1) { + return graphBooleanSynonymQuery.clauses().get(0).getQuery(); + } + return graphBooleanSynonymQuery; + default: + return super.newGraphSynonymQuery(sidePaths, sidePathsPayloads); + } + } + + private List boostQueriesByPayload(Iterator sidePaths, Iterator sidePathsPayloads) { + List boostedSidePaths = new LinkedList<>(); + while (sidePaths.hasNext()) { + Query sidePath = sidePaths.next(); + BytesRef[] sidePathPayloads = sidePathsPayloads.next(); + float overallQueryPayload = extractQueryPayload(sidePathPayloads); + if (overallQueryPayload != 0) { + boostedSidePaths.add(new BoostQuery(sidePath, overallQueryPayload)); + } else { + boostedSidePaths.add(sidePath); + } + } + return boostedSidePaths; + } + + /*Current assumption is that the user will associate a single payload to the multi terms synonym + * that generated the phrase query, so a valid value for the payload associated to the query is just the first not null payload + * e.g. 
+ * lion => panthera leo|0.99 + * "panthera leo" query will have associated Payloads [null,0.99] + * So the payload associated to the query will be 0.99 which is the first non-null one + * */ + private float extractQueryPayload(BytesRef[] payloadsForQueryTerms) { + for (BytesRef singlePayload : payloadsForQueryTerms) { + if (singlePayload != null) { + float decodedPayload = ByteBuffer.wrap(singlePayload.bytes).order(ByteOrder.BIG_ENDIAN).getFloat(); + return decodedPayload; + } + } + return 0; + } + + /** + * Creates a span query from the tokenstream. In the case of a single token, a simple SpanTermQuery is + * returned. When there are multiple tokens, an ordered SpanNearQuery with slop of 0 is returned. + * If the synonym query style involves payload boosting, a SpanBoostQuery is returned. + */ + @Override + protected SpanQuery createSpanQuery(TokenStream source, String field) throws IOException { + if (synonymQueryStyle == PICK_BEST_BOOST_BY_PAYLOAD || synonymQueryStyle == AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD) { + try (CachingTokenFilter stream = new CachingTokenFilter(source)) { + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + PayloadAttribute payloadAttribute = stream.getAttribute(PayloadAttribute.class); + List terms = new ArrayList<>(); + List payloads = new ArrayList<>(); + + stream.reset(); + if (termAtt == null) { + return null; + } + while (stream.incrementToken()) { + terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef()))); + payloads.add(payloadAttribute.getPayload()); + } + stream.end(); + stream.close(); + + BytesRef[] queryPayloadsArray = payloads.toArray(new BytesRef[payloads.size()]); + float queryPayloadBoost = 0; + if (!payloads.isEmpty()) { + queryPayloadBoost = extractQueryPayload(queryPayloadsArray); + } + + if (terms.isEmpty()) { + return null; + } else if (terms.size() == 1) { + SpanTermQuery singleTermQuery = terms.get(0); + if (queryPayloadBoost != 0) { + return new SpanBoostQuery(singleTermQuery, 
queryPayloadBoost); + } else { + return singleTermQuery; + } + } else { + SpanNearQuery multiTermQuery = new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true); + if (queryPayloadBoost != 0) { + return new SpanBoostQuery(multiTermQuery, queryPayloadBoost); + } else { + return multiTermQuery; + } + } + } catch (IOException e) { + throw new RuntimeException("Error analyzing query text", e); + } + } else { + return super.createSpanQuery(source, field); + } + } + /** * Builds a new FuzzyQuery instance * @param term Term diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index de1b99809748..c74bdbc15b90 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -197,6 +197,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + @@ -227,6 +252,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + @@ -663,7 +713,11 @@ + + + + diff --git a/solr/core/src/test-files/solr/collection1/conf/synonyms.txt b/solr/core/src/test-files/solr/collection1/conf/synonyms.txt index 68dbf0bf62b1..25466ca7b860 100644 --- a/solr/core/src/test-files/solr/collection1/conf/synonyms.txt +++ b/solr/core/src/test-files/solr/collection1/conf/synonyms.txt @@ -37,4 +37,10 @@ crow blackbird, grackle tabby => tabby, cat, feline, animal persian => persian, cat, feline, animal -jeans, denim pants \ No newline at end of file +jeans, denim pants + +# Synonyms used by Payload Boost +tiger => tiger|1.0, Big_Cat|0.8, Shere_Khan|0.9 +leopard => leopard, Big_Cat|0.8, Bagheera|0.9 +lion => lion|1.0, panthera leo|0.99, Simba|0.8 +snow_leopard => panthera uncia|0.99, snow leopard|1.0 diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java index 37347b38ad53..7c851d2b4118 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java 
+++ b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java @@ -1091,6 +1091,115 @@ public void testSynonymQueryStyle() throws Exception { } + public void testGraphSynonyms_singleTermSynonymsPayloadBoost_shouldParseBoostedQuery() throws Exception { + Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_pick_best_boost_by_payload_foo:tiger)^1.0 | (t_pick_best_boost_by_payload_foo:big_cat)^0.8 | (t_pick_best_boost_by_payload_foo:shere_khan)^0.9)", q.toString()); + + q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); + assertEquals("(t_as_distinct_boost_by_payload_foo:tiger)^1.0 (t_as_distinct_boost_by_payload_foo:big_cat)^0.8 (t_as_distinct_boost_by_payload_foo:shere_khan)^0.9", q.toString()); + + /*confirm autoGeneratePhraseQueries always builds OR queries*/ + q = QParser.getParser("jeans", req(params("df", "t_pick_best_boost_by_payload_foo", "sow", "false"))).getQuery(); + assertEquals("(t_pick_best_boost_by_payload_foo:\"denim pants\" | t_pick_best_boost_by_payload_foo:jeans)", q.toString()); + + q = QParser.getParser("jeans", req(params("df", "t_as_distinct_boost_by_payload_foo", "sow", "false"))).getQuery(); + assertEquals("(t_as_distinct_boost_by_payload_foo:\"denim pants\" t_as_distinct_boost_by_payload_foo:jeans)", q.toString()); + } + + public void testGraphSynonyms_multiTermSynonymsPayloadBoost_shouldParseBoostedPhraseQuery() throws Exception { + Query q = QParser.getParser("lion", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_pick_best_boost_by_payload_foo:lion)^1.0 | (t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.99 | (t_pick_best_boost_by_payload_foo:simba)^0.8)", q.toString()); + + q = QParser.getParser("lion", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_as_distinct_boost_by_payload_foo:lion)^1.0 
(t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.99 (t_as_distinct_boost_by_payload_foo:simba)^0.8)", q.toString()); + } + + public void testGraphSynonyms_multiTermQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedPhraseQuery() throws Exception { + Query q = QParser.getParser("snow_leopard lion", req(params("df", "t_pick_best_boost_by_payload_foo", "sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boost_by_payload_foo:\"panthera uncia\")^0.99 | (t_pick_best_boost_by_payload_foo:\"snow leopard\")^1.0) " + + "((t_pick_best_boost_by_payload_foo:lion)^1.0 | (t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.99 | (t_pick_best_boost_by_payload_foo:simba)^0.8)", q.toString()); + + q = QParser.getParser("snow_leopard lion", req(params("df", "t_as_distinct_boost_by_payload_foo", "sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.99 (t_as_distinct_boost_by_payload_foo:\"snow leopard\")^1.0) " + + "((t_as_distinct_boost_by_payload_foo:lion)^1.0 (t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.99 (t_as_distinct_boost_by_payload_foo:simba)^0.8)", q.toString()); + } + + public void testGraphSynonyms_phraseQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedSpanQuery() throws Exception { + Query q = QParser.getParser("\"snow_leopard lion\"", req(params("df", "t_pick_best_boost_by_payload_foo", "sow", "false"))).getQuery(); + assertEquals("spanNear(" + + "[spanOr([" + + "(spanNear([t_pick_best_boost_by_payload_foo:panthera, t_pick_best_boost_by_payload_foo:uncia], 0, true))^0.99, " + + "(spanNear([t_pick_best_boost_by_payload_foo:snow, t_pick_best_boost_by_payload_foo:leopard], 0, true))^1.0]), " + + "spanOr([" + + "(t_pick_best_boost_by_payload_foo:lion)^1.0, " + + "(spanNear([t_pick_best_boost_by_payload_foo:panthera, t_pick_best_boost_by_payload_foo:leo], 0, true))^0.99, " + + "(t_pick_best_boost_by_payload_foo:simba)^0.8])]," + + " 0, true)", q.toString()); + + q = 
QParser.getParser("\"snow_leopard lion\"", req(params("df", "t_as_distinct_boost_by_payload_foo", "sow", "false"))).getQuery(); + assertEquals("spanNear(" + + "[spanOr([" + + "(spanNear([t_as_distinct_boost_by_payload_foo:panthera, t_as_distinct_boost_by_payload_foo:uncia], 0, true))^0.99, " + + "(spanNear([t_as_distinct_boost_by_payload_foo:snow, t_as_distinct_boost_by_payload_foo:leopard], 0, true))^1.0]), " + + "spanOr([" + + "(t_as_distinct_boost_by_payload_foo:lion)^1.0, " + + "(spanNear([t_as_distinct_boost_by_payload_foo:panthera, t_as_distinct_boost_by_payload_foo:leo], 0, true))^0.99, " + + "(t_as_distinct_boost_by_payload_foo:simba)^0.8])]," + + " 0, true)", q.toString()); + } + + /* If you have single terms synonims, a flat token stream is still OK */ + public void testFlatSynonyms_singleTermSynonymsPayloadBoost_shouldParseBoostedQuery() throws Exception { + Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boost_by_payload_flat_foo"))).getQuery(); + assertEquals("((t_pick_best_boost_by_payload_flat_foo:tiger)^1.0 | (t_pick_best_boost_by_payload_flat_foo:big_cat)^0.8 | (t_pick_best_boost_by_payload_flat_foo:shere_khan)^0.9)", q.toString()); + + q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boost_by_payload_flat_foo"))).getQuery(); + assertEquals("(t_as_distinct_boost_by_payload_flat_foo:tiger)^1.0 (t_as_distinct_boost_by_payload_flat_foo:big_cat)^0.8 (t_as_distinct_boost_by_payload_flat_foo:shere_khan)^0.9", q.toString()); + + /*confirm autoGeneratePhraseQueries builds disjunction queries or boolean queries accordingly*/ + q = QParser.getParser("jeans", req(params("df", "t_pick_best_boost_by_payload_flat_foo", "sow", "false"))).getQuery(); + assertEquals("(t_pick_best_boost_by_payload_flat_foo:\"denim pants\" | t_pick_best_boost_by_payload_flat_foo:jeans)", q.toString()); + + q = QParser.getParser("jeans", req(params("df", "t_as_distinct_boost_by_payload_flat_foo", "sow", "false"))).getQuery(); + 
assertEquals("(t_as_distinct_boost_by_payload_flat_foo:\"denim pants\" t_as_distinct_boost_by_payload_flat_foo:jeans)", q.toString()); + } + + /* If you have multi term synonyms a flat token stream is not going to produce the ideal query, this test check it builds the best it can do*/ + public void testFlatSynonyms_multiTermQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedQuery() throws Exception { + Query q = QParser.getParser("snow_leopard lion", req(params("df", "t_pick_best_boost_by_payload_flat_foo", "sow", "false"))).getQuery(); + assertEquals("(t_pick_best_boost_by_payload_flat_foo:panthera | t_pick_best_boost_by_payload_flat_foo:snow) " + + "(t_pick_best_boost_by_payload_flat_foo:uncia | t_pick_best_boost_by_payload_flat_foo:leopard) " + + "((t_pick_best_boost_by_payload_flat_foo:lion)^1.0 | (t_pick_best_boost_by_payload_flat_foo:\"panthera leo\")^0.99 | (t_pick_best_boost_by_payload_flat_foo:simba)^0.8)", q.toString()); + + q = QParser.getParser("snow_leopard lion", req(params("df", "t_as_distinct_boost_by_payload_flat_foo", "sow", "false"))).getQuery(); + assertEquals("(t_as_distinct_boost_by_payload_flat_foo:panthera t_as_distinct_boost_by_payload_flat_foo:snow) " + + "(t_as_distinct_boost_by_payload_flat_foo:uncia t_as_distinct_boost_by_payload_flat_foo:leopard) " + + "((t_as_distinct_boost_by_payload_flat_foo:lion)^1.0 (t_as_distinct_boost_by_payload_flat_foo:\"panthera leo\")^0.99 (t_as_distinct_boost_by_payload_flat_foo:simba)^0.8)", q.toString()); + } + + /* If you have multi term synonyms a flat token stream is not going to produce the ideal query, , this test check it builds the best it can do*/ + public void testFlatSynonyms_phraseQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedSpanQuery() throws Exception { + Query q = QParser.getParser("\"snow_leopard lion\"", req(params("df", "t_pick_best_boost_by_payload_flat_foo", "sow", "false"))).getQuery(); + assertEquals("spanNear([" + + "spanOr([t_pick_best_boost_by_payload_flat_foo:panthera, 
t_pick_best_boost_by_payload_flat_foo:snow]), " + + "spanOr([t_pick_best_boost_by_payload_flat_foo:uncia, t_pick_best_boost_by_payload_flat_foo:leopard]), " + + "spanOr([(t_pick_best_boost_by_payload_flat_foo:lion)^1.0, (spanNear([t_pick_best_boost_by_payload_flat_foo:panthera, t_pick_best_boost_by_payload_flat_foo:leo], 0, true))^0.99, (t_pick_best_boost_by_payload_flat_foo:simba)^0.8])], 0, true)", q.toString()); + + q = QParser.getParser("\"snow_leopard lion\"", req(params("df", "t_as_distinct_boost_by_payload_flat_foo", "sow", "false"))).getQuery(); + assertEquals("spanNear([" + + "spanOr([t_as_distinct_boost_by_payload_flat_foo:panthera, t_as_distinct_boost_by_payload_flat_foo:snow]), " + + "spanOr([t_as_distinct_boost_by_payload_flat_foo:uncia, t_as_distinct_boost_by_payload_flat_foo:leopard]), " + + "spanOr([(t_as_distinct_boost_by_payload_flat_foo:lion)^1.0, (spanNear([t_as_distinct_boost_by_payload_flat_foo:panthera, t_as_distinct_boost_by_payload_flat_foo:leo], 0, true))^0.99, (t_as_distinct_boost_by_payload_flat_foo:simba)^0.8])], 0, true)", q.toString()); + } + + public void testGraphSynonyms_PayloadBoostMissing_shouldAssignDefaultBoost() throws Exception { + Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); + assertEquals("(t_pick_best_boost_by_payload_foo:leopard | (t_pick_best_boost_by_payload_foo:big_cat)^0.8 | (t_pick_best_boost_by_payload_foo:bagheera)^0.9)", q.toString()); + + q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); + assertEquals("t_as_distinct_boost_by_payload_foo:leopard (t_as_distinct_boost_by_payload_foo:big_cat)^0.8 (t_as_distinct_boost_by_payload_foo:bagheera)^0.9", q.toString()); + } + @Test public void testBadRequestInSetQuery() throws SyntaxError { SolrQueryRequest req = req(); diff --git a/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc 
b/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc index 0f64785c7651..38048f7b09c2 100644 --- a/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc +++ b/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc @@ -90,7 +90,7 @@ For multivalued fields, specifies a distance between multiple values, which prev `synonymQueryStyle`:: Query used to combine scores of overlapping query terms (i.e., synonyms). Consider a search for "blue tee" with query-time synonyms `tshirt,tee`. + -Use `as_same_term` (default) to blend terms, i.e., `SynonymQuery(tshirt,tee)` where each term will be treated as equally important. Use `pick_best` to select the most significant synonym when scoring `Dismax(tee,tshirt)`. Use `as_distinct_terms` to bias scoring towards the most significant synonym `(pants OR slacks)`. +Use `as_same_term` (default) to blend terms, i.e., `SynonymQuery(tshirt,tee)` where each term will be treated as equally important. Use `pick_best` to select the most significant synonym when scoring `Dismax(tee,tshirt)`. Use `as_distinct_terms` to bias scoring towards the most significant synonym `(pants OR slacks)`. Use the `*_boost_by_payload` variants (`pick_best_boost_by_payload`, `as_distinct_terms_boost_by_payload`) to additionally apply a different boost to each synonym, taken from that synonym's payload. + `as_same_term` is appropriate when terms are true synonyms (television, tv). Use `pick_best` or `as_distinct_terms` when synonyms are expanding to hyponyms `(q=jeans w/ jeans\=>jeans,pants)` and you want exact to come before parent and sibling concepts. See this http://opensourceconnections.com/blog/2017/11/21/solr-synonyms-mea-culpa/[blog article]. 
From cacf48a162ffbe8af6c2a61d70ec580835b1f1fc Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Thu, 19 Apr 2018 02:52:40 +0100 Subject: [PATCH 02/36] [SOLR-12238] query builder style revert --- .../org/apache/lucene/util/QueryBuilder.java | 114 +++++++++--------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index a28af67ced71..468c2c5d51ed 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -55,7 +55,7 @@ * *

* This can also be used as a subclass for query parsers to make it easier - * to interact with the analysis chain. Factory methods such as {@code newTermQuery} + * to interact with the analysis chain. Factory methods such as {@code newTermQuery} * are provided so that the generated queries can be customized. */ public class QueryBuilder { @@ -68,8 +68,8 @@ public class QueryBuilder { public QueryBuilder(Analyzer analyzer) { this.analyzer = analyzer; } - - /** + + /** * Creates a boolean query from the query text. *

* This is equivalent to {@code createBooleanQuery(field, queryText, Occur.SHOULD)} @@ -81,14 +81,14 @@ public QueryBuilder(Analyzer analyzer) { public Query createBooleanQuery(String field, String queryText) { return createBooleanQuery(field, queryText, BooleanClause.Occur.SHOULD); } - - /** + + /** * Creates a boolean query from the query text. *

* @param field field name * @param queryText text to be passed to the analyzer * @param operator operator used for clauses between analyzer tokens. - * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis + * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis * of {@code queryText} */ public Query createBooleanQuery(String field, String queryText, BooleanClause.Occur operator) { @@ -97,8 +97,8 @@ public Query createBooleanQuery(String field, String queryText, BooleanClause.Oc } return createFieldQuery(analyzer, operator, field, queryText, false, 0); } - - /** + + /** * Creates a phrase query from the query text. *

* This is equivalent to {@code createPhraseQuery(field, queryText, 0)} @@ -110,8 +110,8 @@ public Query createBooleanQuery(String field, String queryText, BooleanClause.Oc public Query createPhraseQuery(String field, String queryText) { return createPhraseQuery(field, queryText, 0); } - - /** + + /** * Creates a phrase query from the query text. *

* @param field field name @@ -123,26 +123,26 @@ public Query createPhraseQuery(String field, String queryText) { public Query createPhraseQuery(String field, String queryText, int phraseSlop) { return createFieldQuery(analyzer, BooleanClause.Occur.MUST, field, queryText, true, phraseSlop); } - - /** + + /** * Creates a minimum-should-match query from the query text. *

* @param field field name * @param queryText text to be passed to the analyzer - * @param fraction of query terms {@code [0..1]} that should match - * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis + * @param fraction of query terms {@code [0..1]} that should match + * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis * of {@code queryText} */ public Query createMinShouldMatchQuery(String field, String queryText, float fraction) { if (Float.isNaN(fraction) || fraction < 0 || fraction > 1) { throw new IllegalArgumentException("fraction should be >= 0 and <= 1"); } - + // TODO: weird that BQ equals/rewrite/scorer doesn't handle this? if (fraction == 1) { return createBooleanQuery(field, queryText, BooleanClause.Occur.MUST); } - + Query query = createFieldQuery(analyzer, BooleanClause.Occur.SHOULD, field, queryText, false, 0); if (query instanceof BooleanQuery) { query = addMinShouldMatchToBoolean((BooleanQuery) query, fraction); @@ -163,21 +163,21 @@ private BooleanQuery addMinShouldMatchToBoolean(BooleanQuery query, float fracti return builder.build(); } - /** - * Returns the analyzer. + /** + * Returns the analyzer. * @see #setAnalyzer(Analyzer) */ public Analyzer getAnalyzer() { return analyzer; } - - /** + + /** * Sets the analyzer used to tokenize text. */ public void setAnalyzer(Analyzer analyzer) { this.analyzer = analyzer; } - + /** * Returns true if position increments are enabled. * @see #setEnablePositionIncrements(boolean) @@ -185,7 +185,7 @@ public void setAnalyzer(Analyzer analyzer) { public boolean getEnablePositionIncrements() { return enablePositionIncrements; } - + /** * Set to true to enable position increments in result query. *

@@ -273,18 +273,18 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato // Build an appropriate query based on the analysis chain. try (CachingTokenFilter stream = new CachingTokenFilter(source)) { - + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class); PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class); if (termAtt == null) { - return null; + return null; } - + // phase 1: read through the stream and assess the situation: // counting the number of tokens/positions and marking if we have any synonyms. - + int numTokens = 0; int positionCount = 0; boolean hasSynonyms = false; @@ -305,10 +305,10 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato isGraph = true; } } - + // phase 2: based on token count, presence of synonyms, and options // formulate a single term, boolean, or phrase. 
- + if (numTokens == 0) { return null; } else if (numTokens == 1) { @@ -369,26 +369,26 @@ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOExcep } } - /** - * Creates simple term query from the cached tokenstream contents + /** + * Creates simple term query from the cached tokenstream contents */ protected Query analyzeTerm(String field, TokenStream stream) throws IOException { TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); - + stream.reset(); if (!stream.incrementToken()) { throw new AssertionError(); } - + return newTermQuery(new Term(field, termAtt.getBytesRef())); } - - /** - * Creates simple boolean query from the cached tokenstream contents + + /** + * Creates simple boolean query from the cached tokenstream contents */ protected Query analyzeBoolean(String field, TokenStream stream) throws IOException { TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); - + stream.reset(); List terms = new ArrayList<>(); while (stream.incrementToken()) { @@ -409,16 +409,16 @@ protected void add(BooleanQuery.Builder q, List current, TokenStream sourc } } - /** - * Creates complex boolean query from the cached tokenstream contents + /** + * Creates complex boolean query from the cached tokenstream contents */ protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException { BooleanQuery.Builder q = newBooleanQuery(); List currentQuery = new ArrayList<>(); - + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); - + stream.reset(); while (stream.incrementToken()) { if (posIncrAtt.getPositionIncrement() != 0) { @@ -431,18 +431,18 @@ protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanCla return q.build(); } - - /** - * Creates simple phrase query from the cached tokenstream 
contents + + /** + * Creates simple phrase query from the cached tokenstream contents */ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException { PhraseQuery.Builder builder = new PhraseQuery.Builder(); builder.setSlop(slop); - + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); - int position = -1; - + int position = -1; + stream.reset(); while (stream.incrementToken()) { if (enablePositionIncrements) { @@ -455,24 +455,24 @@ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws return builder.build(); } - - /** - * Creates complex phrase query from the cached tokenstream contents + + /** + * Creates complex phrase query from the cached tokenstream contents */ protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) throws IOException { MultiPhraseQuery.Builder mpqb = newMultiPhraseQueryBuilder(); mpqb.setSlop(slop); - + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); - int position = -1; - + int position = -1; + List multiTerms = new ArrayList<>(); stream.reset(); while (stream.incrementToken()) { int positionIncrement = posIncrAtt.getPositionIncrement(); - + if (positionIncrement > 0 && multiTerms.size() > 0) { if (enablePositionIncrements) { mpqb.add(multiTerms.toArray(new Term[0]), position); @@ -484,7 +484,7 @@ protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) t position += positionIncrement; multiTerms.add(new Term(field, termAtt.getBytesRef())); } - + if (enablePositionIncrements) { mpqb.add(multiTerms.toArray(new Term[0]), position); } else { @@ -648,7 +648,7 @@ protected List newGraphSpanQueries(String field, Iterator @@ -676,7 +676,7 @@ protected Query newGraphSynonymQuery(Iterator 
queries, Iterator @@ -687,7 +687,7 @@ protected Query newGraphSynonymQuery(Iterator queries, Iterator From ed3aa615d41eb4421de7fdbeac7773537420bda9 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Fri, 20 Apr 2018 10:55:03 +0100 Subject: [PATCH 03/36] [SOLR-12238] query edismax boost test --- .../apache/solr/search/TestSolrQueryParser.java | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java index 7c851d2b4118..4e1cce5a6f99 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java @@ -1117,11 +1117,23 @@ public void testGraphSynonyms_multiTermSynonymsPayloadBoost_shouldParseBoostedPh public void testGraphSynonyms_multiTermQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedPhraseQuery() throws Exception { Query q = QParser.getParser("snow_leopard lion", req(params("df", "t_pick_best_boost_by_payload_foo", "sow", "false"))).getQuery(); assertEquals("((t_pick_best_boost_by_payload_foo:\"panthera uncia\")^0.99 | (t_pick_best_boost_by_payload_foo:\"snow leopard\")^1.0) " + - "((t_pick_best_boost_by_payload_foo:lion)^1.0 | (t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.99 | (t_pick_best_boost_by_payload_foo:simba)^0.8)", q.toString()); + "((t_pick_best_boost_by_payload_foo:lion)^1.0 | (t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.99 | (t_pick_best_boost_by_payload_foo:simba)^0.8)", q.toString()); q = QParser.getParser("snow_leopard lion", req(params("df", "t_as_distinct_boost_by_payload_foo", "sow", "false"))).getQuery(); assertEquals("((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.99 (t_as_distinct_boost_by_payload_foo:\"snow leopard\")^1.0) " + - "((t_as_distinct_boost_by_payload_foo:lion)^1.0 (t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.99 
(t_as_distinct_boost_by_payload_foo:simba)^0.8)", q.toString()); + "((t_as_distinct_boost_by_payload_foo:lion)^1.0 (t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.99 (t_as_distinct_boost_by_payload_foo:simba)^0.8)", q.toString()); + } + + public void testGraphSynonyms_multiTermQueryMultiTermSynonymsEdismaxBoostAndPayloadBoost_shouldParseBoostedPhraseQuery() throws Exception { + Query q = QParser.getParser("snow_leopard lion","edismax",true, req(params("sow", "false","qf", "t_pick_best_boost_by_payload_foo^10"))).getQuery(); + assertEquals("+(" + + "((((t_pick_best_boost_by_payload_foo:\"panthera uncia\")^0.99 | (t_pick_best_boost_by_payload_foo:\"snow leopard\")^1.0))^10.0) " + + "((((t_pick_best_boost_by_payload_foo:lion)^1.0 | (t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.99 | (t_pick_best_boost_by_payload_foo:simba)^0.8))^10.0))", q.toString()); + + q = QParser.getParser("snow_leopard lion","edismax",true, req(params("sow", "false","qf", "t_as_distinct_boost_by_payload_foo^10"))).getQuery(); + assertEquals("+(" + + "(((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.99 (t_as_distinct_boost_by_payload_foo:\"snow leopard\")^1.0)^10.0) " + + "(((t_as_distinct_boost_by_payload_foo:lion)^1.0 (t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.99 (t_as_distinct_boost_by_payload_foo:simba)^0.8)^10.0))", q.toString()); } public void testGraphSynonyms_phraseQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedSpanQuery() throws Exception { From 5c1e50c04de3af4d2d3064b2df362392c376cc4d Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Fri, 20 Apr 2018 12:55:43 +0100 Subject: [PATCH 04/36] [SOLR-12238] comments correction --- .../src/test/org/apache/solr/search/TestSolrQueryParser.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java index 4e1cce5a6f99..a9e8e61d8bf8 100644 --- 
a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java @@ -1098,7 +1098,7 @@ public void testGraphSynonyms_singleTermSynonymsPayloadBoost_shouldParseBoostedQ q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); assertEquals("(t_as_distinct_boost_by_payload_foo:tiger)^1.0 (t_as_distinct_boost_by_payload_foo:big_cat)^0.8 (t_as_distinct_boost_by_payload_foo:shere_khan)^0.9", q.toString()); - /*confirm autoGeneratePhraseQueries always builds OR queries*/ + /*AutoGenerated queries follow the synonym query style approach */ q = QParser.getParser("jeans", req(params("df", "t_pick_best_boost_by_payload_foo", "sow", "false"))).getQuery(); assertEquals("(t_pick_best_boost_by_payload_foo:\"denim pants\" | t_pick_best_boost_by_payload_foo:jeans)", q.toString()); @@ -1160,7 +1160,7 @@ public void testGraphSynonyms_phraseQueryMultiTermSynonymsPayloadBoost_shouldPar " 0, true)", q.toString()); } - /* If you have single terms synonims, a flat token stream is still OK */ + /* If you have single terms synonyms, a flat token stream is still OK */ public void testFlatSynonyms_singleTermSynonymsPayloadBoost_shouldParseBoostedQuery() throws Exception { Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boost_by_payload_flat_foo"))).getQuery(); assertEquals("((t_pick_best_boost_by_payload_flat_foo:tiger)^1.0 | (t_pick_best_boost_by_payload_flat_foo:big_cat)^0.8 | (t_pick_best_boost_by_payload_flat_foo:shere_khan)^0.9)", q.toString()); From 3d7ad623059b4788fd6b0a42f9d7573c0b74c741 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Thu, 26 Apr 2018 01:16:59 +0100 Subject: [PATCH 05/36] [SOLR-12238] use PayloadHelper to decode the payload --- .../java/org/apache/solr/parser/SolrQueryParserBase.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java 
b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 591d35c828e0..0a0951b02c0a 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.payloads.PayloadHelper; import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; @@ -666,8 +667,8 @@ private List getSynonymQueries(Term[] terms, TokenStream sourceTokenStrea termPayload = currentStreamPayloads[i]; } if (termPayload != null) { - float payloadValue = ByteBuffer.wrap(termPayload.bytes).order(ByteOrder.BIG_ENDIAN).getFloat(); - synonymQueries.add(new BoostQuery(newTermQuery(currentTerm), payloadValue)); + float decodedPayload = PayloadHelper.decodeFloat(termPayload.bytes, termPayload.offset); + synonymQueries.add(new BoostQuery(newTermQuery(currentTerm), decodedPayload)); } else { synonymQueries.add(newTermQuery(currentTerm)); } @@ -730,7 +731,7 @@ private List boostQueriesByPayload(Iterator sidePaths, Iterator Date: Fri, 27 Apr 2018 16:43:42 +0100 Subject: [PATCH 06/36] [SOLR-12238] tests to verify managed synonym handle weighted synonyms with separator | --- .../TestManagedSynonymGraphFilterFactory.java | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java b/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java index 906462d07779..9cd68f65e1da 100644 --- a/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java +++ 
b/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java @@ -295,4 +295,80 @@ public void testCanHandleDecodingAndEncodingForSynonyms() throws Exception { assertJDelete(endpoint+"/fröhlich", "/error/code==404"); } + + /** + * Can we add and single term synonyms with weight + */ + @Test + public void testManagedSynonyms_singleTermWithWeight_shouldHandleSynonym() throws Exception { + String endpoint = "/schema/analysis/synonyms/englishgraph"; + + assertJQ(endpoint, + "/synonymMappings/initArgs/ignoreCase==false", + "/synonymMappings/managedMap=={}"); + + // does not exist + assertJQ(endpoint+"/tiger", + "/error/code==404"); + + Map> syns = new HashMap<>(); + + // now put a synonym + syns.put("tiger", Arrays.asList("tiger|1.0")); + assertJPut(endpoint, + JSONUtil.toJSON(syns), + "/responseHeader/status==0"); + + // and check if it exists + assertJQ(endpoint, + "/synonymMappings/managedMap/tiger==['tiger|1.0']"); + + // verify delete works + assertJDelete(endpoint+"/tiger", + "/responseHeader/status==0"); + + + // was it really deleted? 
+ assertJDelete(endpoint+"/tiger", + "/error/code==404"); + } + + /** + * Can we add multi term synonyms with weight + */ + @Test + public void testManagedSynonyms_multiTermWithWeight_shouldHandleSynonym() throws Exception { + String endpoint = "/schema/analysis/synonyms/englishgraph"; + + assertJQ(endpoint, + "/synonymMappings/initArgs/ignoreCase==false", + "/synonymMappings/managedMap=={}"); + + // does not exist + assertJQ(endpoint+"/tiger", + "/error/code==404"); + + Map> syns = new HashMap<>(); + + // now put a synonym + List tigerSyonyms = Arrays.asList("tiger|1.0", "panthera tigris|0.9", "Shere Kan|0.8"); + syns.put("tiger", tigerSyonyms); + String jsonTigerSynonyms = JSONUtil.toJSON(syns); + assertJPut(endpoint, + jsonTigerSynonyms, + "/responseHeader/status==0"); + + // and check if it exists + assertJQ(endpoint, + "/synonymMappings/managedMap/tiger==[\"Shere Kan|0.8\",\"panthera tigris|0.9\",\"tiger|1.0\"]"); + + // verify delete works + assertJDelete(endpoint+"/tiger", + "/responseHeader/status==0"); + + + // was it really deleted? 
+ assertJDelete(endpoint+"/tiger", + "/error/code==404"); + } } From e02ed0dc6af1b7ce84a7a137e5d63e9a7680d0b4 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Fri, 27 Apr 2018 17:30:50 +0100 Subject: [PATCH 07/36] [SOLR-12238] minor style --- solr/core/src/test-files/solr/collection1/conf/schema12.xml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index c74bdbc15b90..1493d72131f0 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -221,8 +221,7 @@ - - + @@ -277,7 +276,6 @@ - From 2e641202289d63a0a3a3172c72e91e66deab7e4a Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Thu, 3 May 2018 18:29:36 +0100 Subject: [PATCH 08/36] [SOLR-12238] precommit fixes --- .../src/java/org/apache/solr/parser/SolrQueryParserBase.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 0a0951b02c0a..2bd6b397c8ec 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -18,8 +18,6 @@ import java.io.IOException; import java.io.StringReader; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; import java.util.ArrayList; import java.util.Collections; import java.util.EnumSet; From 42f9a99c9b0dd9f216f049eba76124c50ff1a8ff Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Fri, 24 Jan 2020 14:17:09 +0000 Subject: [PATCH 09/36] [SOLR-12238] merge conflicts fixed --- .../core/src/java/org/apache/lucene/util/QueryBuilder.java | 4 ++-- .../java/org/apache/solr/parser/SolrQueryParserBase.java | 7 ++++--- .../analysis/TestManagedSynonymGraphFilterFactory.java | 4 ++-- 3 files changed, 8 insertions(+), 7 
deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index 88f3706e5ea3..46589bdee621 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -615,7 +615,7 @@ protected Query analyzeGraphPhrase(TokenStream source, String field, int phraseS final SpanQuery queryClause; if (graph.hasSidePath(start)) { Iterator sidePaths = graph.getFiniteStrings(start, end); - List sidePathQueries = newGraphSpanQueries(field, sidePaths); + List sidePathQueries = newGraphSpanQueries(field, sidePaths, maxClauseCount); if (sidePathQueries.size() > 0) { queryClause = new SpanOrQuery(sidePathQueries.toArray(new SpanQuery[0])); } else { @@ -655,7 +655,7 @@ protected Query analyzeGraphPhrase(TokenStream source, String field, int phraseS } } - protected List newGraphSpanQueries(String field, Iterator sidePaths) throws IOException { + protected List newGraphSpanQueries(String field, Iterator sidePaths, int maxClauseCount) throws IOException { List queries = new ArrayList<>(); while (sidePaths.hasNext()) { TokenStream sidePath = sidePaths.next(); diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index adcebb8f1368..faeba41da03d 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -58,7 +58,6 @@ import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.QueryBuilder; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; @@ -105,8 +104,10 @@ public abstract class SolrQueryParserBase extends QueryBuilder 
{ * {@link #AS_SAME_TERM} * {@link #PICK_BEST} * {@link #AS_DISTINCT_TERMS} + * {@link #PICK_BEST_BOOST_BY_PAYLOAD} + * {@link #AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD} */ - public static enum SynonymQueryStyle { + public enum SynonymQueryStyle { /** (default) synonym terms share doc freq * so if "pants" has df 500, and "khakis" a df of 50, uses 500 df when scoring both terms * appropriate for exact synonyms @@ -752,7 +753,7 @@ private float extractQueryPayload(BytesRef[] payloadsForQueryTerms) { */ @Override protected SpanQuery createSpanQuery(TokenStream source, String field) throws IOException { - if (synonymQueryStyle == PICK_BEST_BOOST_BY_PAYLOAD || synonymQueryStyle == AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD) { + if (synonymQueryStyle == SynonymQueryStyle.PICK_BEST_BOOST_BY_PAYLOAD || synonymQueryStyle == SynonymQueryStyle.AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD) { try (CachingTokenFilter stream = new CachingTokenFilter(source)) { TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PayloadAttribute payloadAttribute = stream.getAttribute(PayloadAttribute.class); diff --git a/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java b/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java index 2f24358169f8..66e9efe5fce4 100644 --- a/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java +++ b/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java @@ -321,7 +321,7 @@ public void testManagedSynonyms_singleTermWithWeight_shouldHandleSynonym() throw // now put a synonym syns.put("tiger", Arrays.asList("tiger|1.0")); assertJPut(endpoint, - JSONUtil.toJSON(syns), + toJSONString(syns), "/responseHeader/status==0"); // and check if it exists @@ -358,7 +358,7 @@ public void testManagedSynonyms_multiTermWithWeight_shouldHandleSynonym() throws // now put a synonym List tigerSyonyms = 
Arrays.asList("tiger|1.0", "panthera tigris|0.9", "Shere Kan|0.8"); syns.put("tiger", tigerSyonyms); - String jsonTigerSynonyms = JSONUtil.toJSON(syns); + String jsonTigerSynonyms = toJSONString(syns); assertJPut(endpoint, jsonTigerSynonyms, "/responseHeader/status==0"); From 611365861867786ba00a20e47b2a924323066a04 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Fri, 24 Jan 2020 16:36:21 +0000 Subject: [PATCH 10/36] [SOLR-12238] temp commit --- .../org/apache/lucene/util/QueryBuilder.java | 103 ++++++++++-------- .../solr/parser/SolrQueryParserBase.java | 81 +++++++++----- .../org/apache/solr/schema/FieldType.java | 1 + .../org/apache/solr/schema/TextField.java | 6 + .../solr/collection1/conf/schema12.xml | 8 +- 5 files changed, 121 insertions(+), 78 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index 46589bdee621..8dd351cb65da 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -31,6 +31,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; @@ -63,6 +64,7 @@ public class QueryBuilder { protected boolean enablePositionIncrements = true; protected boolean enableGraphQueries = true; protected boolean autoGenerateMultiTermSynonymsPhraseQuery = false; + protected boolean synonymsBoostByPayload = false; /** Creates a new QueryBuilder using the given analyzer. 
*/ public QueryBuilder(Analyzer analyzer) { @@ -217,6 +219,22 @@ public void setAutoGenerateMultiTermSynonymsPhraseQuery(boolean enable) { this.autoGenerateMultiTermSynonymsPhraseQuery = enable; } + /** + * Returns true if synonyms should be automatically boosted by their payload. + * @see #setAutoGenerateMultiTermSynonymsPhraseQuery(boolean) + */ + public boolean getSynonymsBoostByPayload() { + return synonymsBoostByPayload; + } + + /** + * Set to true if synonyms should be automatically boosted by their payload. + * Default: false. + */ + public void setSynonymsBoostByPayload(boolean enable) { + this.synonymsBoostByPayload = enable; + } + /** * Creates a query from the analysis chain. *

@@ -239,7 +257,7 @@ protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator // Use the analyzer to get all the tokens, and then build an appropriate // query based on the analysis chain. try (TokenStream source = analyzer.tokenStream(field, queryText)) { - return createFieldQuery(source, operator, field, quoted, phraseSlop); + return createFieldQuery(source, operator, field, quoted, false, phraseSlop); } catch (IOException e) { throw new RuntimeException("Error analyzing query text", e); } @@ -266,14 +284,18 @@ public boolean getEnableGraphQueries() { * @param operator default boolean operator used for this query * @param field field to create queries against * @param quoted true if phrases should be generated when terms occur at more than one position + * @param boostByPayload true if synonyms must be boosted by payload * @param phraseSlop slop factor for phrase/multiphrase queries */ - protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operator, String field, boolean quoted, int phraseSlop) { + protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operator, String field, boolean quoted, boolean boostByPayload, int phraseSlop) { assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST; // Build an appropriate query based on the analysis chain. 
try (CachingTokenFilter stream = new CachingTokenFilter(source)) { - + PayloadAttribute payloadAtt = null; + if(boostByPayload){ + payloadAtt = stream.getAttribute(PayloadAttribute.class); + } TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class); PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class); @@ -289,9 +311,12 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato int positionCount = 0; boolean hasSynonyms = false; boolean isGraph = false; + BytesRef payload ; + float decodedPayload = 0f; stream.reset(); while (stream.incrementToken()) { + numTokens++; int positionIncrement = posIncAtt.getPositionIncrement(); if (positionIncrement != 0) { @@ -305,7 +330,13 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato isGraph = true; } } - + + if (payloadAtt != null) { + payload = payloadAtt.getPayload(); + if(payload != null) { + decodedPayload = decodeFloat(payload.bytes, payload.offset); + } + } // phase 2: based on token count, presence of synonyms, and options // formulate a single term, boolean, or phrase. 
@@ -313,11 +344,11 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato return null; } else if (numTokens == 1) { // single term - return analyzeTerm(field, stream); + return new BoostQuery(analyzeTerm(field, stream),decodedPayload); } else if (isGraph) { // graph if (quoted) { - return analyzeGraphPhrase(stream, field, phraseSlop); + return analyzeGraphPhrase(stream, field, boostByPayload, phraseSlop); } else { return analyzeGraphBoolean(field, stream, operator); } @@ -325,19 +356,19 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato // phrase if (hasSynonyms) { // complex phrase with synonyms - return analyzeMultiPhrase(field, stream, phraseSlop); + return new BoostQuery(analyzeMultiPhrase(field, stream, phraseSlop),decodedPayload); } else { // simple phrase - return analyzePhrase(field, stream, phraseSlop); + return new BoostQuery(analyzePhrase(field, stream, phraseSlop),decodedPayload); } } else { // boolean if (positionCount == 1) { // only one position, with synonyms - return analyzeBoolean(field, stream); + return new BoostQuery(analyzeBoolean(field, stream),decodedPayload); } else { // complex case: multiple positions - return analyzeMultiBoolean(field, stream, operator); + return new BoostQuery(analyzeMultiBoolean(field, stream, operator),decodedPayload); } } } catch (IOException e) { @@ -345,6 +376,16 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato } } + public static final float decodeFloat(byte [] bytes, int offset){ + + return Float.intBitsToFloat(decodeInt(bytes, offset)); + } + + public static final int decodeInt(byte [] bytes, int offset){ + return ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) + | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF); + } + /** * Creates a span query from the tokenstream. In the case of a single token, a simple SpanTermQuery is * returned. 
When multiple tokens, an ordered SpanNearQuery with slop 0 is returned. @@ -522,23 +563,10 @@ public boolean hasNext() { @Override public Query next() { TokenStream sidePath = sidePaths.next(); - return createFieldQuery(sidePath, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0); - } - }; - final Iterator sidePathsForPayloads= graph.getFiniteStrings(start, end); - Iterator sidePathsPayloads = new Iterator() { - @Override - public boolean hasNext() { - return sidePathsForPayloads.hasNext(); - } - - @Override - public BytesRef[] next() { - TokenStream sidePath = sidePathsForPayloads.next(); - return getPayloadsFromStream(sidePath); + return createFieldQuery(sidePath, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(),getSynonymsBoostByPayload(), 0); } }; - queryClause = newGraphSynonymQuery(sidePathsQueries, sidePathsPayloads); + queryClause = newGraphSynonymQuery(sidePathsQueries); } else { Term[] terms = graph.getTerms(field, start); assert terms.length > 0; @@ -554,29 +582,10 @@ public BytesRef[] next() { } return builder.build(); } - - protected BytesRef[] getPayloadsFromStream(TokenStream source) { - try (CachingTokenFilter stream = new CachingTokenFilter(source)) { - PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class); - stream.reset(); - List payloads = new ArrayList<>(); - while (stream.incrementToken()) { - if (payloadAtt != null) { - payloads.add(payloadAtt.getPayload()); - } - } - stream.end(); - stream.close(); - return payloads.toArray(new BytesRef[payloads.size()]); - } catch (IOException e) { - throw new RuntimeException("Error analyzing query text", e); - } - } - /** * Creates graph phrase query from the tokenstream contents */ - protected Query analyzeGraphPhrase(TokenStream source, String field, int phraseSlop) + protected Query analyzeGraphPhrase(TokenStream source, String field, boolean boostByPayload, int phraseSlop) throws IOException { source.reset(); 
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source); @@ -588,7 +597,7 @@ protected Query analyzeGraphPhrase(TokenStream source, String field, int phraseS BooleanQuery.Builder builder = new BooleanQuery.Builder(); Iterator it = graph.getFiniteStrings(); while (it.hasNext()) { - Query query = createFieldQuery(it.next(), BooleanClause.Occur.MUST, field, true, phraseSlop); + Query query = createFieldQuery(it.next(), BooleanClause.Occur.MUST, field, true, boostByPayload, phraseSlop); if (query != null) { builder.add(query, BooleanClause.Occur.SHOULD); } @@ -700,7 +709,7 @@ protected Query newSynonymQuery(Term[] terms, TokenStream sourceTokenStream) { * This is intended for subclasses that wish to customize the generated queries. * @return new Query instance */ - protected Query newGraphSynonymQuery(Iterator queries, Iterator sidePathsPayloads) { + protected Query newGraphSynonymQuery(Iterator queries) { BooleanQuery.Builder builder = new BooleanQuery.Builder(); while (queries.hasNext()) { builder.add(queries.next(), BooleanClause.Occur.SHOULD); diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index faeba41da03d..016f3b7ba46f 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -104,8 +104,6 @@ public abstract class SolrQueryParserBase extends QueryBuilder { * {@link #AS_SAME_TERM} * {@link #PICK_BEST} * {@link #AS_DISTINCT_TERMS} - * {@link #PICK_BEST_BOOST_BY_PAYLOAD} - * {@link #AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD} */ public enum SynonymQueryStyle { /** (default) synonym terms share doc freq @@ -128,19 +126,17 @@ public enum SynonymQueryStyle { * appropriate when more specific synonyms should score higher, but we don't want to ignore * less specific synonyms */ - AS_DISTINCT_TERMS, + AS_DISTINCT_TERMS; - /** - * this approach is an extension 
of the pick_best, - * it adds a boost to each synonym based on the payload associated to the term - */ - PICK_BEST_BOOST_BY_PAYLOAD, + private boolean boostByPayload; - /** - * this approach is an extension of the as_distinct_terms, - * it adds a boost to each synonym based on the payload associated to the term - */ - AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD, + public boolean isBoostByPayload() { + return boostByPayload; + } + + public void setBoostByPayload(boolean boostByPayload) { + this.boostByPayload = boostByPayload; + } } // make it possible to call setDefaultOperator() without accessing @@ -631,13 +627,9 @@ protected Query newRegexpQuery(Term regexp) { protected Query newSynonymQuery(Term[] terms, TokenStream sourceTokenStream) { switch (synonymQueryStyle) { case PICK_BEST: - return getDisjunctionSynonymQuery(terms, sourceTokenStream, false); - case PICK_BEST_BOOST_BY_PAYLOAD: - return getDisjunctionSynonymQuery(terms, sourceTokenStream, true); + return getDisjunctionSynonymQuery(terms, sourceTokenStream, synonymQueryStyle.isBoostByPayload()); case AS_DISTINCT_TERMS: - return getBooleanSynonymQuery(terms, sourceTokenStream,false); - case AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD: - return getBooleanSynonymQuery(terms, sourceTokenStream,true); + return getBooleanSynonymQuery(terms, sourceTokenStream,synonymQueryStyle.isBoostByPayload()); case AS_SAME_TERM: return super.newSynonymQuery(terms, sourceTokenStream); default: @@ -666,7 +658,7 @@ private List getSynonymQueries(Term[] terms, TokenStream sourceTokenStrea List synonymQueries = new ArrayList<>(terms.length); BytesRef[] currentStreamPayloads = null; if (payloadBoost) { - currentStreamPayloads = super.getPayloadsFromStream(sourceTokenStream); + currentStreamPayloads = this.getPayloadsFromStream(sourceTokenStream); } BytesRef termPayload = null; for (int i = 0; i < terms.length; i++) { @@ -692,14 +684,15 @@ private List getSynonymQueries(Term[] terms, TokenStream sourceTokenStrea * @return new Query instance */ 
@Override - protected Query newGraphSynonymQuery(Iterator sidePaths, Iterator sidePathsPayloads) { + protected Query newGraphSynonymQuery(Iterator sidePaths) { switch (synonymQueryStyle) { - case PICK_BEST_BOOST_BY_PAYLOAD: + case PICK_BEST:{ List boostedSidePaths = boostQueriesByPayload(sidePaths, sidePathsPayloads); DisjunctionMaxQuery graphSynonymQuery = new DisjunctionMaxQuery(boostedSidePaths, 0.0f); - return graphSynonymQuery; - case AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD: - boostedSidePaths = boostQueriesByPayload(sidePaths, sidePathsPayloads); + return graphSynonymQuery;} + case AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD:{ + Iterator sidePathsPayloads = this.extractPayloadsFromTokenStreams(sidePathsTokenStreams); + List boostedSidePaths = boostQueriesByPayload(sidePaths, sidePathsPayloads); BooleanQuery.Builder builder = new BooleanQuery.Builder(); for (Query boostedSidePath : boostedSidePaths) { builder.add(boostedSidePath, BooleanClause.Occur.SHOULD); @@ -708,9 +701,43 @@ protected Query newGraphSynonymQuery(Iterator sidePaths, Iterator extractPayloadsFromTokenStreams(Iterator sidePathsTokenStreams) { + Iterator sidePathsPayloads = new Iterator() { + @Override + public boolean hasNext() { + return sidePathsTokenStreams.hasNext(); + } + + @Override + public BytesRef[] next() { + TokenStream sidePath = sidePathsTokenStreams.next(); + return getPayloadsFromStream(sidePath); + } + }; + return sidePathsPayloads; + } + + protected BytesRef[] getPayloadsFromStream(TokenStream source) { + try (CachingTokenFilter stream = new CachingTokenFilter(source)) { + PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class); + stream.reset(); + List payloads = new ArrayList<>(); + while (stream.incrementToken()) { + if (payloadAtt != null) { + payloads.add(payloadAtt.getPayload()); + } + } + stream.end(); + stream.close(); + return payloads.toArray(new BytesRef[payloads.size()]); + } catch (IOException e) { + throw new RuntimeException("Error analyzing query text", e); 
} } diff --git a/solr/core/src/java/org/apache/solr/schema/FieldType.java b/solr/core/src/java/org/apache/solr/schema/FieldType.java index e344019ca3d8..937b68cb9a62 100644 --- a/solr/core/src/java/org/apache/solr/schema/FieldType.java +++ b/solr/core/src/java/org/apache/solr/schema/FieldType.java @@ -1063,6 +1063,7 @@ protected void checkSupportsDocValues() { protected static final String AUTO_GENERATE_PHRASE_QUERIES = "autoGeneratePhraseQueries"; protected static final String ENABLE_GRAPH_QUERIES = "enableGraphQueries"; + protected static final String BOOST_BY_PAYLOAD = "boostByPayload"; private static final String ARGS = "args"; private static final String POSITION_INCREMENT_GAP = "positionIncrementGap"; protected static final String SYNONYM_QUERY_STYLE = "synonymQueryStyle"; diff --git a/solr/core/src/java/org/apache/solr/schema/TextField.java b/solr/core/src/java/org/apache/solr/schema/TextField.java index bddaf00c760c..71164ac61fc7 100644 --- a/solr/core/src/java/org/apache/solr/schema/TextField.java +++ b/solr/core/src/java/org/apache/solr/schema/TextField.java @@ -87,6 +87,12 @@ protected void init(IndexSchema schema, Map args) { if (enableGraphQueriesStr != null) enableGraphQueries = Boolean.parseBoolean(enableGraphQueriesStr); + boolean boostByPayload = false; + String boostByPayloadStr = args.remove(BOOST_BY_PAYLOAD); + if (boostByPayloadStr != null) + boostByPayload = Boolean.parseBoolean(boostByPayloadStr); + this.synonymQueryStyle.setBoostByPayload(boostByPayload); + super.init(schema, args); } diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index 06c879d66d0a..00b66f0944a3 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -197,7 +197,7 @@ - + @@ -209,7 +209,7 @@ - + @@ -251,7 +251,7 @@ - + @@ -263,7 +263,7 @@ - + From b44be68fff7f29f50effe5f6043b298893f1af20 Mon Sep 17 
00:00:00 2001 From: Alessandro Benedetti Date: Fri, 24 Jan 2020 19:38:51 +0000 Subject: [PATCH 11/36] [SOLR-12238] re-design --- .../org/apache/lucene/util/QueryBuilder.java | 132 ++++++++++------- .../graph/GraphTokenStreamFiniteStrings.java | 4 +- .../org/apache/solr/parser/QueryParser.java | 4 +- .../solr/parser/SolrQueryParserBase.java | 138 +++++++----------- .../org/apache/solr/schema/TextField.java | 7 +- .../solr/search/ExtendedDismaxQParser.java | 4 +- .../solr/search/TestExtendedDismaxParser.java | 4 +- 7 files changed, 145 insertions(+), 148 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index 8dd351cb65da..4e92cfb421bc 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -31,7 +31,6 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; @@ -257,7 +256,7 @@ protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator // Use the analyzer to get all the tokens, and then build an appropriate // query based on the analysis chain. 
try (TokenStream source = analyzer.tokenStream(field, queryText)) { - return createFieldQuery(source, operator, field, quoted, false, phraseSlop); + return createFieldQuery(source, operator, field,quoted, phraseSlop); } catch (IOException e) { throw new RuntimeException("Error analyzing query text", e); } @@ -284,39 +283,32 @@ public boolean getEnableGraphQueries() { * @param operator default boolean operator used for this query * @param field field to create queries against * @param quoted true if phrases should be generated when terms occur at more than one position - * @param boostByPayload true if synonyms must be boosted by payload * @param phraseSlop slop factor for phrase/multiphrase queries */ - protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operator, String field, boolean quoted, boolean boostByPayload, int phraseSlop) { + protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operator, String field, boolean quoted, int phraseSlop) { assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST; // Build an appropriate query based on the analysis chain. try (CachingTokenFilter stream = new CachingTokenFilter(source)) { - PayloadAttribute payloadAtt = null; - if(boostByPayload){ - payloadAtt = stream.getAttribute(PayloadAttribute.class); - } + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class); PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class); if (termAtt == null) { - return null; + return null; } - + // phase 1: read through the stream and assess the situation: // counting the number of tokens/positions and marking if we have any synonyms. 
- + int numTokens = 0; int positionCount = 0; boolean hasSynonyms = false; boolean isGraph = false; - BytesRef payload ; - float decodedPayload = 0f; stream.reset(); while (stream.incrementToken()) { - numTokens++; int positionIncrement = posIncAtt.getPositionIncrement(); if (positionIncrement != 0) { @@ -331,24 +323,18 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato } } - if (payloadAtt != null) { - payload = payloadAtt.getPayload(); - if(payload != null) { - decodedPayload = decodeFloat(payload.bytes, payload.offset); - } - } // phase 2: based on token count, presence of synonyms, and options // formulate a single term, boolean, or phrase. - + if (numTokens == 0) { return null; } else if (numTokens == 1) { // single term - return new BoostQuery(analyzeTerm(field, stream),decodedPayload); + return analyzeTerm(field, stream); } else if (isGraph) { // graph if (quoted) { - return analyzeGraphPhrase(stream, field, boostByPayload, phraseSlop); + return analyzeGraphPhrase(stream, field, phraseSlop); } else { return analyzeGraphBoolean(field, stream, operator); } @@ -356,35 +342,25 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato // phrase if (hasSynonyms) { // complex phrase with synonyms - return new BoostQuery(analyzeMultiPhrase(field, stream, phraseSlop),decodedPayload); + return analyzeMultiPhrase(field, stream, phraseSlop); } else { // simple phrase - return new BoostQuery(analyzePhrase(field, stream, phraseSlop),decodedPayload); + return analyzePhrase(field, stream, phraseSlop); } } else { // boolean if (positionCount == 1) { // only one position, with synonyms - return new BoostQuery(analyzeBoolean(field, stream),decodedPayload); + return analyzeBoolean(field, stream); } else { // complex case: multiple positions - return new BoostQuery(analyzeMultiBoolean(field, stream, operator),decodedPayload); + return analyzeMultiBoolean(field, stream, operator); } } } catch (IOException e) { throw new 
RuntimeException("Error analyzing query text", e); } } - - public static final float decodeFloat(byte [] bytes, int offset){ - - return Float.intBitsToFloat(decodeInt(bytes, offset)); - } - - public static final int decodeInt(byte [] bytes, int offset){ - return ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) - | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF); - } /** * Creates a span query from the tokenstream. In the case of a single token, a simple SpanTermQuery is @@ -429,24 +405,33 @@ protected Query analyzeTerm(String field, TokenStream stream) throws IOException */ protected Query analyzeBoolean(String field, TokenStream stream) throws IOException { TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + PayloadAttribute payloadAtt = null; + if(synonymsBoostByPayload){ + payloadAtt = stream.getAttribute(PayloadAttribute.class); + } stream.reset(); List terms = new ArrayList<>(); + List payloads = new ArrayList<>(); while (stream.incrementToken()) { terms.add(new Term(field, termAtt.getBytesRef())); + if (payloadAtt != null) { + payloads.add(payloadAtt.getPayload()); + } } - return newSynonymQuery(terms.toArray(new Term[terms.size()]),stream); + return newSynonymQuery(terms.toArray(new Term[terms.size()]),payloads.toArray(new BytesRef[payloads.size()])); } - protected void add(BooleanQuery.Builder q, List current, TokenStream sourceTokenStream, BooleanClause.Occur operator) { + protected void add(BooleanQuery.Builder q, List current, List payloads, BooleanClause.Occur operator) { if (current.isEmpty()) { return; } if (current.size() == 1) { q.add(newTermQuery(current.get(0)), operator); } else { - q.add(newSynonymQuery(current.toArray(new Term[current.size()]), sourceTokenStream), operator); + Query synonymQuery = newSynonymQuery(current.toArray(new Term[current.size()]), payloads.toArray(new BytesRef[payloads.size()])); + q.add(synonymQuery, operator); } } @@ -456,19 +441,26 @@ protected 
void add(BooleanQuery.Builder q, List current, TokenStream sourc protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException { BooleanQuery.Builder q = newBooleanQuery(); List currentQuery = new ArrayList<>(); - + List currentPayload = new ArrayList<>(); + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); - + PayloadAttribute payloadAtt = null; + if(synonymsBoostByPayload){ + payloadAtt = stream.getAttribute(PayloadAttribute.class); + } stream.reset(); while (stream.incrementToken()) { if (posIncrAtt.getPositionIncrement() != 0) { - add(q, currentQuery, stream, operator); + add(q, currentQuery, currentPayload, operator); currentQuery.clear(); } currentQuery.add(new Term(field, termAtt.getBytesRef())); + if(payloadAtt!=null){ + currentPayload.add(payloadAtt.getPayload()); + } } - add(q, currentQuery, stream, operator); + add(q, currentQuery, currentPayload, operator); return q.build(); } @@ -552,6 +544,19 @@ protected Query analyzeGraphBoolean(String field, TokenStream source, BooleanCla } lastState = end; final Query queryClause; + final Iterator sidePathsForPayloads = graph.getFiniteStrings(start, end); + Iterator sidePathsPayloads = new Iterator() { + @Override + public boolean hasNext() { + return sidePathsForPayloads.hasNext(); + } + + @Override + public BytesRef[] next() { + TokenStream sidePath = sidePathsForPayloads.next(); + return getPayloadsFromStream(sidePath); + } + }; if (graph.hasSidePath(start)) { final Iterator sidePaths = graph.getFiniteStrings(start, end); Iterator sidePathsQueries = new Iterator() { @@ -563,17 +568,24 @@ public boolean hasNext() { @Override public Query next() { TokenStream sidePath = sidePaths.next(); - return createFieldQuery(sidePath, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(),getSynonymsBoostByPayload(), 
0); + return createFieldQuery(sidePath, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0); } }; - queryClause = newGraphSynonymQuery(sidePathsQueries); + queryClause = newGraphSynonymQuery(sidePathsQueries, sidePathsPayloads); } else { Term[] terms = graph.getTerms(field, start); assert terms.length > 0; if (terms.length == 1) { queryClause = newTermQuery(terms[0]); } else { - queryClause = newSynonymQuery(terms, source); + BytesRef[] payloads = new BytesRef[terms.length]; + if (synonymsBoostByPayload) { + int j=0; + while(sidePathsForPayloads.hasNext()) { + payloads[j] = sidePathsPayloads.next()[0]; + } + } + queryClause = newSynonymQuery(terms, payloads); } } if (queryClause != null) { @@ -582,10 +594,28 @@ public Query next() { } return builder.build(); } + + protected BytesRef[] getPayloadsFromStream(TokenStream source) { + try (CachingTokenFilter stream = new CachingTokenFilter(source)) { + PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class); + stream.reset(); + List payloads = new ArrayList<>(); + while (stream.incrementToken()) { + if (payloadAtt != null) { + payloads.add(payloadAtt.getPayload()); + } + } + stream.end(); + stream.close(); + return payloads.toArray(new BytesRef[payloads.size()]); + } catch (IOException e) { + throw new RuntimeException("Error analyzing query text", e); + } + } /** * Creates graph phrase query from the tokenstream contents */ - protected Query analyzeGraphPhrase(TokenStream source, String field, boolean boostByPayload, int phraseSlop) + protected Query analyzeGraphPhrase(TokenStream source, String field, int phraseSlop) throws IOException { source.reset(); GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source); @@ -597,7 +627,7 @@ protected Query analyzeGraphPhrase(TokenStream source, String field, boolean boo BooleanQuery.Builder builder = new BooleanQuery.Builder(); Iterator it = graph.getFiniteStrings(); while (it.hasNext()) { - Query query = 
createFieldQuery(it.next(), BooleanClause.Occur.MUST, field, true, boostByPayload, phraseSlop); + Query query = createFieldQuery(it.next(), BooleanClause.Occur.MUST, field, true, phraseSlop); if (query != null) { builder.add(query, BooleanClause.Occur.SHOULD); } @@ -695,7 +725,7 @@ protected BooleanQuery.Builder newBooleanQuery() { * This is intended for subclasses that wish to customize the generated queries. * @return new Query instance */ - protected Query newSynonymQuery(Term[] terms, TokenStream sourceTokenStream) { + protected Query newSynonymQuery(Term[] terms, BytesRef[] termPayload) { SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].field()); for (Term term : terms) { builder.addTerm(term); @@ -709,7 +739,7 @@ protected Query newSynonymQuery(Term[] terms, TokenStream sourceTokenStream) { * This is intended for subclasses that wish to customize the generated queries. * @return new Query instance */ - protected Query newGraphSynonymQuery(Iterator queries) { + protected Query newGraphSynonymQuery(Iterator queries, Iterator termPayload) { BooleanQuery.Builder builder = new BooleanQuery.Builder(); while (queries.hasNext()) { builder.add(queries.next(), BooleanClause.Occur.SHOULD); diff --git a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java index b2b530d93afb..ef5caec40bdb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java +++ b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java @@ -26,12 +26,14 @@ import java.util.List; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PayloadAttributeImpl; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import 
org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.FiniteStringsIterator; @@ -123,7 +125,7 @@ public Term[] getTerms(String field, int state) { .map(s -> new Term(field, s.addAttribute(TermToBytesRefAttribute.class).getBytesRef())) .toArray(Term[]::new); } - + /** * Get all finite strings from the automaton. */ diff --git a/solr/core/src/java/org/apache/solr/parser/QueryParser.java b/solr/core/src/java/org/apache/solr/parser/QueryParser.java index 518cdefaa791..f95204b0dfa5 100644 --- a/solr/core/src/java/org/apache/solr/parser/QueryParser.java +++ b/solr/core/src/java/org/apache/solr/parser/QueryParser.java @@ -52,14 +52,14 @@ private static boolean allowedPostMultiTerm(int tokenKind) { @Override protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, - boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries, + boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries, boolean fieldSynonymsBoostByPayload, SynonymQueryStyle synonymQueryStyle) throws SyntaxError { setAutoGenerateMultiTermSynonymsPhraseQuery(fieldAutoGenPhraseQueries || getAutoGeneratePhraseQueries()); // Don't auto-quote graph-aware field queries boolean treatAsQuoted = getSplitOnWhitespace() ? 
(quoted || fieldAutoGenPhraseQueries || getAutoGeneratePhraseQueries()) : quoted; - return super.newFieldQuery(analyzer, field, queryText, treatAsQuoted, false, fieldEnableGraphQueries, synonymQueryStyle); + return super.newFieldQuery(analyzer, field, queryText, treatAsQuoted, false, fieldEnableGraphQueries, fieldSynonymsBoostByPayload, synonymQueryStyle); } // * Query ::= ( Clause )* diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 016f3b7ba46f..1fd60a438460 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -105,7 +105,7 @@ public abstract class SolrQueryParserBase extends QueryBuilder { * {@link #PICK_BEST} * {@link #AS_DISTINCT_TERMS} */ - public enum SynonymQueryStyle { + public static enum SynonymQueryStyle { /** (default) synonym terms share doc freq * so if "pants" has df 500, and "khakis" a df of 50, uses 500 df when scoring both terms * appropriate for exact synonyms @@ -119,24 +119,13 @@ public enum SynonymQueryStyle { * */ PICK_BEST, - /** - * each synonym scored indepedently, then added together (ie boolean query) - * so if "pants" has df 500, and "khakis" a df of 50, khakis matches are scored higher but - * summed with any "pants" matches - * appropriate when more specific synonyms should score higher, but we don't want to ignore - * less specific synonyms - */ - AS_DISTINCT_TERMS; - - private boolean boostByPayload; - - public boolean isBoostByPayload() { - return boostByPayload; - } - - public void setBoostByPayload(boolean boostByPayload) { - this.boostByPayload = boostByPayload; - } + /** each synonym scored indepedently, then added together (ie boolean query) + * so if "pants" has df 500, and "khakis" a df of 50, khakis matches are scored higher but + * summed with any "pants" matches + * appropriate when more specific synonyms should 
score higher, but we don't want to ignore + * less specific synonyms + * */ + AS_DISTINCT_TERMS } // make it possible to call setDefaultOperator() without accessing @@ -542,14 +531,17 @@ protected void addMultiTermClause(List clauses, Query q) { protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries, + boolean synonymsBoostByPayload, SynonymQueryStyle synonymQueryStyle) throws SyntaxError { BooleanClause.Occur occur = operator == Operator.AND ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD; setEnableGraphQueries(fieldEnableGraphQueries); + setSynonymsBoostByPayload(synonymsBoostByPayload); setSynonymQueryStyle(synonymQueryStyle); Query query = createFieldQuery(analyzer, occur, field, queryText, quoted || fieldAutoGenPhraseQueries || autoGeneratePhraseQueries, phraseSlop); setEnableGraphQueries(true); // reset back to default + setSynonymsBoostByPayload(false); // reset back to default setSynonymQueryStyle(AS_SAME_TERM); return query; } @@ -624,47 +616,43 @@ protected Query newRegexpQuery(Term regexp) { } @Override - protected Query newSynonymQuery(Term[] terms, TokenStream sourceTokenStream) { + protected Query newSynonymQuery(Term[] terms, BytesRef[] payloads) { switch (synonymQueryStyle) { case PICK_BEST: - return getDisjunctionSynonymQuery(terms, sourceTokenStream, synonymQueryStyle.isBoostByPayload()); + return getDisjunctionSynonymQuery(terms, payloads, super.synonymsBoostByPayload); case AS_DISTINCT_TERMS: - return getBooleanSynonymQuery(terms, sourceTokenStream,synonymQueryStyle.isBoostByPayload()); + return getBooleanSynonymQuery(terms, payloads,super.synonymsBoostByPayload); case AS_SAME_TERM: - return super.newSynonymQuery(terms, sourceTokenStream); + return super.newSynonymQuery(terms, payloads); default: throw new AssertionError("unrecognized synonymQueryStyle passed when creating newSynonymQuery"); } } - private Query 
getBooleanSynonymQuery(Term[] terms, TokenStream sourceTokenStream, boolean payloadBoost) { + private Query getBooleanSynonymQuery(Term[] terms, BytesRef[] payloads, boolean payloadBoost) { BooleanQuery.Builder builder; builder = new BooleanQuery.Builder(); - List synonymQueries = getSynonymQueries(terms, sourceTokenStream, payloadBoost); + List synonymQueries = getSynonymQueries(terms, payloads, payloadBoost); for(Query synonymQuery:synonymQueries){ builder.add(synonymQuery,BooleanClause.Occur.SHOULD); } return builder.build(); } - private Query getDisjunctionSynonymQuery(Term[] terms, TokenStream sourceTokenStream, boolean payloadBoost) { - List synonymQueries = getSynonymQueries(terms, sourceTokenStream, payloadBoost); + private Query getDisjunctionSynonymQuery(Term[] terms, BytesRef[] payloads, boolean payloadBoost) { + List synonymQueries = getSynonymQueries(terms, payloads, payloadBoost); DisjunctionMaxQuery synonymQuery; synonymQuery = new DisjunctionMaxQuery(synonymQueries, 0.0f); return synonymQuery; } - private List getSynonymQueries(Term[] terms, TokenStream sourceTokenStream, boolean payloadBoost) { + private List getSynonymQueries(Term[] terms, BytesRef[] payloads, boolean payloadBoost) { List synonymQueries = new ArrayList<>(terms.length); - BytesRef[] currentStreamPayloads = null; - if (payloadBoost) { - currentStreamPayloads = this.getPayloadsFromStream(sourceTokenStream); - } BytesRef termPayload = null; for (int i = 0; i < terms.length; i++) { Term currentTerm = terms[i]; if (payloadBoost) { - termPayload = currentStreamPayloads[i]; + termPayload = payloads[i]; } if (termPayload != null) { float decodedPayload = PayloadHelper.decodeFloat(termPayload.bytes, termPayload.offset); @@ -684,18 +672,17 @@ private List getSynonymQueries(Term[] terms, TokenStream sourceTokenStrea * @return new Query instance */ @Override - protected Query newGraphSynonymQuery(Iterator sidePaths) { + protected Query newGraphSynonymQuery(Iterator sidePaths, Iterator payloads 
) { switch (synonymQueryStyle) { case PICK_BEST:{ - List boostedSidePaths = boostQueriesByPayload(sidePaths, sidePathsPayloads); - DisjunctionMaxQuery graphSynonymQuery = new DisjunctionMaxQuery(boostedSidePaths, 0.0f); + List sidePathsQueries = boostQueriesByPayload(sidePaths, payloads); + DisjunctionMaxQuery graphSynonymQuery = new DisjunctionMaxQuery(sidePathsQueries, 0.0f); return graphSynonymQuery;} - case AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD:{ - Iterator sidePathsPayloads = this.extractPayloadsFromTokenStreams(sidePathsTokenStreams); - List boostedSidePaths = boostQueriesByPayload(sidePaths, sidePathsPayloads); + case AS_DISTINCT_TERMS:{ + List sidePathsQueries = boostQueriesByPayload(sidePaths, payloads); BooleanQuery.Builder builder = new BooleanQuery.Builder(); - for (Query boostedSidePath : boostedSidePaths) { - builder.add(boostedSidePath, BooleanClause.Occur.SHOULD); + for (Query sidePath : sidePathsQueries) { + builder.add(sidePath, BooleanClause.Occur.SHOULD); } BooleanQuery graphBooleanSynonymQuery = builder.build(); if (graphBooleanSynonymQuery.clauses().size() == 1) { @@ -703,57 +690,27 @@ protected Query newGraphSynonymQuery(Iterator sidePaths) { } return graphBooleanSynonymQuery;} default: - return super.newGraphSynonymQuery(sidePaths, sidePathsTokenStreams); - } - } - - private Iterator extractPayloadsFromTokenStreams(Iterator sidePathsTokenStreams) { - Iterator sidePathsPayloads = new Iterator() { - @Override - public boolean hasNext() { - return sidePathsTokenStreams.hasNext(); - } - - @Override - public BytesRef[] next() { - TokenStream sidePath = sidePathsTokenStreams.next(); - return getPayloadsFromStream(sidePath); - } - }; - return sidePathsPayloads; - } - - protected BytesRef[] getPayloadsFromStream(TokenStream source) { - try (CachingTokenFilter stream = new CachingTokenFilter(source)) { - PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class); - stream.reset(); - List payloads = new ArrayList<>(); - while 
(stream.incrementToken()) { - if (payloadAtt != null) { - payloads.add(payloadAtt.getPayload()); - } - } - stream.end(); - stream.close(); - return payloads.toArray(new BytesRef[payloads.size()]); - } catch (IOException e) { - throw new RuntimeException("Error analyzing query text", e); + return super.newGraphSynonymQuery(sidePaths, payloads); } } private List boostQueriesByPayload(Iterator sidePaths, Iterator sidePathsPayloads) { - List boostedSidePaths = new LinkedList<>(); + List resultSidePaths = new LinkedList<>(); while (sidePaths.hasNext()) { Query sidePath = sidePaths.next(); - BytesRef[] sidePathPayloads = sidePathsPayloads.next(); - float overallQueryPayload = extractQueryPayload(sidePathPayloads); - if (overallQueryPayload != 0) { - boostedSidePaths.add(new BoostQuery(sidePath, overallQueryPayload)); + if (super.synonymsBoostByPayload) { + BytesRef[] sidePathPayloads = sidePathsPayloads.next(); + float overallQueryPayload = extractQueryPayload(sidePathPayloads); + if (overallQueryPayload != 0) { + resultSidePaths.add(new BoostQuery(sidePath, overallQueryPayload)); + } else { + resultSidePaths.add(sidePath); + } } else { - boostedSidePaths.add(sidePath); + resultSidePaths.add(sidePath); } } - return boostedSidePaths; + return resultSidePaths; } /*Current assumption is that the user will associate a single payload to the multi terms synonym @@ -780,7 +737,7 @@ private float extractQueryPayload(BytesRef[] payloadsForQueryTerms) { */ @Override protected SpanQuery createSpanQuery(TokenStream source, String field) throws IOException { - if (synonymQueryStyle == SynonymQueryStyle.PICK_BEST_BOOST_BY_PAYLOAD || synonymQueryStyle == SynonymQueryStyle.AS_DISTINCT_TERMS_BOOST_BY_PAYLOAD) { + if (super.synonymsBoostByPayload) { try (CachingTokenFilter stream = new CachingTokenFilter(source)) { TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PayloadAttribute payloadAttribute = stream.getAttribute(PayloadAttribute.class); @@ -950,6 
+907,7 @@ protected Query getBooleanQuery(List clauses) throws SyntaxError if (ft.isTokenized() && sfield.indexed()) { boolean fieldAutoGenPhraseQueries = ft instanceof TextField && ((TextField)ft).getAutoGeneratePhraseQueries(); boolean fieldEnableGraphQueries = ft instanceof TextField && ((TextField)ft).getEnableGraphQueries(); + boolean fieldSynonymsBoostByPayload = ft instanceof TextField && ((TextField)ft).getSynonymBoostByPayload(); SynonymQueryStyle synonymQueryStyle = AS_SAME_TERM; if (ft instanceof TextField) { @@ -957,7 +915,7 @@ protected Query getBooleanQuery(List clauses) throws SyntaxError } subq = newFieldQuery(getAnalyzer(), sfield.getName(), rawq.getJoinedExternalVal(), - false, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, synonymQueryStyle); + false, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, fieldSynonymsBoostByPayload, synonymQueryStyle); booleanBuilder.add(subq, BooleanClause.Occur.SHOULD); } else { for (String externalVal : rawq.getExternalVals()) { @@ -1274,11 +1232,12 @@ protected Query getFieldQuery(String field, String queryText, boolean quoted, bo if (ft.isTokenized() && sf.indexed()) { boolean fieldAutoGenPhraseQueries = ft instanceof TextField && ((TextField)ft).getAutoGeneratePhraseQueries(); boolean fieldEnableGraphQueries = ft instanceof TextField && ((TextField)ft).getEnableGraphQueries(); + boolean fieldSynonymsBoostByPayload = ft instanceof TextField && ((TextField)ft).getSynonymBoostByPayload(); SynonymQueryStyle synonymQueryStyle = AS_SAME_TERM; if (ft instanceof TextField) { synonymQueryStyle = ((TextField)(ft)).getSynonymQueryStyle(); } - return newFieldQuery(getAnalyzer(), field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, synonymQueryStyle); + return newFieldQuery(getAnalyzer(), field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries,fieldSynonymsBoostByPayload, synonymQueryStyle); } else { if (raw) { return new RawQuery(sf, queryText); @@ -1289,7 +1248,7 @@ 
protected Query getFieldQuery(String field, String queryText, boolean quoted, bo } // default to a normal field query - return newFieldQuery(getAnalyzer(), field, queryText, quoted, false, true, AS_SAME_TERM); + return newFieldQuery(getAnalyzer(), field, queryText, quoted, false, true,false, AS_SAME_TERM); } // Assumption: quoted is always false @@ -1323,12 +1282,13 @@ protected Query getFieldQuery(String field, List queryTerms, boolean raw String queryText = queryTerms.size() == 1 ? queryTerms.get(0) : String.join(" ", queryTerms); boolean fieldAutoGenPhraseQueries = ft instanceof TextField && ((TextField)ft).getAutoGeneratePhraseQueries(); boolean fieldEnableGraphQueries = ft instanceof TextField && ((TextField)ft).getEnableGraphQueries(); + boolean fieldSynonymsBoostByPayload = ft instanceof TextField && ((TextField)ft).getSynonymBoostByPayload(); SynonymQueryStyle synonymQueryStyle = AS_SAME_TERM; if (ft instanceof TextField) { synonymQueryStyle = ((TextField)(ft)).getSynonymQueryStyle(); } return newFieldQuery - (getAnalyzer(), field, queryText, false, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, synonymQueryStyle); + (getAnalyzer(), field, queryText, false, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, fieldSynonymsBoostByPayload, synonymQueryStyle); } else { if (raw) { return new RawQuery(sf, queryTerms); @@ -1360,7 +1320,7 @@ protected Query getFieldQuery(String field, List queryTerms, boolean raw // default to a normal field query String queryText = queryTerms.size() == 1 ? 
queryTerms.get(0) : String.join(" ", queryTerms); - return newFieldQuery(getAnalyzer(), field, queryText, false, false, true, AS_SAME_TERM); + return newFieldQuery(getAnalyzer(), field, queryText, false, false, true, false, AS_SAME_TERM); } protected boolean isRangeShouldBeProtectedFromReverse(String field, String part1){ diff --git a/solr/core/src/java/org/apache/solr/schema/TextField.java b/solr/core/src/java/org/apache/solr/schema/TextField.java index 71164ac61fc7..daaea9334378 100644 --- a/solr/core/src/java/org/apache/solr/schema/TextField.java +++ b/solr/core/src/java/org/apache/solr/schema/TextField.java @@ -43,6 +43,7 @@ public class TextField extends FieldType { protected boolean autoGeneratePhraseQueries; protected boolean enableGraphQueries; + protected boolean synonymBoostByPayload; protected SolrQueryParserBase.SynonymQueryStyle synonymQueryStyle; /** @@ -91,7 +92,7 @@ protected void init(IndexSchema schema, Map args) { String boostByPayloadStr = args.remove(BOOST_BY_PAYLOAD); if (boostByPayloadStr != null) boostByPayload = Boolean.parseBoolean(boostByPayloadStr); - this.synonymQueryStyle.setBoostByPayload(boostByPayload); + this.synonymBoostByPayload = boostByPayload; super.init(schema, args); } @@ -119,6 +120,10 @@ public boolean getEnableGraphQueries() { return enableGraphQueries; } + public boolean getSynonymBoostByPayload() { + return synonymBoostByPayload; + } + public SolrQueryParserBase.SynonymQueryStyle getSynonymQueryStyle() {return synonymQueryStyle;} @Override diff --git a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java index de5700d9bd25..81726d808b9f 100644 --- a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java +++ b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java @@ -1085,7 +1085,7 @@ protected Query getPrefixQuery(String field, String val) throws SyntaxError { @Override protected Query 
newFieldQuery(Analyzer analyzer, String field, String queryText, - boolean quoted, boolean fieldAutoGenPhraseQueries, boolean enableGraphQueries, + boolean quoted, boolean fieldAutoGenPhraseQueries, boolean enableGraphQueries, boolean fieldSynonymsBoostByPayload, SynonymQueryStyle synonymQueryStyle) throws SyntaxError { Analyzer actualAnalyzer; @@ -1100,7 +1100,7 @@ protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, } else { actualAnalyzer = parser.getReq().getSchema().getFieldType(field).getQueryAnalyzer(); } - return super.newFieldQuery(actualAnalyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, enableGraphQueries, synonymQueryStyle); + return super.newFieldQuery(actualAnalyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, enableGraphQueries, fieldSynonymsBoostByPayload, synonymQueryStyle); } @Override diff --git a/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java b/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java index 1f0b5690b90e..cd4554c6fc81 100644 --- a/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java @@ -2081,10 +2081,10 @@ protected Query getFieldQuery(String field, @Override protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted, boolean fieldAutoGenPhraseQueries, - boolean fieldEnableGraphQueries, SynonymQueryStyle synonymQueryStyle) + boolean fieldEnableGraphQueries, boolean fieldSynonymsBoostByPayload, SynonymQueryStyle synonymQueryStyle) throws SyntaxError { Query q = super.newFieldQuery - (analyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, synonymQueryStyle); + (analyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, fieldSynonymsBoostByPayload, synonymQueryStyle); if (q instanceof BooleanQuery) { boolean rewrittenSubQ = false; // dirty flag: rebuild 
the repacked query? BooleanQuery.Builder builder = newBooleanQuery(); From f5567a6c5612cc5f8c23b987e9f932238f09c111 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Fri, 24 Jan 2020 20:00:29 +0000 Subject: [PATCH 12/36] [SOLR-12238] re-design --- .../src/java/org/apache/solr/parser/SolrQueryParserBase.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 1fd60a438460..567018d86592 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -673,6 +673,7 @@ private List getSynonymQueries(Term[] terms, BytesRef[] payloads, boolean */ @Override protected Query newGraphSynonymQuery(Iterator sidePaths, Iterator payloads ) { + if(super.synonymsBoostByPayload){ switch (synonymQueryStyle) { case PICK_BEST:{ List sidePathsQueries = boostQueriesByPayload(sidePaths, payloads); @@ -689,9 +690,9 @@ protected Query newGraphSynonymQuery(Iterator sidePaths, Iterator boostQueriesByPayload(Iterator sidePaths, Iterator sidePathsPayloads) { From 89e55d15355714d20327df37d24fd38476c4d927 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Sat, 25 Jan 2020 10:56:03 +0000 Subject: [PATCH 13/36] [SOLR-12238] re-design and refinement --- .../org/apache/lucene/util/QueryBuilder.java | 160 +++++++++++----- .../solr/parser/SolrQueryParserBase.java | 73 ------- .../solr/collection1/conf/synonyms.txt | 12 +- .../solr/search/TestSolrQueryParser.java | 179 +++++++++--------- 4 files changed, 209 insertions(+), 215 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index 4e92cfb421bc..f83f6c6835dc 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ 
b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -31,12 +31,14 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanBoostQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; @@ -218,14 +220,6 @@ public void setAutoGenerateMultiTermSynonymsPhraseQuery(boolean enable) { this.autoGenerateMultiTermSynonymsPhraseQuery = enable; } - /** - * Returns true if synonyms should be automatically boosted by their payload. - * @see #setAutoGenerateMultiTermSynonymsPhraseQuery(boolean) - */ - public boolean getSynonymsBoostByPayload() { - return synonymsBoostByPayload; - } - /** * Set to true if synonyms should be automatically boosted by their payload. * Default: false. @@ -256,7 +250,7 @@ protected Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator // Use the analyzer to get all the tokens, and then build an appropriate // query based on the analysis chain. try (TokenStream source = analyzer.tokenStream(field, queryText)) { - return createFieldQuery(source, operator, field,quoted, phraseSlop); + return createFieldQuery(source, operator, field, quoted, phraseSlop); } catch (IOException e) { throw new RuntimeException("Error analyzing query text", e); } @@ -290,18 +284,18 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato // Build an appropriate query based on the analysis chain. 
try (CachingTokenFilter stream = new CachingTokenFilter(source)) { - + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class); PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class); if (termAtt == null) { - return null; + return null; } - + // phase 1: read through the stream and assess the situation: // counting the number of tokens/positions and marking if we have any synonyms. - + int numTokens = 0; int positionCount = 0; boolean hasSynonyms = false; @@ -322,10 +316,10 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato isGraph = true; } } - + // phase 2: based on token count, presence of synonyms, and options // formulate a single term, boolean, or phrase. - + if (numTokens == 0) { return null; } else if (numTokens == 1) { @@ -361,31 +355,84 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato throw new RuntimeException("Error analyzing query text", e); } } - + /** * Creates a span query from the tokenstream. In the case of a single token, a simple SpanTermQuery is * returned. When multiple tokens, an ordered SpanNearQuery with slop 0 is returned. 
*/ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOException { + PayloadAttribute payloadAttribute = null; + if(synonymsBoostByPayload){ + payloadAttribute = in.getAttribute(PayloadAttribute.class); + } TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class); if (termAtt == null) { return null; } List terms = new ArrayList<>(); + List payloads = new ArrayList<>(); while (in.incrementToken()) { terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef()))); + if(payloadAttribute!=null){ + payloads.add(payloadAttribute.getPayload()); + } + } + in.end(); + in.close(); + + BytesRef[] queryPayloadsArray = payloads.toArray(new BytesRef[payloads.size()]); + float queryPayloadBoost = 0; + if (!payloads.isEmpty()) { + queryPayloadBoost = extractQueryPayload(queryPayloadsArray); } if (terms.isEmpty()) { return null; } else if (terms.size() == 1) { - return terms.get(0); + SpanTermQuery singleTermQuery = terms.get(0); + if (queryPayloadBoost != 0) { + return new SpanBoostQuery(singleTermQuery, queryPayloadBoost); + } else { + return singleTermQuery; + } } else { - return new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true); + SpanNearQuery multiTermQuery = new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true); + if (queryPayloadBoost != 0) { + return new SpanBoostQuery(multiTermQuery, queryPayloadBoost); + } else { + return multiTermQuery; + } } } + /*Current assumption is that the user will associate a single payload to the multi terms synonym + * that generated the phrase query, so a valid value for the payload associated to the query is just the first not null payload + * e.g. 
+ * lion => panthera leo|0.99 + * "panthera leo" query will have associated Payloads [null,0.99] + * So the payload associated to the query will be 0.99 which is the first not null + * */ + protected float extractQueryPayload(BytesRef[] payloadsForQueryTerms) { + for (BytesRef singlePayload : payloadsForQueryTerms) { + if (singlePayload != null) { + float decodedPayload = decodeFloat(singlePayload.bytes, singlePayload.offset); + return decodedPayload; + } + } + return 0; + } + + public static final float decodeFloat(byte [] bytes, int offset){ + + return Float.intBitsToFloat(decodeInt(bytes, offset)); + } + + public static final int decodeInt(byte [] bytes, int offset){ + return ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) + | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF); + } + /** * Creates simple term query from the cached tokenstream contents */ @@ -454,9 +501,10 @@ protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanCla if (posIncrAtt.getPositionIncrement() != 0) { add(q, currentQuery, currentPayload, operator); currentQuery.clear(); + currentPayload.clear(); } currentQuery.add(new Term(field, termAtt.getBytesRef())); - if(payloadAtt!=null){ + if(payloadAtt != null){ currentPayload.add(payloadAtt.getPayload()); } } @@ -651,20 +699,30 @@ protected Query analyzeGraphPhrase(TokenStream source, String field, int phraseS end = articulationPoints[i]; } lastState = end; - final SpanQuery queryClause; + final SpanQuery queryPos; if (graph.hasSidePath(start)) { - Iterator sidePaths = graph.getFiniteStrings(start, end); - List sidePathQueries = newGraphSpanQueries(field, sidePaths, maxClauseCount); - if (sidePathQueries.size() > 0) { - queryClause = new SpanOrQuery(sidePathQueries.toArray(new SpanQuery[0])); + List queries = new ArrayList<>(); + Iterator it = graph.getFiniteStrings(start, end); + while (it.hasNext()) { + TokenStream ts = it.next(); + SpanQuery q = createSpanQuery(ts, field); + if (q != 
null) { + if (queries.size() >= maxClauseCount) { + throw new IndexSearcher.TooManyClauses(); + } + queries.add(q); + } + } + if (queries.size() > 0) { + queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0])); } else { - queryClause = null; + queryPos = null; } } else { Term[] terms = graph.getTerms(field, start); assert terms.length > 0; if (terms.length == 1) { - queryClause = new SpanTermQuery(terms[0]); + queryPos = new SpanTermQuery(terms[0]); } else { if (terms.length >= maxClauseCount) { throw new IndexSearcher.TooManyClauses(); @@ -674,17 +732,18 @@ protected Query analyzeGraphPhrase(TokenStream source, String field, int phraseS orClauses[idx] = new SpanTermQuery(terms[idx]); } - queryClause = new SpanOrQuery(orClauses); + queryPos = new SpanOrQuery(orClauses); } } - if (queryClause != null) { + if (queryPos != null) { if (clauses.size() >= maxClauseCount) { throw new IndexSearcher.TooManyClauses(); } - clauses.add(queryClause); + clauses.add(queryPos); } } + if (clauses.isEmpty()) { return null; } else if (clauses.size() == 1) { @@ -694,21 +753,6 @@ protected Query analyzeGraphPhrase(TokenStream source, String field, int phraseS } } - protected List newGraphSpanQueries(String field, Iterator sidePaths, int maxClauseCount) throws IOException { - List queries = new ArrayList<>(); - while (sidePaths.hasNext()) { - TokenStream sidePath = sidePaths.next(); - SpanQuery sidePathQuery = createSpanQuery(sidePath, field); - if (sidePathQuery != null) { - if (queries.size() >= maxClauseCount) { - throw new IndexSearcher.TooManyClauses(); - } - queries.add(sidePathQuery); - } - } - return queries; - } - /** * Builds a new BooleanQuery instance. *

@@ -725,13 +769,25 @@ protected BooleanQuery.Builder newBooleanQuery() { * This is intended for subclasses that wish to customize the generated queries. * @return new Query instance */ - protected Query newSynonymQuery(Term[] terms, BytesRef[] termPayload) { + protected Query newSynonymQuery(Term[] terms, BytesRef[] termPayloads) { SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].field()); - for (Term term : terms) { - builder.addTerm(term); + for (int i = 0; i < terms.length; i++) { + float payloadBoost = 0f; + if (termPayloads.length == terms.length) { + if (termPayloads[i] != null) { + payloadBoost = decodeFloat(termPayloads[i].bytes, termPayloads[i].offset); + } + } + if (payloadBoost != 0) { + builder.addTerm(terms[i], payloadBoost); + } else { + builder.addTerm(terms[i]); + } } return builder.build(); } + + /** * Builds a new GraphQuery for multi-terms synonyms. @@ -742,7 +798,15 @@ protected Query newSynonymQuery(Term[] terms, BytesRef[] termPayload) { protected Query newGraphSynonymQuery(Iterator queries, Iterator termPayload) { BooleanQuery.Builder builder = new BooleanQuery.Builder(); while (queries.hasNext()) { - builder.add(queries.next(), BooleanClause.Occur.SHOULD); + Query next = queries.next(); + if (termPayload.hasNext()) { + BytesRef[] queryPayloads = termPayload.next(); + float payloadBoost = this.extractQueryPayload(queryPayloads); + if (payloadBoost != 0) { + next = new BoostQuery(next, payloadBoost); + } + } + builder.add(next, BooleanClause.Occur.SHOULD); } BooleanQuery bq = builder.build(); if (bq.clauses().size() == 1) { diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 567018d86592..591505aed40c 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -714,79 +714,6 @@ private List boostQueriesByPayload(Iterator sidePaths, 
Iterator panthera leo|0.99 - * "panthera leo" query will have associated Payloads [null,0.99] - * So the payload associated to the query will be 0.99 which is the first not null - * */ - private float extractQueryPayload(BytesRef[] payloadsForQueryTerms) { - for (BytesRef singlePayload : payloadsForQueryTerms) { - if (singlePayload != null) { - float decodedPayload = PayloadHelper.decodeFloat(singlePayload.bytes, singlePayload.offset); - return decodedPayload; - } - } - return 0; - } - - /** - * Creates a span query from the tokenstream. In the case of a single token, a simple SpanTermQuery is - * returned. When multiple tokens, an ordered SpanNearQuery with slop of 0 is returned. - * In case the synonym query style involves payload boosting a SpanBoostQuery is returned - */ - @Override - protected SpanQuery createSpanQuery(TokenStream source, String field) throws IOException { - if (super.synonymsBoostByPayload) { - try (CachingTokenFilter stream = new CachingTokenFilter(source)) { - TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); - PayloadAttribute payloadAttribute = stream.getAttribute(PayloadAttribute.class); - List terms = new ArrayList<>(); - List payloads = new ArrayList<>(); - - stream.reset(); - if (termAtt == null) { - return null; - } - while (stream.incrementToken()) { - terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef()))); - payloads.add(payloadAttribute.getPayload()); - } - stream.end(); - stream.close(); - - BytesRef[] queryPayloadsArray = payloads.toArray(new BytesRef[payloads.size()]); - float queryPayloadBoost = 0; - if (!payloads.isEmpty()) { - queryPayloadBoost = extractQueryPayload(queryPayloadsArray); - } - - if (terms.isEmpty()) { - return null; - } else if (terms.size() == 1) { - SpanTermQuery singleTermQuery = terms.get(0); - if (queryPayloadBoost != 0) { - return new SpanBoostQuery(singleTermQuery, queryPayloadBoost); - } else { - return singleTermQuery; - } - } else { - SpanNearQuery 
multiTermQuery = new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true); - if (queryPayloadBoost != 0) { - return new SpanBoostQuery(multiTermQuery, queryPayloadBoost); - } else { - return multiTermQuery; - } - } - } catch (IOException e) { - throw new RuntimeException("Error analyzing query text", e); - } - } else { - return super.createSpanQuery(source, field); - } - } - /** * Builds a new FuzzyQuery instance * @param term Term diff --git a/solr/core/src/test-files/solr/collection1/conf/synonyms.txt b/solr/core/src/test-files/solr/collection1/conf/synonyms.txt index 25466ca7b860..a71ddc3ba718 100644 --- a/solr/core/src/test-files/solr/collection1/conf/synonyms.txt +++ b/solr/core/src/test-files/solr/collection1/conf/synonyms.txt @@ -40,7 +40,11 @@ persian => persian, cat, feline, animal jeans, denim pants # Synonyms used by Payload Boost -tiger => tiger|1.0, Big_Cat|0.8, Shere_Khan|0.9 -leopard => leopard, Big_Cat|0.8, Bagheera|0.9 -lion => lion|1.0, panthera leo|0.99, Simba|0.8 -snow_leopard => panthera uncia|0.99, snow leopard|1.0 +tiger, tigre|0.9 +lynx => lince|0.8, lynx_canadensis|0.9 + +leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 +lion => panthera leo|0.9, simba leo|0.8, kimba|0.75 + +snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 +panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65 \ No newline at end of file diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java index 0e718c838465..2f1f138b24ea 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java @@ -1225,125 +1225,124 @@ public void testSynonymQueryStyle() throws Exception { } - public void testGraphSynonyms_singleTermSynonymsPayloadBoost_shouldParseBoostedQuery() throws Exception { + public void 
testSynonymsBoostByPayload_singleTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception { + //tiger, tigre|0.9 Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:tiger)^1.0 | (t_pick_best_boost_by_payload_foo:big_cat)^0.8 | (t_pick_best_boost_by_payload_foo:shere_khan)^0.9)", q.toString()); + assertEquals("((t_pick_best_boost_by_payload_foo:tigre)^0.9 | t_pick_best_boost_by_payload_foo:tiger)", q.toString()); q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("(t_as_distinct_boost_by_payload_foo:tiger)^1.0 (t_as_distinct_boost_by_payload_foo:big_cat)^0.8 (t_as_distinct_boost_by_payload_foo:shere_khan)^0.9", q.toString()); + assertEquals("(t_as_distinct_boost_by_payload_foo:tigre)^0.9 t_as_distinct_boost_by_payload_foo:tiger", q.toString()); - /*AutoGenerated queries follow the synonym query style approach */ - q = QParser.getParser("jeans", req(params("df", "t_pick_best_boost_by_payload_foo", "sow", "false"))).getQuery(); - assertEquals("(t_pick_best_boost_by_payload_foo:\"denim pants\" | t_pick_best_boost_by_payload_foo:jeans)", q.toString()); + //lynx => lince|0.8, lynx_canadensis|0.9 + q = QParser.getParser("lynx", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_pick_best_boost_by_payload_foo:lince)^0.8 | (t_pick_best_boost_by_payload_foo:lynx_canadensis)^0.9)", q.toString()); - q = QParser.getParser("jeans", req(params("df", "t_as_distinct_boost_by_payload_foo", "sow", "false"))).getQuery(); - assertEquals("(t_as_distinct_boost_by_payload_foo:\"denim pants\" t_as_distinct_boost_by_payload_foo:jeans)", q.toString()); + q = QParser.getParser("lynx", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); + assertEquals("(t_as_distinct_boost_by_payload_foo:lince)^0.8 (t_as_distinct_boost_by_payload_foo:lynx_canadensis)^0.9", 
q.toString()); } - public void testGraphSynonyms_multiTermSynonymsPayloadBoost_shouldParseBoostedPhraseQuery() throws Exception { - Query q = QParser.getParser("lion", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:lion)^1.0 | (t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.99 | (t_pick_best_boost_by_payload_foo:simba)^0.8)", q.toString()); + public void testSynonymsBoostByPayload_singleTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { + //leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 + Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_pick_best_boost_by_payload_foo:\"big cat\")^0.8 | (t_pick_best_boost_by_payload_foo:bagheera)^0.9 | (t_pick_best_boost_by_payload_foo:\"panthera pardus\")^0.85 | t_pick_best_boost_by_payload_foo:leopard)", q.toString()); - q = QParser.getParser("lion", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_as_distinct_boost_by_payload_foo:lion)^1.0 (t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.99 (t_as_distinct_boost_by_payload_foo:simba)^0.8)", q.toString()); - } + q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:bagheera)^0.9 (t_as_distinct_boost_by_payload_foo:\"panthera pardus\")^0.85 t_as_distinct_boost_by_payload_foo:leopard)", q.toString()); - public void testGraphSynonyms_multiTermQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedPhraseQuery() throws Exception { - Query q = QParser.getParser("snow_leopard lion", req(params("df", "t_pick_best_boost_by_payload_foo", "sow", "false"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:\"panthera uncia\")^0.99 | (t_pick_best_boost_by_payload_foo:\"snow leopard\")^1.0) " + - 
"((t_pick_best_boost_by_payload_foo:lion)^1.0 | (t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.99 | (t_pick_best_boost_by_payload_foo:simba)^0.8)", q.toString()); + //lion => panthera leo|0.9, simba leo|0.8, kimba|0.75 + q = QParser.getParser("lion", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.9 | (t_pick_best_boost_by_payload_foo:\"simba leo\")^0.8 | (t_pick_best_boost_by_payload_foo:kimba)^0.75)", q.toString()); - q = QParser.getParser("snow_leopard lion", req(params("df", "t_as_distinct_boost_by_payload_foo", "sow", "false"))).getQuery(); - assertEquals("((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.99 (t_as_distinct_boost_by_payload_foo:\"snow leopard\")^1.0) " + - "((t_as_distinct_boost_by_payload_foo:lion)^1.0 (t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.99 (t_as_distinct_boost_by_payload_foo:simba)^0.8)", q.toString()); + q = QParser.getParser("lion", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_distinct_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_distinct_boost_by_payload_foo:kimba)^0.75)", q.toString()); } - public void testGraphSynonyms_multiTermQueryMultiTermSynonymsEdismaxBoostAndPayloadBoost_shouldParseBoostedPhraseQuery() throws Exception { - Query q = QParser.getParser("snow_leopard lion","edismax",true, req(params("sow", "false","qf", "t_pick_best_boost_by_payload_foo^10"))).getQuery(); - assertEquals("+(" + - "((((t_pick_best_boost_by_payload_foo:\"panthera uncia\")^0.99 | (t_pick_best_boost_by_payload_foo:\"snow leopard\")^1.0))^10.0) " + - "((((t_pick_best_boost_by_payload_foo:lion)^1.0 | (t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.99 | (t_pick_best_boost_by_payload_foo:simba)^0.8))^10.0))", q.toString()); + public void testSynonymsBoostByPayload_multiTermQuerySingleTermSynonyms_shouldParseBoostedQuery() 
throws Exception { + //tiger, tigre|0.9 + //lynx => lince|0.8, lynx_canadensis|0.9 + Query q = QParser.getParser("tiger lynx", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_pick_best_boost_by_payload_foo:tigre)^0.9 | t_pick_best_boost_by_payload_foo:tiger)" + + " ((t_pick_best_boost_by_payload_foo:lince)^0.8 | (t_pick_best_boost_by_payload_foo:lynx_canadensis)^0.9)", q.toString()); - q = QParser.getParser("snow_leopard lion","edismax",true, req(params("sow", "false","qf", "t_as_distinct_boost_by_payload_foo^10"))).getQuery(); - assertEquals("+(" + - "(((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.99 (t_as_distinct_boost_by_payload_foo:\"snow leopard\")^1.0)^10.0) " + - "(((t_as_distinct_boost_by_payload_foo:lion)^1.0 (t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.99 (t_as_distinct_boost_by_payload_foo:simba)^0.8)^10.0))", q.toString()); + q = QParser.getParser("tiger lynx", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_as_distinct_boost_by_payload_foo:tigre)^0.9 t_as_distinct_boost_by_payload_foo:tiger)" + + " ((t_as_distinct_boost_by_payload_foo:lince)^0.8 (t_as_distinct_boost_by_payload_foo:lynx_canadensis)^0.9)", q.toString()); } - public void testGraphSynonyms_phraseQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedSpanQuery() throws Exception { - Query q = QParser.getParser("\"snow_leopard lion\"", req(params("df", "t_pick_best_boost_by_payload_foo", "sow", "false"))).getQuery(); - assertEquals("spanNear(" + - "[spanOr([" + - "(spanNear([t_pick_best_boost_by_payload_foo:panthera, t_pick_best_boost_by_payload_foo:uncia], 0, true))^0.99, " + - "(spanNear([t_pick_best_boost_by_payload_foo:snow, t_pick_best_boost_by_payload_foo:leopard], 0, true))^1.0]), " + - "spanOr([" + - "(t_pick_best_boost_by_payload_foo:lion)^1.0, " + - "(spanNear([t_pick_best_boost_by_payload_foo:panthera, t_pick_best_boost_by_payload_foo:leo], 0, true))^0.99, " + - 
"(t_pick_best_boost_by_payload_foo:simba)^0.8])]," + - " 0, true)", q.toString()); - - q = QParser.getParser("\"snow_leopard lion\"", req(params("df", "t_as_distinct_boost_by_payload_foo", "sow", "false"))).getQuery(); - assertEquals("spanNear(" + - "[spanOr([" + - "(spanNear([t_as_distinct_boost_by_payload_foo:panthera, t_as_distinct_boost_by_payload_foo:uncia], 0, true))^0.99, " + - "(spanNear([t_as_distinct_boost_by_payload_foo:snow, t_as_distinct_boost_by_payload_foo:leopard], 0, true))^1.0]), " + - "spanOr([" + - "(t_as_distinct_boost_by_payload_foo:lion)^1.0, " + - "(spanNear([t_as_distinct_boost_by_payload_foo:panthera, t_as_distinct_boost_by_payload_foo:leo], 0, true))^0.99, " + - "(t_as_distinct_boost_by_payload_foo:simba)^0.8])]," + - " 0, true)", q.toString()); - } + public void testSynonymsBoostByPayload_multiTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { + //leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 + //lion => panthera leo|0.9, simba leo|0.8, kimba|0.75 + Query q = QParser.getParser("leopard lion", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_pick_best_boost_by_payload_foo:\"big cat\")^0.8 | (t_pick_best_boost_by_payload_foo:bagheera)^0.9 | (t_pick_best_boost_by_payload_foo:\"panthera pardus\")^0.85 | t_pick_best_boost_by_payload_foo:leopard)" + + " ((t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.9 | (t_pick_best_boost_by_payload_foo:\"simba leo\")^0.8 | (t_pick_best_boost_by_payload_foo:kimba)^0.75)", q.toString()); - /* If you have single terms synonyms, a flat token stream is still OK */ - public void testFlatSynonyms_singleTermSynonymsPayloadBoost_shouldParseBoostedQuery() throws Exception { - Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boost_by_payload_flat_foo"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_flat_foo:tiger)^1.0 | (t_pick_best_boost_by_payload_flat_foo:big_cat)^0.8 | 
(t_pick_best_boost_by_payload_flat_foo:shere_khan)^0.9)", q.toString()); + q = QParser.getParser("leopard lion", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:bagheera)^0.9 (t_as_distinct_boost_by_payload_foo:\"panthera pardus\")^0.85 t_as_distinct_boost_by_payload_foo:leopard)" + + " ((t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_distinct_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_distinct_boost_by_payload_foo:kimba)^0.75)", q.toString()); + } - q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boost_by_payload_flat_foo"))).getQuery(); - assertEquals("(t_as_distinct_boost_by_payload_flat_foo:tiger)^1.0 (t_as_distinct_boost_by_payload_flat_foo:big_cat)^0.8 (t_as_distinct_boost_by_payload_flat_foo:shere_khan)^0.9", q.toString()); + public void testSynonymsBoostByPayload_singleConceptQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { + //snow leopard|1.0, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 + Query q = QParser.getParser("snow leopard",req(params("df", "t_pick_best_boost_by_payload_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boost_by_payload_foo:\"panthera uncia\")^0.9 | (t_pick_best_boost_by_payload_foo:\"big cat\")^0.8 | (t_pick_best_boost_by_payload_foo:white_leopard)^0.6 | t_pick_best_boost_by_payload_foo:\"snow leopard\")", q.toString()); + + q = QParser.getParser("snow leopard", req(params("df", "t_as_distinct_boost_by_payload_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:white_leopard)^0.6 t_as_distinct_boost_by_payload_foo:\"snow leopard\")", q.toString()); - /*confirm autoGeneratePhraseQueries builds disjunction queries or boolean queries accordingly*/ - q = QParser.getParser("jeans", 
req(params("df", "t_pick_best_boost_by_payload_flat_foo", "sow", "false"))).getQuery(); - assertEquals("(t_pick_best_boost_by_payload_flat_foo:\"denim pants\" | t_pick_best_boost_by_payload_flat_foo:jeans)", q.toString()); + //panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65 + q = QParser.getParser("panthera onca", req(params("df", "t_pick_best_boost_by_payload_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boost_by_payload_foo:jaguar)^0.95 | (t_pick_best_boost_by_payload_foo:\"big cat\")^0.85 | (t_pick_best_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); - q = QParser.getParser("jeans", req(params("df", "t_as_distinct_boost_by_payload_flat_foo", "sow", "false"))).getQuery(); - assertEquals("(t_as_distinct_boost_by_payload_flat_foo:\"denim pants\" t_as_distinct_boost_by_payload_flat_foo:jeans)", q.toString()); + q = QParser.getParser("panthera onca", req(params("df", "t_as_distinct_boost_by_payload_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boost_by_payload_foo:jaguar)^0.95 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.85 (t_as_distinct_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); } - /* If you have multi term synonyms a flat token stream is not going to produce the ideal query, this test check it builds the best it can do*/ - public void testFlatSynonyms_multiTermQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedQuery() throws Exception { - Query q = QParser.getParser("snow_leopard lion", req(params("df", "t_pick_best_boost_by_payload_flat_foo", "sow", "false"))).getQuery(); - assertEquals("(t_pick_best_boost_by_payload_flat_foo:panthera | t_pick_best_boost_by_payload_flat_foo:snow) " + - "(t_pick_best_boost_by_payload_flat_foo:uncia | t_pick_best_boost_by_payload_flat_foo:leopard) " + - "((t_pick_best_boost_by_payload_flat_foo:lion)^1.0 | (t_pick_best_boost_by_payload_flat_foo:\"panthera leo\")^0.99 | (t_pick_best_boost_by_payload_flat_foo:simba)^0.8)", q.toString()); 
- - q = QParser.getParser("snow_leopard lion", req(params("df", "t_as_distinct_boost_by_payload_flat_foo", "sow", "false"))).getQuery(); - assertEquals("(t_as_distinct_boost_by_payload_flat_foo:panthera t_as_distinct_boost_by_payload_flat_foo:snow) " + - "(t_as_distinct_boost_by_payload_flat_foo:uncia t_as_distinct_boost_by_payload_flat_foo:leopard) " + - "((t_as_distinct_boost_by_payload_flat_foo:lion)^1.0 (t_as_distinct_boost_by_payload_flat_foo:\"panthera leo\")^0.99 (t_as_distinct_boost_by_payload_flat_foo:simba)^0.8)", q.toString()); - } + public void testSynonymsBoostByPayload_multiConceptsQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { + //snow leopard|1.0, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 + //panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65 + Query q = QParser.getParser("snow leopard panthera onca",req(params("df", "t_pick_best_boost_by_payload_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boost_by_payload_foo:\"panthera uncia\")^0.9 | (t_pick_best_boost_by_payload_foo:\"big cat\")^0.8 | (t_pick_best_boost_by_payload_foo:white_leopard)^0.6 | t_pick_best_boost_by_payload_foo:\"snow leopard\")" + + " ((t_pick_best_boost_by_payload_foo:jaguar)^0.95 | (t_pick_best_boost_by_payload_foo:\"big cat\")^0.85 | (t_pick_best_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); + + q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_distinct_boost_by_payload_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:white_leopard)^0.6 t_as_distinct_boost_by_payload_foo:\"snow leopard\")" + + " ((t_as_distinct_boost_by_payload_foo:jaguar)^0.95 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.85 (t_as_distinct_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); + } + + public void 
testSynonymsBoostByPayload_edismaxBoost_shouldParseBoostedPhraseQuery() throws Exception { + Query q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_pick_best_boost_by_payload_foo^10"))).getQuery(); + assertEquals("+(" + + "((((t_pick_best_boost_by_payload_foo:\"panthera uncia\")^0.9 | (t_pick_best_boost_by_payload_foo:\"big cat\")^0.8 | (t_pick_best_boost_by_payload_foo:white_leopard)^0.6 | t_pick_best_boost_by_payload_foo:\"snow leopard\"))^10.0)" + + " ((((t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.9 | (t_pick_best_boost_by_payload_foo:\"simba leo\")^0.8 | (t_pick_best_boost_by_payload_foo:kimba)^0.75))^10.0)" + + ")", q.toString()); - /* If you have multi term synonyms a flat token stream is not going to produce the ideal query, , this test check it builds the best it can do*/ - public void testFlatSynonyms_phraseQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedSpanQuery() throws Exception { - Query q = QParser.getParser("\"snow_leopard lion\"", req(params("df", "t_pick_best_boost_by_payload_flat_foo", "sow", "false"))).getQuery(); - assertEquals("spanNear([" + - "spanOr([t_pick_best_boost_by_payload_flat_foo:panthera, t_pick_best_boost_by_payload_flat_foo:snow]), " + - "spanOr([t_pick_best_boost_by_payload_flat_foo:uncia, t_pick_best_boost_by_payload_flat_foo:leopard]), " + - "spanOr([(t_pick_best_boost_by_payload_flat_foo:lion)^1.0, (spanNear([t_pick_best_boost_by_payload_flat_foo:panthera, t_pick_best_boost_by_payload_flat_foo:leo], 0, true))^0.99, (t_pick_best_boost_by_payload_flat_foo:simba)^0.8])], 0, true)", q.toString()); + q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_as_distinct_boost_by_payload_foo^10"))).getQuery(); + assertEquals("+(" + + "(((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:white_leopard)^0.6 
t_as_distinct_boost_by_payload_foo:\"snow leopard\")^10.0)" + + " (((t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_distinct_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_distinct_boost_by_payload_foo:kimba)^0.75)^10.0))", q.toString()); + } - q = QParser.getParser("\"snow_leopard lion\"", req(params("df", "t_as_distinct_boost_by_payload_flat_foo", "sow", "false"))).getQuery(); + public void testSynonymsBoostByPayload_phraseQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedSpanQuery() throws Exception { + Query q = QParser.getParser("\"snow leopard lion\"", req(params("df", "t_pick_best_boost_by_payload_foo", "sow", "false"))).getQuery(); assertEquals("spanNear([" + - "spanOr([t_as_distinct_boost_by_payload_flat_foo:panthera, t_as_distinct_boost_by_payload_flat_foo:snow]), " + - "spanOr([t_as_distinct_boost_by_payload_flat_foo:uncia, t_as_distinct_boost_by_payload_flat_foo:leopard]), " + - "spanOr([(t_as_distinct_boost_by_payload_flat_foo:lion)^1.0, (spanNear([t_as_distinct_boost_by_payload_flat_foo:panthera, t_as_distinct_boost_by_payload_flat_foo:leo], 0, true))^0.99, (t_as_distinct_boost_by_payload_flat_foo:simba)^0.8])], 0, true)", q.toString()); + "spanOr([" + + "(spanNear([t_pick_best_boost_by_payload_foo:panthera, t_pick_best_boost_by_payload_foo:uncia], 0, true))^0.9," + + " (spanNear([t_pick_best_boost_by_payload_foo:big, t_pick_best_boost_by_payload_foo:cat], 0, true))^0.8," + + " (t_pick_best_boost_by_payload_foo:white_leopard)^0.6," + + " spanNear([t_pick_best_boost_by_payload_foo:snow, t_pick_best_boost_by_payload_foo:leopard], 0, true)])," + + " spanOr([" + + "(spanNear([t_pick_best_boost_by_payload_foo:panthera, t_pick_best_boost_by_payload_foo:leo], 0, true))^0.9," + + " (spanNear([t_pick_best_boost_by_payload_foo:simba, t_pick_best_boost_by_payload_foo:leo], 0, true))^0.8," + + " (t_pick_best_boost_by_payload_foo:kimba)^0.75])], 0, true)", q.toString()); } - public void 
testGraphSynonyms_PayloadBoostMissing_shouldAssignDefaultBoost() throws Exception { + public void testSynonymsBoostByPayload_PayloadBoostMissing_shouldAssignDefaultBoost() throws Exception { + //leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("(t_pick_best_boost_by_payload_foo:leopard | (t_pick_best_boost_by_payload_foo:big_cat)^0.8 | (t_pick_best_boost_by_payload_foo:bagheera)^0.9)", q.toString()); + assertEquals("((t_pick_best_boost_by_payload_foo:\"big cat\")^0.8 | (t_pick_best_boost_by_payload_foo:bagheera)^0.9 | (t_pick_best_boost_by_payload_foo:\"panthera pardus\")^0.85 | t_pick_best_boost_by_payload_foo:leopard)", q.toString()); q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("t_as_distinct_boost_by_payload_foo:leopard (t_as_distinct_boost_by_payload_foo:big_cat)^0.8 (t_as_distinct_boost_by_payload_foo:bagheera)^0.9", q.toString()); + assertEquals("((t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:bagheera)^0.9 (t_as_distinct_boost_by_payload_foo:\"panthera pardus\")^0.85 t_as_distinct_boost_by_payload_foo:leopard)", q.toString()); } @Test From 5321a93a95b6e878a2e957402f857ceb8caa468f Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Sat, 25 Jan 2020 12:42:56 +0000 Subject: [PATCH 14/36] [SOLR-12238] re-design and refinement --- .../solr/parser/SolrQueryParserBase.java | 111 +++++++----------- 1 file changed, 45 insertions(+), 66 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 591505aed40c..84f2c0690948 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -16,7 +16,6 @@ */ package 
org.apache.solr.parser; -import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Collections; @@ -29,12 +28,7 @@ import java.util.stream.Collectors; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.CachingTokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.payloads.PayloadHelper; import org.apache.lucene.analysis.reverse.ReverseStringFilter; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.index.Term; import org.apache.lucene.search.AutomatonQuery; @@ -52,12 +46,9 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.search.spans.SpanBoostQuery; -import org.apache.lucene.search.spans.SpanNearQuery; -import org.apache.lucene.search.spans.SpanQuery; -import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.util.QueryBuilder; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; @@ -615,13 +606,25 @@ protected Query newRegexpQuery(Term regexp) { return query; } + private Query buildBooleanQuery(List sidePathsQueries) { + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (Query sidePath : sidePathsQueries) { + builder.add(sidePath, BooleanClause.Occur.SHOULD); + } + return builder.build(); + } + @Override protected Query newSynonymQuery(Term[] terms, BytesRef[] payloads) { switch (synonymQueryStyle) { - case PICK_BEST: - return getDisjunctionSynonymQuery(terms, payloads, super.synonymsBoostByPayload); - case AS_DISTINCT_TERMS: - return 
getBooleanSynonymQuery(terms, payloads,super.synonymsBoostByPayload); + case PICK_BEST: { + List synonymQueries = getSynonymQueries(terms, payloads); + return new DisjunctionMaxQuery(synonymQueries, 0.0f); + } + case AS_DISTINCT_TERMS: { + List synonymQueries = getSynonymQueries(terms, payloads); + return buildBooleanQuery(synonymQueries); + } case AS_SAME_TERM: return super.newSynonymQuery(terms, payloads); default: @@ -629,37 +632,20 @@ protected Query newSynonymQuery(Term[] terms, BytesRef[] payloads) { } } - private Query getBooleanSynonymQuery(Term[] terms, BytesRef[] payloads, boolean payloadBoost) { - BooleanQuery.Builder builder; - builder = new BooleanQuery.Builder(); - List synonymQueries = getSynonymQueries(terms, payloads, payloadBoost); - for(Query synonymQuery:synonymQueries){ - builder.add(synonymQuery,BooleanClause.Occur.SHOULD); - } - return builder.build(); - } - - private Query getDisjunctionSynonymQuery(Term[] terms, BytesRef[] payloads, boolean payloadBoost) { - List synonymQueries = getSynonymQueries(terms, payloads, payloadBoost); - DisjunctionMaxQuery synonymQuery; - synonymQuery = new DisjunctionMaxQuery(synonymQueries, 0.0f); - return synonymQuery; - } - - private List getSynonymQueries(Term[] terms, BytesRef[] payloads, boolean payloadBoost) { + private List getSynonymQueries(Term[] terms, BytesRef[] payloads) { List synonymQueries = new ArrayList<>(terms.length); - BytesRef termPayload = null; for (int i = 0; i < terms.length; i++) { - Term currentTerm = terms[i]; - if (payloadBoost) { - termPayload = payloads[i]; + float payloadBoost = 0f; + if (payloads.length == terms.length) { + if (payloads[i] != null) { + payloadBoost = decodeFloat(payloads[i].bytes, payloads[i].offset); + } } - if (termPayload != null) { - float decodedPayload = PayloadHelper.decodeFloat(termPayload.bytes, termPayload.offset); - synonymQueries.add(new BoostQuery(newTermQuery(currentTerm), decodedPayload)); - } else { - 
synonymQueries.add(newTermQuery(currentTerm)); + Query synonymQuery = new TermQuery(terms[i]); + if (payloadBoost != 0) { + synonymQuery = new BoostQuery(synonymQuery, payloadBoost); } + synonymQueries.add(synonymQuery); } return synonymQueries; } @@ -672,41 +658,34 @@ private List getSynonymQueries(Term[] terms, BytesRef[] payloads, boolean * @return new Query instance */ @Override - protected Query newGraphSynonymQuery(Iterator sidePaths, Iterator payloads ) { - if(super.synonymsBoostByPayload){ - switch (synonymQueryStyle) { - case PICK_BEST:{ - List sidePathsQueries = boostQueriesByPayload(sidePaths, payloads); - DisjunctionMaxQuery graphSynonymQuery = new DisjunctionMaxQuery(sidePathsQueries, 0.0f); - return graphSynonymQuery;} - case AS_DISTINCT_TERMS:{ - List sidePathsQueries = boostQueriesByPayload(sidePaths, payloads); - BooleanQuery.Builder builder = new BooleanQuery.Builder(); - for (Query sidePath : sidePathsQueries) { - builder.add(sidePath, BooleanClause.Occur.SHOULD); + protected Query newGraphSynonymQuery(Iterator sidePaths, Iterator payloads) { + if (super.synonymsBoostByPayload) { + switch (synonymQueryStyle) { + case PICK_BEST: { + List sidePathsQueries = getGraphSynonymQueries(sidePaths, payloads); + DisjunctionMaxQuery graphSynonymQuery = new DisjunctionMaxQuery(sidePathsQueries, 0.0f); + return graphSynonymQuery; } - BooleanQuery graphBooleanSynonymQuery = builder.build(); - if (graphBooleanSynonymQuery.clauses().size() == 1) { - return graphBooleanSynonymQuery.clauses().get(0).getQuery(); + case AS_DISTINCT_TERMS: { + List sidePathsQueries = getGraphSynonymQueries(sidePaths, payloads); + return buildBooleanQuery(sidePathsQueries); } - return graphBooleanSynonymQuery;} - } + } } return super.newGraphSynonymQuery(sidePaths, payloads); } - private List boostQueriesByPayload(Iterator sidePaths, Iterator sidePathsPayloads) { + private List getGraphSynonymQueries(Iterator sidePaths, Iterator sidePathsPayloads) { List resultSidePaths = new 
LinkedList<>(); while (sidePaths.hasNext()) { + float overallQueryPayload = 0; Query sidePath = sidePaths.next(); - if (super.synonymsBoostByPayload) { + if (sidePathsPayloads.hasNext()) { BytesRef[] sidePathPayloads = sidePathsPayloads.next(); - float overallQueryPayload = extractQueryPayload(sidePathPayloads); - if (overallQueryPayload != 0) { - resultSidePaths.add(new BoostQuery(sidePath, overallQueryPayload)); - } else { - resultSidePaths.add(sidePath); - } + overallQueryPayload = extractQueryPayload(sidePathPayloads); + } + if (overallQueryPayload != 0) { + resultSidePaths.add(new BoostQuery(sidePath, overallQueryPayload)); } else { resultSidePaths.add(sidePath); } From 60551b442249dbe858dbacefbfd9c9f106a76386 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Sat, 25 Jan 2020 13:12:44 +0000 Subject: [PATCH 15/36] [SOLR-12238] minor fix for an pre-commit --- .../lucene/util/graph/GraphTokenStreamFiniteStrings.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java index ef5caec40bdb..b2b530d93afb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java +++ b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java @@ -26,14 +26,12 @@ import java.util.List; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.PayloadAttributeImpl; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; 
import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.FiniteStringsIterator; @@ -125,7 +123,7 @@ public Term[] getTerms(String field, int state) { .map(s -> new Term(field, s.addAttribute(TermToBytesRefAttribute.class).getBytesRef())) .toArray(Term[]::new); } - + /** * Get all finite strings from the automaton. */ From e0f23b11004a6ed6a22397c8d1a25cd8f7421106 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Sat, 25 Jan 2020 13:16:42 +0000 Subject: [PATCH 16/36] [SOLR-12238] schema simplification --- .../solr/collection1/conf/schema12.xml | 28 ------------------- 1 file changed, 28 deletions(-) diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index 00b66f0944a3..2a801407db53 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -208,19 +208,6 @@ - - - - - - - - - - - - - @@ -263,19 +250,6 @@ - - - - - - - - - - - - - @@ -703,10 +677,8 @@ - - From e715fb1b46eea703e65a452309cf451ab8758915 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Mon, 27 Jan 2020 15:58:31 +0000 Subject: [PATCH 17/36] [SOLR-12238] Refactor based on PR comments --- .../apache/lucene/search/SynonymQuery.java | 15 +- .../apache/lucene/util/AttributeSource.java | 17 ++ .../org/apache/lucene/util/QueryBuilder.java | 257 ++++++++---------- .../ComplexPhraseQueryParser.java | 7 +- .../solr/parser/SolrQueryParserBase.java | 61 ++--- .../solr/search/TestSolrQueryParser.java | 16 +- 6 files changed, 183 insertions(+), 190 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index b23283300269..d1ea14e8f718 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -124,6 +124,13 @@ public 
List getTerms() { .collect(Collectors.toList()) ); } + + public List getTermsAndBoosts() { + return Collections.unmodifiableList( + Arrays.stream(terms) + .collect(Collectors.toList()) + ); + } @Override public String toString(String field) { @@ -616,7 +623,7 @@ public void setMinCompetitiveScore(float minScore) throws IOException { } } - private static class TermAndBoost { + public static class TermAndBoost { final Term term; final float boost; @@ -625,10 +632,14 @@ private static class TermAndBoost { this.boost = boost; } - Term getTerm() { + public Term getTerm() { return term; } + public float getBoost() { + return boost; + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/lucene/core/src/java/org/apache/lucene/util/AttributeSource.java b/lucene/core/src/java/org/apache/lucene/util/AttributeSource.java index e962fedc1deb..7e5f44341df8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/AttributeSource.java +++ b/lucene/core/src/java/org/apache/lucene/util/AttributeSource.java @@ -171,6 +171,23 @@ static Class[] getAttributeInterfaces(final Class + * The caller must pass in a Class<? extends AttributeImpl> value. + * + * @return instance of the passed in AttributeImpl, or {@code null} if this AttributeSource + * does not contain the AttributeImpl. It is recommended to always use + * {@link #addAttributeImpl} even in consumers of TokenStreams, because you cannot + * know if a specific TokenStream really uses a specific AttributeImpl. + * {@link #addAttributeImpl} will automatically make the attribute impl available. + * If you want to only use the attribute , if it is available (to optimize + * consuming), use {@link #hasAttribute}. + */ + public final T getAttributeImpl(Class attClass) { + return attClass.cast(attributeImpls.get(attClass)); + } + /** Expert: Adds a custom AttributeImpl instance with one or more Attribute interfaces. *

NOTE: It is not guaranteed, that att is added to * the AttributeSource, because the provided attributes may already exist. diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index f83f6c6835dc..b0511741e769 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -19,12 +19,14 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttributeImpl; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; @@ -391,18 +393,18 @@ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOExcep return null; } else if (terms.size() == 1) { SpanTermQuery singleTermQuery = terms.get(0); - if (queryPayloadBoost != 0) { - return new SpanBoostQuery(singleTermQuery, queryPayloadBoost); - } else { - return singleTermQuery; - } + return getBoostedQuery(singleTermQuery,queryPayloadBoost); } else { SpanNearQuery multiTermQuery = new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true); - if (queryPayloadBoost != 0) { - return new SpanBoostQuery(multiTermQuery, queryPayloadBoost); - } else { - return multiTermQuery; - } + return getBoostedQuery(multiTermQuery,queryPayloadBoost); + } + } + + private SpanQuery getBoostedQuery(SpanQuery query, float payloadBoost){ + if (isAcceptableBoost(payloadBoost)) { + return new SpanBoostQuery(query, payloadBoost); + } else { + return 
query; } } @@ -423,61 +425,47 @@ protected float extractQueryPayload(BytesRef[] payloadsForQueryTerms) { return 0; } - public static final float decodeFloat(byte [] bytes, int offset){ + protected static final float decodeFloat(byte [] bytes, int offset){ return Float.intBitsToFloat(decodeInt(bytes, offset)); } - public static final int decodeInt(byte [] bytes, int offset){ + protected static final int decodeInt(byte [] bytes, int offset){ return ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF); } - /** * Creates simple term query from the cached tokenstream contents */ protected Query analyzeTerm(String field, TokenStream stream) throws IOException { - TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); - stream.reset(); if (!stream.incrementToken()) { throw new AssertionError(); } - return newTermQuery(new Term(field, termAtt.getBytesRef())); + return newTermQuery(field,stream.cloneAttributes()); } /** * Creates simple boolean query from the cached tokenstream contents */ protected Query analyzeBoolean(String field, TokenStream stream) throws IOException { - TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); - PayloadAttribute payloadAtt = null; - if(synonymsBoostByPayload){ - payloadAtt = stream.getAttribute(PayloadAttribute.class); - } - stream.reset(); - List terms = new ArrayList<>(); - List payloads = new ArrayList<>(); + List attributes = new ArrayList<>(); while (stream.incrementToken()) { - terms.add(new Term(field, termAtt.getBytesRef())); - if (payloadAtt != null) { - payloads.add(payloadAtt.getPayload()); - } + attributes.add(stream.cloneAttributes()); } - - return newSynonymQuery(terms.toArray(new Term[terms.size()]),payloads.toArray(new BytesRef[payloads.size()])); + return newSynonymQuery(field, attributes.toArray(new AttributeSource[attributes.size()])); } - protected void add(BooleanQuery.Builder 
q, List current, List payloads, BooleanClause.Occur operator) { - if (current.isEmpty()) { + protected void add(BooleanQuery.Builder q, String field, List currentAttributes, BooleanClause.Occur operator) { + if (currentAttributes.isEmpty()) { return; } - if (current.size() == 1) { - q.add(newTermQuery(current.get(0)), operator); + if (currentAttributes.size() == 1) { + q.add(newTermQuery(field,currentAttributes.get(0)), operator); } else { - Query synonymQuery = newSynonymQuery(current.toArray(new Term[current.size()]), payloads.toArray(new BytesRef[payloads.size()])); + Query synonymQuery = newSynonymQuery(field, currentAttributes.toArray(new AttributeSource[currentAttributes.size()])); q.add(synonymQuery, operator); } } @@ -487,28 +475,17 @@ protected void add(BooleanQuery.Builder q, List current, List pa */ protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException { BooleanQuery.Builder q = newBooleanQuery(); - List currentQuery = new ArrayList<>(); - List currentPayload = new ArrayList<>(); - - TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + List cumulativeAttributes = new ArrayList<>(); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); - PayloadAttribute payloadAtt = null; - if(synonymsBoostByPayload){ - payloadAtt = stream.getAttribute(PayloadAttribute.class); - } stream.reset(); while (stream.incrementToken()) { if (posIncrAtt.getPositionIncrement() != 0) { - add(q, currentQuery, currentPayload, operator); - currentQuery.clear(); - currentPayload.clear(); - } - currentQuery.add(new Term(field, termAtt.getBytesRef())); - if(payloadAtt != null){ - currentPayload.add(payloadAtt.getPayload()); + add(q, field, cumulativeAttributes, operator); + cumulativeAttributes.clear(); } + cumulativeAttributes.add(stream.cloneAttributes()); } - add(q, currentQuery, currentPayload, operator); + add(q, field, cumulativeAttributes, 
operator); return q.build(); } @@ -517,24 +494,12 @@ protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanCla * Creates simple phrase query from the cached tokenstream contents */ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException { - PhraseQuery.Builder builder = new PhraseQuery.Builder(); - builder.setSlop(slop); - - TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); - PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); - int position = -1; - + List clonedAttributes = new LinkedList<>(); stream.reset(); while (stream.incrementToken()) { - if (enablePositionIncrements) { - position += posIncrAtt.getPositionIncrement(); - } else { - position += 1; - } - builder.add(new Term(field, termAtt.getBytesRef()), position); + clonedAttributes.add(stream.cloneAttributes()); } - - return builder.build(); + return newPhraseQuery(field,clonedAttributes.toArray(new AttributeSource[clonedAttributes.size()]),slop); } /** @@ -591,75 +556,38 @@ protected Query analyzeGraphBoolean(String field, TokenStream source, BooleanCla end = articulationPoints[i]; } lastState = end; - final Query queryClause; - final Iterator sidePathsForPayloads = graph.getFiniteStrings(start, end); - Iterator sidePathsPayloads = new Iterator() { - @Override - public boolean hasNext() { - return sidePathsForPayloads.hasNext(); - } - - @Override - public BytesRef[] next() { - TokenStream sidePath = sidePathsForPayloads.next(); - return getPayloadsFromStream(sidePath); - } - }; + final Query positionalQuery; if (graph.hasSidePath(start)) { - final Iterator sidePaths = graph.getFiniteStrings(start, end); - Iterator sidePathsQueries = new Iterator() { + final Iterator sidePathsIterator = graph.getFiniteStrings(start, end); + Iterator queries = new Iterator() { @Override public boolean hasNext() { - return sidePaths.hasNext(); + return sidePathsIterator.hasNext(); } 
@Override public Query next() { - TokenStream sidePath = sidePaths.next(); + TokenStream sidePath = sidePathsIterator.next(); return createFieldQuery(sidePath, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0); } }; - queryClause = newGraphSynonymQuery(sidePathsQueries, sidePathsPayloads); + positionalQuery = newGraphSynonymQuery(queries); } else { - Term[] terms = graph.getTerms(field, start); - assert terms.length > 0; - if (terms.length == 1) { - queryClause = newTermQuery(terms[0]); + List attributes = graph.getTerms( start); + assert attributes.size()> 0; + if (attributes.size() == 1) { + positionalQuery = newTermQuery(field,attributes.get(0)); } else { - BytesRef[] payloads = new BytesRef[terms.length]; - if (synonymsBoostByPayload) { - int j=0; - while(sidePathsForPayloads.hasNext()) { - payloads[j] = sidePathsPayloads.next()[0]; - } - } - queryClause = newSynonymQuery(terms, payloads); + positionalQuery = newSynonymQuery(field, attributes.toArray(new AttributeSource[attributes.size()])); } } - if (queryClause != null) { - builder.add(queryClause, operator); + if (positionalQuery != null) { + builder.add(positionalQuery, operator); } } return builder.build(); } - protected BytesRef[] getPayloadsFromStream(TokenStream source) { - try (CachingTokenFilter stream = new CachingTokenFilter(source)) { - PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class); - stream.reset(); - List payloads = new ArrayList<>(); - while (stream.incrementToken()) { - if (payloadAtt != null) { - payloads.add(payloadAtt.getPayload()); - } - } - stream.end(); - stream.close(); - return payloads.toArray(new BytesRef[payloads.size()]); - } catch (IOException e) { - throw new RuntimeException("Error analyzing query text", e); - } - } /** * Creates graph phrase query from the tokenstream contents */ @@ -769,25 +697,38 @@ protected BooleanQuery.Builder newBooleanQuery() { * This is intended for subclasses that wish to customize the 
generated queries. * @return new Query instance */ - protected Query newSynonymQuery(Term[] terms, BytesRef[] termPayloads) { - SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].field()); - for (int i = 0; i < terms.length; i++) { - float payloadBoost = 0f; - if (termPayloads.length == terms.length) { - if (termPayloads[i] != null) { - payloadBoost = decodeFloat(termPayloads[i].bytes, termPayloads[i].offset); - } - } - if (payloadBoost != 0) { - builder.addTerm(terms[i], payloadBoost); + protected Query newSynonymQuery(String field, AttributeSource[] attributes) { + SynonymQuery.Builder builder = new SynonymQuery.Builder(field); + for (int i = 0; i < attributes.length; i++) { + TermToBytesRefAttribute termAttribute = attributes[i].getAttribute(TermToBytesRefAttribute.class); + Term term = new Term(field, termAttribute.getBytesRef()); + + float payloadBoost = getDecodedPayload(attributes[i]); + if (isAcceptableBoost(payloadBoost)) { + builder.addTerm(term, payloadBoost); } else { - builder.addTerm(terms[i]); + builder.addTerm(term); } } return builder.build(); } - - + + private float getDecodedPayload(AttributeSource attribute) { + float payloadBoost = 0f; + PayloadAttributeImpl payloadAttribute = attribute.getAttributeImpl(PayloadAttributeImpl.class); + if(payloadAttribute!=null && synonymsBoostByPayload ) { + BytesRef payloadToDecode = payloadAttribute.getPayload(); + if(payloadToDecode!=null){ + payloadBoost = decodeFloat(payloadToDecode.bytes, payloadToDecode.offset); + } + } + return payloadBoost; + } + + protected boolean isAcceptableBoost(float payloadBoost) { + return payloadBoost >0 && payloadBoost !=1; + } + /** * Builds a new GraphQuery for multi-terms synonyms. @@ -795,18 +736,10 @@ protected Query newSynonymQuery(Term[] terms, BytesRef[] termPayloads) { * This is intended for subclasses that wish to customize the generated queries. 
* @return new Query instance */ - protected Query newGraphSynonymQuery(Iterator queries, Iterator termPayload) { + protected Query newGraphSynonymQuery(Iterator queries) { BooleanQuery.Builder builder = new BooleanQuery.Builder(); while (queries.hasNext()) { - Query next = queries.next(); - if (termPayload.hasNext()) { - BytesRef[] queryPayloads = termPayload.next(); - float payloadBoost = this.extractQueryPayload(queryPayloads); - if (payloadBoost != 0) { - next = new BoostQuery(next, payloadBoost); - } - } - builder.add(next, BooleanClause.Occur.SHOULD); + builder.add(queries.next(), BooleanClause.Occur.SHOULD); } BooleanQuery bq = builder.build(); if (bq.clauses().size() == 1) { @@ -819,11 +752,49 @@ protected Query newGraphSynonymQuery(Iterator queries, Iterator * This is intended for subclasses that wish to customize the generated queries. - * @param term term * @return new TermQuery instance */ - protected Query newTermQuery(Term term) { - return new TermQuery(term); + protected Query newTermQuery(String field, AttributeSource attribute) { + TermToBytesRefAttribute termAttribute = attribute.getAttribute(TermToBytesRefAttribute.class); + Term term = new Term(field, termAttribute.getBytesRef()); + Query termQuery = new TermQuery(term); + + float payloadBoost = getDecodedPayload(attribute); + if (isAcceptableBoost(payloadBoost)) { + termQuery = new BoostQuery(termQuery, payloadBoost); + } + + return termQuery; + } + + /** + * Builds a new PhraseQuery instance. + *

+ * This is intended for subclasses that wish to customize the generated queries. + * @return new PhraseQuery instance + */ + protected Query newPhraseQuery(String field, AttributeSource[] attributes, int slop) { + PhraseQuery.Builder builder = new PhraseQuery.Builder(); + builder.setSlop(slop); + int position =-1; + float payloadBoost =-1; + for (int i = 0; i < attributes.length; i++) { + TermToBytesRefAttribute termAttribute = attributes[i].getAttribute(TermToBytesRefAttribute.class); + PositionIncrementAttribute posIncrAtt = attributes[i].getAttribute(PositionIncrementAttribute.class); + if (enablePositionIncrements) { + position += posIncrAtt.getPositionIncrement(); + } else { + position += 1; + } + payloadBoost = getDecodedPayload(attributes[i]); + builder.add(new Term(field, termAttribute.getBytesRef()), position); + } + + Query query = builder.build(); + if (isAcceptableBoost(payloadBoost)) { + query = new BoostQuery(query, payloadBoost); + } + return query; } /** diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java index 9a4043d1d8a3..0cb86f621b45 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java @@ -45,6 +45,7 @@ import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.util.AttributeSource; /** * QueryParser which permits complex phrase query syntax eg "(john jon @@ -147,15 +148,15 @@ public Query parse(String query) throws ParseException { // to throw a runtime exception here if a term for another field is embedded // in phrase query @Override - protected Query newTermQuery(Term term) { + protected 
Query newTermQuery(String field, AttributeSource attribute) { if (isPass2ResolvingPhrases) { try { - checkPhraseClauseIsForSameField(term.field()); + checkPhraseClauseIsForSameField(field); } catch (ParseException pe) { throw new RuntimeException("Error parsing complex phrase", pe); } } - return super.newTermQuery(term); + return super.newTermQuery(field,attribute); } // Helper method used to report on any clauses that appear in query syntax diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 84f2c0690948..3c83410eaba8 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -46,8 +46,10 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.QueryBuilder; import org.apache.lucene.util.automaton.Automata; @@ -615,34 +617,32 @@ private Query buildBooleanQuery(List sidePathsQueries) { } @Override - protected Query newSynonymQuery(Term[] terms, BytesRef[] payloads) { + protected Query newSynonymQuery(String field, AttributeSource[] attributes) { switch (synonymQueryStyle) { case PICK_BEST: { - List synonymQueries = getSynonymQueries(terms, payloads); + List synonymQueries = getSynonymQueries(field, attributes); return new DisjunctionMaxQuery(synonymQueries, 0.0f); } case AS_DISTINCT_TERMS: { - List synonymQueries = getSynonymQueries(terms, payloads); + List synonymQueries = getSynonymQueries(field, attributes); return buildBooleanQuery(synonymQueries); } case AS_SAME_TERM: - return super.newSynonymQuery(terms, payloads); + return 
super.newSynonymQuery(field, attributes); default: throw new AssertionError("unrecognized synonymQueryStyle passed when creating newSynonymQuery"); } } - private List getSynonymQueries(Term[] terms, BytesRef[] payloads) { - List synonymQueries = new ArrayList<>(terms.length); - for (int i = 0; i < terms.length; i++) { - float payloadBoost = 0f; - if (payloads.length == terms.length) { - if (payloads[i] != null) { - payloadBoost = decodeFloat(payloads[i].bytes, payloads[i].offset); - } - } - Query synonymQuery = new TermQuery(terms[i]); - if (payloadBoost != 0) { + private List getSynonymQueries(String field, AttributeSource[] attributes) { + List synonymQueries = new ArrayList<>(attributes.length); + SynonymQuery q = (SynonymQuery)super.newSynonymQuery(field, attributes); + List terms = q.getTermsAndBoosts(); + for (int i = 0; i < terms.size(); i++) { + SynonymQuery.TermAndBoost currentTerm = terms.get(i); + Query synonymQuery = new TermQuery(currentTerm.getTerm()); + float payloadBoost = currentTerm.getBoost(); + if (super.isAcceptableBoost(payloadBoost)) { synonymQuery = new BoostQuery(synonymQuery, payloadBoost); } synonymQueries.add(synonymQuery); @@ -658,37 +658,30 @@ private List getSynonymQueries(Term[] terms, BytesRef[] payloads) { * @return new Query instance */ @Override - protected Query newGraphSynonymQuery(Iterator sidePaths, Iterator payloads) { + protected Query newGraphSynonymQuery(Iterator sidePathQueries) { if (super.synonymsBoostByPayload) { switch (synonymQueryStyle) { case PICK_BEST: { - List sidePathsQueries = getGraphSynonymQueries(sidePaths, payloads); - DisjunctionMaxQuery graphSynonymQuery = new DisjunctionMaxQuery(sidePathsQueries, 0.0f); + DisjunctionMaxQuery graphSynonymQuery = new DisjunctionMaxQuery(getGraphSynonymQueries(sidePathQueries), 0.0f); return graphSynonymQuery; } case AS_DISTINCT_TERMS: { - List sidePathsQueries = getGraphSynonymQueries(sidePaths, payloads); - return buildBooleanQuery(sidePathsQueries); + return 
buildBooleanQuery(getGraphSynonymQueries(sidePathQueries)); } + case AS_SAME_TERM: + return super.newGraphSynonymQuery(sidePathQueries); + default: + throw new AssertionError("unrecognized synonymQueryStyle passed when creating newSynonymQuery"); } + }else{ + return super.newGraphSynonymQuery(sidePathQueries); } - return super.newGraphSynonymQuery(sidePaths, payloads); } - - private List getGraphSynonymQueries(Iterator sidePaths, Iterator sidePathsPayloads) { + + private List getGraphSynonymQueries(Iterator sidePaths) { List resultSidePaths = new LinkedList<>(); while (sidePaths.hasNext()) { - float overallQueryPayload = 0; - Query sidePath = sidePaths.next(); - if (sidePathsPayloads.hasNext()) { - BytesRef[] sidePathPayloads = sidePathsPayloads.next(); - overallQueryPayload = extractQueryPayload(sidePathPayloads); - } - if (overallQueryPayload != 0) { - resultSidePaths.add(new BoostQuery(sidePath, overallQueryPayload)); - } else { - resultSidePaths.add(sidePath); - } + resultSidePaths.add(sidePaths.next()); } return resultSidePaths; } diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java index 2f1f138b24ea..ab8700d9275e 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java @@ -1211,10 +1211,10 @@ public void testShingleQueries() throws Exception { public void testSynonymQueryStyle() throws Exception { Query q = QParser.getParser("tabby", req(params("df", "t_pick_best_foo"))).getQuery(); - assertEquals("(t_pick_best_foo:tabbi | t_pick_best_foo:cat | t_pick_best_foo:felin | t_pick_best_foo:anim)", q.toString()); + assertEquals("(t_pick_best_foo:anim | t_pick_best_foo:cat | t_pick_best_foo:felin | t_pick_best_foo:tabbi)", q.toString()); q = QParser.getParser("tabby", req(params("df", "t_as_distinct_foo"))).getQuery(); - assertEquals("t_as_distinct_foo:tabbi 
t_as_distinct_foo:cat t_as_distinct_foo:felin t_as_distinct_foo:anim", q.toString()); + assertEquals("t_as_distinct_foo:anim t_as_distinct_foo:cat t_as_distinct_foo:felin t_as_distinct_foo:tabbi", q.toString()); /*confirm autoGeneratePhraseQueries always builds OR queries*/ q = QParser.getParser("jeans", req(params("df", "t_as_distinct_foo", "sow", "false"))).getQuery(); @@ -1228,10 +1228,10 @@ public void testSynonymQueryStyle() throws Exception { public void testSynonymsBoostByPayload_singleTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception { //tiger, tigre|0.9 Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:tigre)^0.9 | t_pick_best_boost_by_payload_foo:tiger)", q.toString()); + assertEquals("(t_pick_best_boost_by_payload_foo:tiger | (t_pick_best_boost_by_payload_foo:tigre)^0.9)", q.toString()); q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("(t_as_distinct_boost_by_payload_foo:tigre)^0.9 t_as_distinct_boost_by_payload_foo:tiger", q.toString()); + assertEquals("t_as_distinct_boost_by_payload_foo:tiger (t_as_distinct_boost_by_payload_foo:tigre)^0.9", q.toString()); //lynx => lince|0.8, lynx_canadensis|0.9 q = QParser.getParser("lynx", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); @@ -1261,12 +1261,12 @@ public void testSynonymsBoostByPayload_multiTermQuerySingleTermSynonyms_shouldPa //tiger, tigre|0.9 //lynx => lince|0.8, lynx_canadensis|0.9 Query q = QParser.getParser("tiger lynx", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:tigre)^0.9 | t_pick_best_boost_by_payload_foo:tiger)" + - " ((t_pick_best_boost_by_payload_foo:lince)^0.8 | (t_pick_best_boost_by_payload_foo:lynx_canadensis)^0.9)", q.toString()); + assertEquals("(t_pick_best_boost_by_payload_foo:tiger | 
(t_pick_best_boost_by_payload_foo:tigre)^0.9)" + + " ((t_pick_best_boost_by_payload_foo:lince)^0.8 | (t_pick_best_boost_by_payload_foo:lynx_canadensis)^0.9)", q.toString()); q = QParser.getParser("tiger lynx", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_as_distinct_boost_by_payload_foo:tigre)^0.9 t_as_distinct_boost_by_payload_foo:tiger)" + - " ((t_as_distinct_boost_by_payload_foo:lince)^0.8 (t_as_distinct_boost_by_payload_foo:lynx_canadensis)^0.9)", q.toString()); + assertEquals("(t_as_distinct_boost_by_payload_foo:tiger (t_as_distinct_boost_by_payload_foo:tigre)^0.9)" + + " ((t_as_distinct_boost_by_payload_foo:lince)^0.8 (t_as_distinct_boost_by_payload_foo:lynx_canadensis)^0.9)", q.toString()); } public void testSynonymsBoostByPayload_multiTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { From 6d2d291bb6517be63fbc9a61bbe7e3b142963201 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Mon, 27 Jan 2020 18:37:23 +0000 Subject: [PATCH 18/36] [SOLR-12238] Extract the functionality to Solr classes keeping Lucene ones as simple as possible --- .../apache/lucene/search/SynonymQuery.java | 15 +- .../org/apache/lucene/util/QueryBuilder.java | 156 ++++-------------- .../solr/parser/SolrQueryParserBase.java | 122 ++++++++++---- .../solr/collection1/conf/schema12.xml | 13 ++ .../solr/search/TestSolrQueryParser.java | 59 +++++-- 5 files changed, 184 insertions(+), 181 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index d1ea14e8f718..b23283300269 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -124,13 +124,6 @@ public List getTerms() { .collect(Collectors.toList()) ); } - - public List getTermsAndBoosts() { - return Collections.unmodifiableList( - Arrays.stream(terms) - 
.collect(Collectors.toList()) - ); - } @Override public String toString(String field) { @@ -623,7 +616,7 @@ public void setMinCompetitiveScore(float minScore) throws IOException { } } - public static class TermAndBoost { + private static class TermAndBoost { final Term term; final float boost; @@ -632,14 +625,10 @@ public static class TermAndBoost { this.boost = boost; } - public Term getTerm() { + Term getTerm() { return term; } - public float getBoost() { - return boost; - } - @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index b0511741e769..b66ed0323d46 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -25,22 +25,18 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttributeImpl; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.spans.SpanBoostQuery; import org.apache.lucene.search.spans.SpanNearQuery; import 
org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; @@ -67,7 +63,6 @@ public class QueryBuilder { protected boolean enablePositionIncrements = true; protected boolean enableGraphQueries = true; protected boolean autoGenerateMultiTermSynonymsPhraseQuery = false; - protected boolean synonymsBoostByPayload = false; /** Creates a new QueryBuilder using the given analyzer. */ public QueryBuilder(Analyzer analyzer) { @@ -222,14 +217,6 @@ public void setAutoGenerateMultiTermSynonymsPhraseQuery(boolean enable) { this.autoGenerateMultiTermSynonymsPhraseQuery = enable; } - /** - * Set to true if synonyms should be automatically boosted by their payload. - * Default: false. - */ - public void setSynonymsBoostByPayload(boolean enable) { - this.synonymsBoostByPayload = enable; - } - /** * Creates a query from the analysis chain. *

@@ -363,77 +350,13 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato * returned. When multiple tokens, an ordered SpanNearQuery with slop 0 is returned. */ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOException { - PayloadAttribute payloadAttribute = null; - if(synonymsBoostByPayload){ - payloadAttribute = in.getAttribute(PayloadAttribute.class); - } - TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class); - if (termAtt == null) { - return null; - } - - List terms = new ArrayList<>(); - List payloads = new ArrayList<>(); + List clonedAttributes = new ArrayList<>(); while (in.incrementToken()) { - terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef()))); - if(payloadAttribute!=null){ - payloads.add(payloadAttribute.getPayload()); - } - } - in.end(); - in.close(); - - BytesRef[] queryPayloadsArray = payloads.toArray(new BytesRef[payloads.size()]); - float queryPayloadBoost = 0; - if (!payloads.isEmpty()) { - queryPayloadBoost = extractQueryPayload(queryPayloadsArray); + clonedAttributes.add(in.cloneAttributes()); } - - if (terms.isEmpty()) { - return null; - } else if (terms.size() == 1) { - SpanTermQuery singleTermQuery = terms.get(0); - return getBoostedQuery(singleTermQuery,queryPayloadBoost); - } else { - SpanNearQuery multiTermQuery = new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true); - return getBoostedQuery(multiTermQuery,queryPayloadBoost); - } - } - - private SpanQuery getBoostedQuery(SpanQuery query, float payloadBoost){ - if (isAcceptableBoost(payloadBoost)) { - return new SpanBoostQuery(query, payloadBoost); - } else { - return query; - } - } - - /*Current assumption is that the user will associate a single payload to the multi terms synonym - * that generated the phrase query, so a valid value for the payload associated to the query is just the first not null payload - * e.g. 
- * lion => panthera leo|0.99 - * "panthera leo" query will have associated Payloads [null,0.99] - * So the payload associated to the query will be 0.99 which is the first not null - * */ - protected float extractQueryPayload(BytesRef[] payloadsForQueryTerms) { - for (BytesRef singlePayload : payloadsForQueryTerms) { - if (singlePayload != null) { - float decodedPayload = decodeFloat(singlePayload.bytes, singlePayload.offset); - return decodedPayload; - } - } - return 0; - } - - protected static final float decodeFloat(byte [] bytes, int offset){ - - return Float.intBitsToFloat(decodeInt(bytes, offset)); - } - - protected static final int decodeInt(byte [] bytes, int offset){ - return ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) - | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF); + return newSpanQuery(field,clonedAttributes.toArray(new AttributeSource[clonedAttributes.size()])); } + /** * Creates simple term query from the cached tokenstream contents */ @@ -451,11 +374,11 @@ protected Query analyzeTerm(String field, TokenStream stream) throws IOException */ protected Query analyzeBoolean(String field, TokenStream stream) throws IOException { stream.reset(); - List attributes = new ArrayList<>(); + List clonedAttributes = new ArrayList<>(); while (stream.incrementToken()) { - attributes.add(stream.cloneAttributes()); + clonedAttributes.add(stream.cloneAttributes()); } - return newSynonymQuery(field, attributes.toArray(new AttributeSource[attributes.size()])); + return newSynonymQuery(field, clonedAttributes.toArray(new AttributeSource[clonedAttributes.size()])); } protected void add(BooleanQuery.Builder q, String field, List currentAttributes, BooleanClause.Occur operator) { @@ -573,7 +496,7 @@ public Query next() { }; positionalQuery = newGraphSynonymQuery(queries); } else { - List attributes = graph.getTerms( start); + List attributes = graph.getTerms(start); assert attributes.size()> 0; if (attributes.size() == 1) { 
positionalQuery = newTermQuery(field,attributes.get(0)); @@ -701,35 +624,11 @@ protected Query newSynonymQuery(String field, AttributeSource[] attributes) { SynonymQuery.Builder builder = new SynonymQuery.Builder(field); for (int i = 0; i < attributes.length; i++) { TermToBytesRefAttribute termAttribute = attributes[i].getAttribute(TermToBytesRefAttribute.class); - Term term = new Term(field, termAttribute.getBytesRef()); - - float payloadBoost = getDecodedPayload(attributes[i]); - if (isAcceptableBoost(payloadBoost)) { - builder.addTerm(term, payloadBoost); - } else { - builder.addTerm(term); - } + builder.addTerm(new Term(field, termAttribute.getBytesRef())); } return builder.build(); } - private float getDecodedPayload(AttributeSource attribute) { - float payloadBoost = 0f; - PayloadAttributeImpl payloadAttribute = attribute.getAttributeImpl(PayloadAttributeImpl.class); - if(payloadAttribute!=null && synonymsBoostByPayload ) { - BytesRef payloadToDecode = payloadAttribute.getPayload(); - if(payloadToDecode!=null){ - payloadBoost = decodeFloat(payloadToDecode.bytes, payloadToDecode.offset); - } - } - return payloadBoost; - } - - protected boolean isAcceptableBoost(float payloadBoost) { - return payloadBoost >0 && payloadBoost !=1; - } - - /** * Builds a new GraphQuery for multi-terms synonyms. *

@@ -756,15 +655,7 @@ protected Query newGraphSynonymQuery(Iterator queries) { */ protected Query newTermQuery(String field, AttributeSource attribute) { TermToBytesRefAttribute termAttribute = attribute.getAttribute(TermToBytesRefAttribute.class); - Term term = new Term(field, termAttribute.getBytesRef()); - Query termQuery = new TermQuery(term); - - float payloadBoost = getDecodedPayload(attribute); - if (isAcceptableBoost(payloadBoost)) { - termQuery = new BoostQuery(termQuery, payloadBoost); - } - - return termQuery; + return new TermQuery(new Term(field, termAttribute.getBytesRef())); } /** @@ -777,7 +668,6 @@ protected Query newPhraseQuery(String field, AttributeSource[] attributes, int s PhraseQuery.Builder builder = new PhraseQuery.Builder(); builder.setSlop(slop); int position =-1; - float payloadBoost =-1; for (int i = 0; i < attributes.length; i++) { TermToBytesRefAttribute termAttribute = attributes[i].getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAtt = attributes[i].getAttribute(PositionIncrementAttribute.class); @@ -786,15 +676,31 @@ protected Query newPhraseQuery(String field, AttributeSource[] attributes, int s } else { position += 1; } - payloadBoost = getDecodedPayload(attributes[i]); builder.add(new Term(field, termAttribute.getBytesRef()), position); } + return builder.build(); + } - Query query = builder.build(); - if (isAcceptableBoost(payloadBoost)) { - query = new BoostQuery(query, payloadBoost); + /** + * Builds a new PhraseQuery instance. + *

+ * This is intended for subclasses that wish to customize the generated queries. + * @return new PhraseQuery instance + */ + protected SpanQuery newSpanQuery(String field, AttributeSource[] attributes) { + List spanQueries = new ArrayList<>(attributes.length); + for (int i = 0; i < attributes.length; i++) { + TermToBytesRefAttribute termAttribute = attributes[i].getAttribute(TermToBytesRefAttribute.class); + SpanTermQuery q = new SpanTermQuery(new Term(field, termAttribute.getBytesRef())); + spanQueries.add(q); + } + if (spanQueries.isEmpty()) { + return null; + } else if (spanQueries.size() == 1) { + return spanQueries.get(0); + } else { + return new SpanNearQuery(spanQueries.toArray(new SpanTermQuery[0]), 0, true); } - return query; } /** diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 3c83410eaba8..b9ec1c07bd44 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -28,7 +28,10 @@ import java.util.stream.Collectors; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.payloads.PayloadHelper; import org.apache.lucene.analysis.reverse.ReverseStringFilter; +import org.apache.lucene.analysis.tokenattributes.PayloadAttributeImpl; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.index.Term; import org.apache.lucene.search.AutomatonQuery; @@ -49,6 +52,8 @@ import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.spans.SpanBoostQuery; +import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.QueryBuilder; @@ -89,6 
+94,7 @@ public abstract class SolrQueryParserBase extends QueryBuilder { static final int MOD_REQ = 11; protected SynonymQueryStyle synonymQueryStyle = AS_SAME_TERM; + protected boolean synonymsBoostByPayload = false; /** * Query strategy when analyzed query terms overlap the same position (ie synonyms) @@ -391,7 +397,10 @@ public void setAllowSubQueryParsing(boolean allowSubQueryParsing) { * Gets how overlapping query terms should be scored */ public SynonymQueryStyle getSynonymQueryStyle() {return this.synonymQueryStyle;} - + + public void setSynonymsBoostByPayload(boolean synonymsBoostByPayload) { + this.synonymsBoostByPayload = synonymsBoostByPayload; + } /** * Set to true to allow leading wildcard characters. @@ -615,7 +624,25 @@ private Query buildBooleanQuery(List sidePathsQueries) { } return builder.build(); } - + + @Override + protected Query newTermQuery(String field, AttributeSource attribute) { + Query termQuery = super.newTermQuery(field,attribute); + return getBoostedQueryByPayload(new AttributeSource[]{attribute}, termQuery); + } + + @Override + protected Query newPhraseQuery(String field, AttributeSource[] attributes, int slop) { + Query phraseQuery = super.newPhraseQuery(field,attributes,slop); + return getBoostedQueryByPayload(attributes, phraseQuery); + } + + @Override + protected SpanQuery newSpanQuery(String field, AttributeSource[] attributes) { + SpanQuery spanQuery = super.newSpanQuery(field,attributes); + return getBoostedQueryByPayload(attributes, spanQuery); + } + @Override protected Query newSynonymQuery(String field, AttributeSource[] attributes) { switch (synonymQueryStyle) { @@ -627,8 +654,19 @@ protected Query newSynonymQuery(String field, AttributeSource[] attributes) { List synonymQueries = getSynonymQueries(field, attributes); return buildBooleanQuery(synonymQueries); } - case AS_SAME_TERM: - return super.newSynonymQuery(field, attributes); + case AS_SAME_TERM:{ + SynonymQuery.Builder builder = new SynonymQuery.Builder(field); + 
for (int i = 0; i < attributes.length; i++) { + TermToBytesRefAttribute termAttribute = attributes[i].getAttribute(TermToBytesRefAttribute.class); + float payloadBoost = getDecodedPayload(attributes[i]); + if (isAcceptableBoost(payloadBoost)) { + builder.addTerm(new Term(field, termAttribute.getBytesRef()), payloadBoost); + } else { + builder.addTerm(new Term(field, termAttribute.getBytesRef())); + } + } + return builder.build(); + } default: throw new AssertionError("unrecognized synonymQueryStyle passed when creating newSynonymQuery"); } @@ -636,19 +674,51 @@ protected Query newSynonymQuery(String field, AttributeSource[] attributes) { private List getSynonymQueries(String field, AttributeSource[] attributes) { List synonymQueries = new ArrayList<>(attributes.length); - SynonymQuery q = (SynonymQuery)super.newSynonymQuery(field, attributes); - List terms = q.getTermsAndBoosts(); - for (int i = 0; i < terms.size(); i++) { - SynonymQuery.TermAndBoost currentTerm = terms.get(i); - Query synonymQuery = new TermQuery(currentTerm.getTerm()); - float payloadBoost = currentTerm.getBoost(); - if (super.isAcceptableBoost(payloadBoost)) { - synonymQuery = new BoostQuery(synonymQuery, payloadBoost); - } - synonymQueries.add(synonymQuery); + for (int i = 0; i < attributes.length; i++) { + TermToBytesRefAttribute termAttribute = attributes[i].getAttribute(TermToBytesRefAttribute.class); + Query synonymQuery = new TermQuery(new Term(field, termAttribute.getBytesRef())); + synonymQueries.add(getBoostedQueryByPayload(new AttributeSource[]{attributes[i]}, synonymQuery)); } return synonymQueries; } + + private Query getBoostedQueryByPayload(AttributeSource[] attributes, Query query) { + float payloadBoost = 0f; + for (int i = 0; i < attributes.length; i++) { + payloadBoost = getDecodedPayload(attributes[i]); + } + if (isAcceptableBoost(payloadBoost)) { + return new BoostQuery(query, payloadBoost); + } + return query; + } + + private SpanQuery 
getBoostedQueryByPayload(AttributeSource[] attributes, SpanQuery query) { + float payloadBoost = 0f; + for (int i = 0; i < attributes.length; i++) { + payloadBoost = getDecodedPayload(attributes[i]); + } + if (isAcceptableBoost(payloadBoost)) { + return new SpanBoostQuery(query, payloadBoost); + } + return query; + } + + private float getDecodedPayload(AttributeSource attribute) { + float payloadBoost = 0f; + PayloadAttributeImpl payloadAttribute = attribute.getAttributeImpl(PayloadAttributeImpl.class); + if (payloadAttribute != null && synonymsBoostByPayload) { + BytesRef payloadToDecode = payloadAttribute.getPayload(); + if (payloadToDecode != null) { + payloadBoost = PayloadHelper.decodeFloat(payloadToDecode.bytes, payloadToDecode.offset); + } + } + return payloadBoost; + } + + protected boolean isAcceptableBoost(float payloadBoost) { + return payloadBoost >0f && payloadBoost !=1f; + } /** * Builds a new GraphQuery for multi-terms synonyms. @@ -658,32 +728,20 @@ private List getSynonymQueries(String field, AttributeSource[] attributes * @return new Query instance */ @Override - protected Query newGraphSynonymQuery(Iterator sidePathQueries) { - if (super.synonymsBoostByPayload) { + protected Query newGraphSynonymQuery(Iterator sidePathQueriesIterator) { + List sidePathSynonymQueries = new LinkedList<>(); + sidePathQueriesIterator.forEachRemaining(sidePathSynonymQueries::add); switch (synonymQueryStyle) { case PICK_BEST: { - DisjunctionMaxQuery graphSynonymQuery = new DisjunctionMaxQuery(getGraphSynonymQueries(sidePathQueries), 0.0f); - return graphSynonymQuery; + return new DisjunctionMaxQuery(sidePathSynonymQueries, 0.0f); } + case AS_SAME_TERM: case AS_DISTINCT_TERMS: { - return buildBooleanQuery(getGraphSynonymQueries(sidePathQueries)); + return buildBooleanQuery(sidePathSynonymQueries); } - case AS_SAME_TERM: - return super.newGraphSynonymQuery(sidePathQueries); default: throw new AssertionError("unrecognized synonymQueryStyle passed when creating 
newSynonymQuery"); } - }else{ - return super.newGraphSynonymQuery(sidePathQueries); - } - } - - private List getGraphSynonymQueries(Iterator sidePaths) { - List resultSidePaths = new LinkedList<>(); - while (sidePaths.hasNext()) { - resultSidePaths.add(sidePaths.next()); - } - return resultSidePaths; } /** diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index 2a801407db53..2527c9455587 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -250,6 +250,18 @@ + + + + + + + + + + + + @@ -680,6 +692,7 @@ + diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java index ab8700d9275e..f639b33acd02 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java @@ -1209,29 +1209,30 @@ public void testShingleQueries() throws Exception { public void testSynonymQueryStyle() throws Exception { - Query q = QParser.getParser("tabby", req(params("df", "t_pick_best_foo"))).getQuery(); - assertEquals("(t_pick_best_foo:anim | t_pick_best_foo:cat | t_pick_best_foo:felin | t_pick_best_foo:tabbi)", q.toString()); + assertEquals("(t_pick_best_foo:tabbi | t_pick_best_foo:cat | t_pick_best_foo:felin | t_pick_best_foo:anim)", q.toString()); q = QParser.getParser("tabby", req(params("df", "t_as_distinct_foo"))).getQuery(); - assertEquals("t_as_distinct_foo:anim t_as_distinct_foo:cat t_as_distinct_foo:felin t_as_distinct_foo:tabbi", q.toString()); + assertEquals("t_as_distinct_foo:tabbi t_as_distinct_foo:cat t_as_distinct_foo:felin t_as_distinct_foo:anim", q.toString()); /*confirm autoGeneratePhraseQueries always builds OR queries*/ q = QParser.getParser("jeans", req(params("df", "t_as_distinct_foo", "sow", "false"))).getQuery(); 
assertEquals("(t_as_distinct_foo:\"denim pant\" t_as_distinct_foo:jean)", q.toString()); q = QParser.getParser("jeans", req(params("df", "t_pick_best_foo", "sow", "false"))).getQuery(); - assertEquals("(t_pick_best_foo:\"denim pant\" t_pick_best_foo:jean)", q.toString()); - + assertEquals("(t_pick_best_foo:\"denim pant\" | t_pick_best_foo:jean)", q.toString()); } public void testSynonymsBoostByPayload_singleTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception { //tiger, tigre|0.9 Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("(t_pick_best_boost_by_payload_foo:tiger | (t_pick_best_boost_by_payload_foo:tigre)^0.9)", q.toString()); + assertEquals("((t_pick_best_boost_by_payload_foo:tigre)^0.9 | t_pick_best_boost_by_payload_foo:tiger)", q.toString()); q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("t_as_distinct_boost_by_payload_foo:tiger (t_as_distinct_boost_by_payload_foo:tigre)^0.9", q.toString()); + assertEquals("(t_as_distinct_boost_by_payload_foo:tigre)^0.9 t_as_distinct_boost_by_payload_foo:tiger", q.toString()); + + q = QParser.getParser("tiger", req(params("df", "t_as_same_term_boost_by_payload_foo"))).getQuery(); + assertEquals("Synonym(t_as_same_term_boost_by_payload_foo:tiger t_as_same_term_boost_by_payload_foo:tigre^0.9)", q.toString()); //lynx => lince|0.8, lynx_canadensis|0.9 q = QParser.getParser("lynx", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); @@ -1239,6 +1240,9 @@ public void testSynonymsBoostByPayload_singleTermQuerySingleTermSynonyms_shouldP q = QParser.getParser("lynx", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); assertEquals("(t_as_distinct_boost_by_payload_foo:lince)^0.8 (t_as_distinct_boost_by_payload_foo:lynx_canadensis)^0.9", q.toString()); + + q = QParser.getParser("lynx", req(params("df", 
"t_as_same_term_boost_by_payload_foo"))).getQuery(); + assertEquals("Synonym(t_as_same_term_boost_by_payload_foo:lince^0.8 t_as_same_term_boost_by_payload_foo:lynx_canadensis^0.9)", q.toString()); } public void testSynonymsBoostByPayload_singleTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { @@ -1249,24 +1253,34 @@ public void testSynonymsBoostByPayload_singleTermQueryMultiTermSynonyms_shouldPa q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); assertEquals("((t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:bagheera)^0.9 (t_as_distinct_boost_by_payload_foo:\"panthera pardus\")^0.85 t_as_distinct_boost_by_payload_foo:leopard)", q.toString()); + q = QParser.getParser("leopard", req(params("df", "t_as_same_term_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_as_same_term_boost_by_payload_foo:\"big cat\")^0.8 (t_as_same_term_boost_by_payload_foo:bagheera)^0.9 (t_as_same_term_boost_by_payload_foo:\"panthera pardus\")^0.85 t_as_same_term_boost_by_payload_foo:leopard)", q.toString()); + //lion => panthera leo|0.9, simba leo|0.8, kimba|0.75 q = QParser.getParser("lion", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); assertEquals("((t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.9 | (t_pick_best_boost_by_payload_foo:\"simba leo\")^0.8 | (t_pick_best_boost_by_payload_foo:kimba)^0.75)", q.toString()); q = QParser.getParser("lion", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); assertEquals("((t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_distinct_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_distinct_boost_by_payload_foo:kimba)^0.75)", q.toString()); + + q = QParser.getParser("lion", req(params("df", "t_as_same_term_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_as_same_term_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_same_term_boost_by_payload_foo:\"simba leo\")^0.8 
(t_as_same_term_boost_by_payload_foo:kimba)^0.75)", q.toString()); } public void testSynonymsBoostByPayload_multiTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception { //tiger, tigre|0.9 //lynx => lince|0.8, lynx_canadensis|0.9 Query q = QParser.getParser("tiger lynx", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("(t_pick_best_boost_by_payload_foo:tiger | (t_pick_best_boost_by_payload_foo:tigre)^0.9)" + + assertEquals("((t_pick_best_boost_by_payload_foo:tigre)^0.9 | t_pick_best_boost_by_payload_foo:tiger)" + " ((t_pick_best_boost_by_payload_foo:lince)^0.8 | (t_pick_best_boost_by_payload_foo:lynx_canadensis)^0.9)", q.toString()); q = QParser.getParser("tiger lynx", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("(t_as_distinct_boost_by_payload_foo:tiger (t_as_distinct_boost_by_payload_foo:tigre)^0.9)" + + assertEquals("((t_as_distinct_boost_by_payload_foo:tigre)^0.9 t_as_distinct_boost_by_payload_foo:tiger)" + " ((t_as_distinct_boost_by_payload_foo:lince)^0.8 (t_as_distinct_boost_by_payload_foo:lynx_canadensis)^0.9)", q.toString()); + + q = QParser.getParser("tiger lynx", req(params("df", "t_as_same_term_boost_by_payload_foo"))).getQuery(); + assertEquals("Synonym(t_as_same_term_boost_by_payload_foo:tiger t_as_same_term_boost_by_payload_foo:tigre^0.9)" + + " Synonym(t_as_same_term_boost_by_payload_foo:lince^0.8 t_as_same_term_boost_by_payload_foo:lynx_canadensis^0.9)", q.toString()); } public void testSynonymsBoostByPayload_multiTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { @@ -1279,7 +1293,12 @@ public void testSynonymsBoostByPayload_multiTermQueryMultiTermSynonyms_shouldPar q = QParser.getParser("leopard lion", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); assertEquals("((t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:bagheera)^0.9 (t_as_distinct_boost_by_payload_foo:\"panthera 
pardus\")^0.85 t_as_distinct_boost_by_payload_foo:leopard)" + " ((t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_distinct_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_distinct_boost_by_payload_foo:kimba)^0.75)", q.toString()); - } + + q = QParser.getParser("leopard lion", req(params("df", "t_as_same_term_boost_by_payload_foo"))).getQuery(); + assertEquals("((t_as_same_term_boost_by_payload_foo:\"big cat\")^0.8 (t_as_same_term_boost_by_payload_foo:bagheera)^0.9 (t_as_same_term_boost_by_payload_foo:\"panthera pardus\")^0.85 t_as_same_term_boost_by_payload_foo:leopard)" + + " ((t_as_same_term_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_same_term_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_same_term_boost_by_payload_foo:kimba)^0.75)", q.toString()); + + } public void testSynonymsBoostByPayload_singleConceptQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { //snow leopard|1.0, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 @@ -1289,12 +1308,19 @@ public void testSynonymsBoostByPayload_singleConceptQueryMultiTermSynonyms_shoul q = QParser.getParser("snow leopard", req(params("df", "t_as_distinct_boost_by_payload_foo","sow", "false"))).getQuery(); assertEquals("((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:white_leopard)^0.6 t_as_distinct_boost_by_payload_foo:\"snow leopard\")", q.toString()); + q = QParser.getParser("snow leopard", req(params("df", "t_as_same_term_boost_by_payload_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_same_term_boost_by_payload_foo:\"big cat\")^0.8 (t_as_same_term_boost_by_payload_foo:white_leopard)^0.6 t_as_same_term_boost_by_payload_foo:\"snow leopard\")", q.toString()); + //panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65 q = QParser.getParser("panthera onca", req(params("df", 
"t_pick_best_boost_by_payload_foo","sow", "false"))).getQuery(); assertEquals("((t_pick_best_boost_by_payload_foo:jaguar)^0.95 | (t_pick_best_boost_by_payload_foo:\"big cat\")^0.85 | (t_pick_best_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); q = QParser.getParser("panthera onca", req(params("df", "t_as_distinct_boost_by_payload_foo","sow", "false"))).getQuery(); assertEquals("((t_as_distinct_boost_by_payload_foo:jaguar)^0.95 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.85 (t_as_distinct_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); + + q = QParser.getParser("panthera onca", req(params("df", "t_as_same_term_boost_by_payload_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boost_by_payload_foo:jaguar)^0.95 (t_as_same_term_boost_by_payload_foo:\"big cat\")^0.85 (t_as_same_term_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); + } public void testSynonymsBoostByPayload_multiConceptsQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { @@ -1307,7 +1333,12 @@ public void testSynonymsBoostByPayload_multiConceptsQueryMultiTermSynonyms_shoul q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_distinct_boost_by_payload_foo","sow", "false"))).getQuery(); assertEquals("((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:white_leopard)^0.6 t_as_distinct_boost_by_payload_foo:\"snow leopard\")" + " ((t_as_distinct_boost_by_payload_foo:jaguar)^0.95 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.85 (t_as_distinct_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); - } + + q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_same_term_boost_by_payload_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_same_term_boost_by_payload_foo:\"big cat\")^0.8 
(t_as_same_term_boost_by_payload_foo:white_leopard)^0.6 t_as_same_term_boost_by_payload_foo:\"snow leopard\")" + + " ((t_as_same_term_boost_by_payload_foo:jaguar)^0.95 (t_as_same_term_boost_by_payload_foo:\"big cat\")^0.85 (t_as_same_term_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); + + } public void testSynonymsBoostByPayload_edismaxBoost_shouldParseBoostedPhraseQuery() throws Exception { Query q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_pick_best_boost_by_payload_foo^10"))).getQuery(); @@ -1320,6 +1351,12 @@ public void testSynonymsBoostByPayload_edismaxBoost_shouldParseBoostedPhraseQuer assertEquals("+(" + "(((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:white_leopard)^0.6 t_as_distinct_boost_by_payload_foo:\"snow leopard\")^10.0)" + " (((t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_distinct_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_distinct_boost_by_payload_foo:kimba)^0.75)^10.0))", q.toString()); + + q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_as_same_term_boost_by_payload_foo^10"))).getQuery(); + assertEquals("+(" + + "(((t_as_same_term_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_same_term_boost_by_payload_foo:\"big cat\")^0.8 (t_as_same_term_boost_by_payload_foo:white_leopard)^0.6 t_as_same_term_boost_by_payload_foo:\"snow leopard\")^10.0)" + + " (((t_as_same_term_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_same_term_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_same_term_boost_by_payload_foo:kimba)^0.75)^10.0))", q.toString()); + } public void testSynonymsBoostByPayload_phraseQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedSpanQuery() throws Exception { From 6d9f1ee645bf1ccfae2ec0523cc8ec85e40d9712 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Mon, 27 Jan 2020 18:58:37 +0000 Subject: [PATCH 
19/36] [SOLR-12238] documentation improvement --- .../src/field-type-definitions-and-properties.adoc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc b/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc index c9ee82026551..c6e516ec7ad0 100644 --- a/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc +++ b/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc @@ -90,10 +90,14 @@ For multivalued fields, specifies a distance between multiple values, which prev `synonymQueryStyle`:: Query used to combine scores of overlapping query terms (i.e., synonyms). Consider a search for "blue tee" with query-time synonyms `tshirt,tee`. + -Use `as_same_term` (default) to blend terms, i.e., `SynonymQuery(tshirt,tee)` where each term will be treated as equally important. Use `pick_best` to select the most significant synonym when scoring `Dismax(tee,tshirt)`. Use `as_distinct_terms` to bias scoring towards the most significant synonym `(pants OR slacks)`. Use `*_boost_by_payload` ( 'pick_best_boost_by_payload' 'as_distinct_terms_boost_by_payload') to additionally apply a different boost to each synonym based on the synonym payload. +Use `as_same_term` (default) to blend terms, i.e., `SynonymQuery(tshirt,tee)` where each term will be treated as equally important. Use `pick_best` to select the most significant synonym when scoring `Dismax(tee,tshirt)`. Use `as_distinct_terms` to bias scoring towards the most significant synonym `(pants OR slacks)`. + `as_same_term` is appropriate when terms are true synonyms (television, tv). Use `pick_best` or `as_distinct_terms` when synonyms are expanding to hyponyms `(q=jeans w/ jeans\=>jeans,pants)` and you want exact to come before parent and sibling concepts. See this http://opensourceconnections.com/blog/2017/11/21/solr-synonyms-mea-culpa/[blog article]. 
+`boostByPayload`:: +This boolean parameter allow to apply an additional query time boost depending on the synonym payload. + + `enableGraphQueries`:: For text fields, applicable when querying with <> (which is the default for the `sow` parameter). Use `true`, the default, for field types with query analyzers including graph-aware filters, e.g., <> and <>. + From 7f70c0097fce38ae4c26a48260afd4b6ea4e0f0e Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 3 Feb 2020 12:30:58 +0000 Subject: [PATCH 20/36] LUCENE-9171: Add BoostAttribute handling to QueryBuilder --- .../org/apache/lucene/util/QueryBuilder.java | 60 +++++++++++++------ .../apache/lucene/util/TestQueryBuilder.java | 51 ++++++++++++++++ .../ComplexPhraseQueryParser.java | 4 +- .../solr/parser/SolrQueryParserBase.java | 10 ++-- 4 files changed, 101 insertions(+), 24 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index 31da91256517..e32dd8e3b5fe 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -30,6 +30,8 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostAttribute; +import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; @@ -63,6 +65,16 @@ public class QueryBuilder { protected boolean enableGraphQueries = true; protected boolean autoGenerateMultiTermSynonymsPhraseQuery = false; + public static class TermAndBoost { + public final Term term; + public final float boost; + + public TermAndBoost(Term term, float boost) { + this.term = term; + this.boost = boost; + } + } + /** Creates a new QueryBuilder using the given analyzer. 
*/ public QueryBuilder(Analyzer analyzer) { this.analyzer = analyzer; @@ -373,13 +385,14 @@ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOExcep */ protected Query analyzeTerm(String field, TokenStream stream) throws IOException { TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class); stream.reset(); if (!stream.incrementToken()) { throw new AssertionError(); } - return newTermQuery(new Term(field, termAtt.getBytesRef())); + return newTermQuery(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()); } /** @@ -387,24 +400,25 @@ protected Query analyzeTerm(String field, TokenStream stream) throws IOException */ protected Query analyzeBoolean(String field, TokenStream stream) throws IOException { TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class); stream.reset(); - List terms = new ArrayList<>(); + List terms = new ArrayList<>(); while (stream.incrementToken()) { - terms.add(new Term(field, termAtt.getBytesRef())); + terms.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost())); } - return newSynonymQuery(terms.toArray(new Term[terms.size()])); + return newSynonymQuery(terms.toArray(new TermAndBoost[0])); } - protected void add(BooleanQuery.Builder q, List current, BooleanClause.Occur operator) { + protected void add(BooleanQuery.Builder q, List current, BooleanClause.Occur operator) { if (current.isEmpty()) { return; } if (current.size() == 1) { - q.add(newTermQuery(current.get(0)), operator); + q.add(newTermQuery(current.get(0).term, current.get(0).boost), operator); } else { - q.add(newSynonymQuery(current.toArray(new Term[current.size()])), operator); + q.add(newSynonymQuery(current.toArray(new TermAndBoost[0])), operator); } } @@ -413,10 +427,11 @@ protected void add(BooleanQuery.Builder q, List 
current, BooleanClause.Occ */ protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException { BooleanQuery.Builder q = newBooleanQuery(); - List currentQuery = new ArrayList<>(); + List currentQuery = new ArrayList<>(); TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); + BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class); stream.reset(); while (stream.incrementToken()) { @@ -424,7 +439,7 @@ protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanCla add(q, currentQuery, operator); currentQuery.clear(); } - currentQuery.add(new Term(field, termAtt.getBytesRef())); + currentQuery.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost())); } add(q, currentQuery, operator); @@ -526,10 +541,17 @@ public Query next() { }; queryPos = newGraphSynonymQuery(queries); } else { - Term[] terms = graph.getTerms(field, start); + List attributes = graph.getTerms(start); + TermAndBoost[] terms = attributes.stream() + .map(s -> { + TermToBytesRefAttribute t = s.addAttribute(TermToBytesRefAttribute.class); + BoostAttribute b = s.addAttribute(BoostAttribute.class); + return new TermAndBoost(new Term(field, t.getBytesRef()), b.getBoost()); + }) + .toArray(TermAndBoost[]::new); assert terms.length > 0; if (terms.length == 1) { - queryPos = newTermQuery(terms[0]); + queryPos = newTermQuery(terms[0].term, terms[0].boost); } else { queryPos = newSynonymQuery(terms); } @@ -650,10 +672,10 @@ protected BooleanQuery.Builder newBooleanQuery() { * This is intended for subclasses that wish to customize the generated queries. 
* @return new Query instance */ - protected Query newSynonymQuery(Term terms[]) { - SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].field()); - for (Term term : terms) { - builder.addTerm(term); + protected Query newSynonymQuery(TermAndBoost[] terms) { + SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].term.field()); + for (TermAndBoost t : terms) { + builder.addTerm(t.term, t.boost); } return builder.build(); } @@ -683,8 +705,12 @@ protected Query newGraphSynonymQuery(Iterator queries) { * @param term term * @return new TermQuery instance */ - protected Query newTermQuery(Term term) { - return new TermQuery(term); + protected Query newTermQuery(Term term, float boost) { + Query q = new TermQuery(term); + if (boost == 1.0f) { + return q; + } + return new BoostQuery(q, boost); } /** diff --git a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java index 7289ead38eff..f2119d538c0c 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java @@ -20,6 +20,7 @@ import java.io.IOException; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.AnalyzerWrapper; import org.apache.lucene.analysis.CannedBinaryTokenStream; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockSynonymFilter; @@ -29,9 +30,12 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostAttribute; +import org.apache.lucene.search.BoostQuery; import 
org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; @@ -507,4 +511,51 @@ public void testMaxBooleanClause() throws Exception { expectThrows(IndexSearcher.TooManyClauses.class, () -> qb.analyzeGraphPhrase(ts, "", 0)); } } + + private static final class MockBoostTokenFilter extends TokenFilter { + + final BoostAttribute boostAtt = addAttribute(BoostAttribute.class); + final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + protected MockBoostTokenFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken() == false) { + return false; + } + if (termAtt.length() == 3) { + boostAtt.setBoost(0.5f); + } + return true; + } + } + + public void testTokenStreamBoosts() { + Analyzer msa = new MockSynonymAnalyzer(); + Analyzer a = new AnalyzerWrapper(msa.getReuseStrategy()) { + @Override + protected Analyzer getWrappedAnalyzer(String fieldName) { + return msa; + } + @Override + protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { + return new TokenStreamComponents(components.getSource(), new MockBoostTokenFilter(components.getTokenStream())); + } + }; + + QueryBuilder builder = new QueryBuilder(a); + Query q = builder.createBooleanQuery("field", "hot dogs"); + Query expected = new BooleanQuery.Builder() + .add(new BoostQuery(new TermQuery(new Term("field", "hot")), 0.5f), BooleanClause.Occur.SHOULD) + .add(new SynonymQuery.Builder("field") + .addTerm(new Term("field", "dogs")) + .addTerm(new Term("field", "dog"), 0.5f) + .build(), BooleanClause.Occur.SHOULD) + .build(); + + assertEquals(expected, q); + } } diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java index 
9a4043d1d8a3..d552aef9fe75 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java @@ -147,7 +147,7 @@ public Query parse(String query) throws ParseException { // to throw a runtime exception here if a term for another field is embedded // in phrase query @Override - protected Query newTermQuery(Term term) { + protected Query newTermQuery(Term term, float boost) { if (isPass2ResolvingPhrases) { try { checkPhraseClauseIsForSameField(term.field()); @@ -155,7 +155,7 @@ protected Query newTermQuery(Term term) { throw new RuntimeException("Error parsing complex phrase", pe); } } - return super.newTermQuery(term); + return super.newTermQuery(term, boost); } // Helper method used to report on any clauses that appear in query syntax diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 81516f695b6c..c2b2ad448292 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -601,19 +601,19 @@ protected Query newRegexpQuery(Term regexp) { } @Override - protected Query newSynonymQuery(Term terms[]) { + protected Query newSynonymQuery(TermAndBoost[] terms) { switch (synonymQueryStyle) { case PICK_BEST: List currPosnClauses = new ArrayList(terms.length); - for (Term term : terms) { - currPosnClauses.add(newTermQuery(term)); + for (TermAndBoost term : terms) { + currPosnClauses.add(newTermQuery(term.term, term.boost)); } DisjunctionMaxQuery dm = new DisjunctionMaxQuery(currPosnClauses, 0.0f); return dm; case AS_DISTINCT_TERMS: BooleanQuery.Builder builder = new BooleanQuery.Builder(); - for (Term term : terms) { - builder.add(newTermQuery(term), BooleanClause.Occur.SHOULD); + for (TermAndBoost term : terms) { + 
builder.add(newTermQuery(term.term, term.boost), BooleanClause.Occur.SHOULD); } return builder.build(); case AS_SAME_TERM: From 96e1ed31ff5b13d1d393b44820cfb1cfa8c1b83d Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 3 Feb 2020 14:09:35 +0000 Subject: [PATCH 21/36] imports --- .../core/src/test/org/apache/lucene/util/TestQueryBuilder.java | 1 - 1 file changed, 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java index f2119d538c0c..927dfd4080f9 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java @@ -30,7 +30,6 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; From 8f349f0e8bace37432e03b796c6838da70ec9cd8 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 3 Feb 2020 14:24:27 +0000 Subject: [PATCH 22/36] javadocs --- .../src/java/org/apache/lucene/util/QueryBuilder.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index e32dd8e3b5fe..b05610d6e9fa 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -65,10 +65,18 @@ public class QueryBuilder { protected boolean enableGraphQueries = true; protected boolean autoGenerateMultiTermSynonymsPhraseQuery = false; + /** + * Wraps a term and boost + */ public static class TermAndBoost { + /** the term */ public final Term term; + /** the boost */ 
public final float boost; + /** + * Creates a new TermAndBoost + */ public TermAndBoost(Term term, float boost) { this.term = term; this.boost = boost; From 9928a4a03e02db9dba8f682ec83887dc7cbada81 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Fri, 7 Feb 2020 15:06:00 +0000 Subject: [PATCH 23/36] [SOLR-12238] first implementation of the boostAttribute approach --- .../boost/DelimitedBoostTokenFilter.java | 63 +++++++++++++++++ .../DelimitedBoostTokenFilterFactory.java | 68 +++++++++++++++++++ ...he.lucene.analysis.util.TokenFilterFactory | 1 + .../org/apache/lucene/util/QueryBuilder.java | 55 +++++++++++++-- .../org/apache/solr/parser/QueryParser.java | 40 ++++++----- .../solr/parser/SolrQueryParserBase.java | 25 +++++++ .../solr/search/ExtendedDismaxQParser.java | 8 +-- .../solr/collection1/conf/schema12.xml | 12 ++-- .../solr/search/TestExtendedDismaxParser.java | 10 +-- 9 files changed, 235 insertions(+), 47 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java new file mode 100644 index 000000000000..21b1232fc238 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.boost; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.search.BoostAttribute; + +import java.io.IOException; + + +/** + * Characters before the delimiter are the "token", those after are the boost. + *

+ * For example, if the delimiter is '|', then for the string "foo|0.7", foo is the token + * and 0.7 is the boost. + *

+ * Note: make sure your Tokenizer doesn't split on the delimiter, or this won't work. + */ +public final class DelimitedBoostTokenFilter extends TokenFilter { + private final char delimiter; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final BoostAttribute boostAtt = addAttribute(BoostAttribute.class); + + public DelimitedBoostTokenFilter(TokenStream input, char delimiter) { + super(input); + this.delimiter = delimiter; + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + final char[] buffer = termAtt.buffer(); + final int length = termAtt.length(); + for (int i = 0; i < length; i++) { + if (buffer[i] == delimiter) { + float boost = Float.parseFloat(new String(buffer, i + 1, (length - (i + 1)))); + boostAtt.setBoost(boost); + termAtt.setLength(i); + return true; + } + } + // we have not seen the delimiter + boostAtt.setBoost(1.0f); + return true; + } else return false; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java new file mode 100644 index 000000000000..b87fa52b051b --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.boost; + + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; +import org.apache.lucene.analysis.payloads.FloatEncoder; +import org.apache.lucene.analysis.payloads.IdentityEncoder; +import org.apache.lucene.analysis.payloads.IntegerEncoder; +import org.apache.lucene.analysis.payloads.PayloadEncoder; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; + +/** + * Factory for {@link DelimitedBoostTokenFilter}. + *

+ * <fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.DelimitedBoostTokenFilterFactory" delimiter="|"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * @since 3.1 + * @lucene.spi {@value #NAME} + */ +public class DelimitedBoostTokenFilterFactory extends TokenFilterFactory { + + /** SPI name */ + public static final String NAME = "delimitedBoost"; + public static final String DELIMITER_ATTR = "delimiter"; + public static final char DEFAULT_DELIMITER = '|'; + + private final char delimiter; + + /** Creates a new DelimitedBoostTokenFilterFactory */ + public DelimitedBoostTokenFilterFactory(Map args) { + super(args); + delimiter = getChar(args, DELIMITER_ATTR, DEFAULT_DELIMITER); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public DelimitedBoostTokenFilter create(TokenStream input) { + return new DelimitedBoostTokenFilter(input, delimiter); + } + +} diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index 16fca20f84e7..fd13e6fc86ce 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -17,6 +17,7 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory org.apache.lucene.analysis.ar.ArabicStemFilterFactory org.apache.lucene.analysis.bg.BulgarianStemFilterFactory +org.apache.lucene.analysis.boost.DelimitedBoostTokenFilterFactory org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory org.apache.lucene.analysis.bn.BengaliStemFilterFactory org.apache.lucene.analysis.br.BrazilianStemFilterFactory diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index 0f169597b5b1..81d0dc8b722c 100644 --- 
a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -38,6 +38,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanBoostQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; @@ -369,13 +370,36 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato * returned. When multiple tokens, an ordered SpanNearQuery with slop 0 is returned. */ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOException { - List clonedAttributes = new ArrayList<>(); + TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class); + BoostAttribute boostAtt = in.addAttribute(BoostAttribute.class); + + SpanQuery result; + float boost = 1.0f; + if (termAtt == null) { + return null; + } + + List terms = new ArrayList<>(); while (in.incrementToken()) { - clonedAttributes.add(in.cloneAttributes()); + boost = boostAtt.getBoost(); + SpanQuery query = new SpanTermQuery(new Term(field, termAtt.getBytesRef())); + terms.add(query); + } + + if (terms.isEmpty()) { + return null; + } else if (terms.size() == 1) { + result = terms.get(0); + } else { + result = new SpanNearQuery(terms.toArray(new SpanQuery[0]), 0, true); + } + + if (boost != 1.0f) { + result = new SpanBoostQuery(result, boost); } - return newSpanQuery(field,clonedAttributes.toArray(new AttributeSource[clonedAttributes.size()])); + return result; } - + /** * Creates simple term query from the cached tokenstream contents */ @@ -446,12 +470,29 @@ protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanCla * Creates simple phrase query from the cached tokenstream contents */ protected Query analyzePhrase(String field, TokenStream stream, int slop) 
throws IOException { - List clonedAttributes = new LinkedList<>(); + PhraseQuery.Builder builder = new PhraseQuery.Builder(); + builder.setSlop(slop); + + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class); + PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); + int position = -1; + float phraseBoost = 1.0f; stream.reset(); while (stream.incrementToken()) { - clonedAttributes.add(stream.cloneAttributes()); + if (enablePositionIncrements) { + position += posIncrAtt.getPositionIncrement(); + } else { + position += 1; + } + builder.add(new Term(field, termAtt.getBytesRef()), position); + phraseBoost = boostAtt.getBoost(); + } + PhraseQuery query = builder.build(); + if (phraseBoost == 1.0f) { + return query; } - return newPhraseQuery(field,clonedAttributes.toArray(new AttributeSource[clonedAttributes.size()]),slop); + return new BoostQuery(query, phraseBoost); } /** diff --git a/solr/core/src/java/org/apache/solr/parser/QueryParser.java b/solr/core/src/java/org/apache/solr/parser/QueryParser.java index f2b792e46b0a..518cdefaa791 100644 --- a/solr/core/src/java/org/apache/solr/parser/QueryParser.java +++ b/solr/core/src/java/org/apache/solr/parser/QueryParser.java @@ -52,16 +52,14 @@ private static boolean allowedPostMultiTerm(int tokenKind) { @Override protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, - boolean quoted, boolean fieldAutoGenPhraseQueries, - boolean fieldEnableGraphQueries, - SynonymQueryStyle synonymQueryStyle) + boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries, + SynonymQueryStyle synonymQueryStyle) throws SyntaxError { setAutoGenerateMultiTermSynonymsPhraseQuery(fieldAutoGenPhraseQueries || getAutoGeneratePhraseQueries()); // Don't auto-quote graph-aware field queries boolean treatAsQuoted = getSplitOnWhitespace() ? 
(quoted || fieldAutoGenPhraseQueries || getAutoGeneratePhraseQueries()) : quoted; - return super.newFieldQuery(analyzer, field, queryText, treatAsQuoted, false, - fieldEnableGraphQueries, synonymQueryStyle); + return super.newFieldQuery(analyzer, field, queryText, treatAsQuoted, false, fieldEnableGraphQueries, synonymQueryStyle); } // * Query ::= ( Clause )* @@ -618,6 +616,22 @@ private boolean jj_2_3(int xla) { finally { jj_save(2, xla); } } + private boolean jj_3R_7() { + if (jj_scan_token(TERM)) return true; + return false; + } + + private boolean jj_3R_4() { + if (jj_scan_token(TERM)) return true; + if (jj_scan_token(COLON)) return true; + return false; + } + + private boolean jj_3_1() { + if (jj_3R_3()) return true; + return false; + } + private boolean jj_3R_6() { return false; } @@ -658,22 +672,6 @@ private boolean jj_3R_5() { return false; } - private boolean jj_3R_7() { - if (jj_scan_token(TERM)) return true; - return false; - } - - private boolean jj_3R_4() { - if (jj_scan_token(TERM)) return true; - if (jj_scan_token(COLON)) return true; - return false; - } - - private boolean jj_3_1() { - if (jj_3R_3()) return true; - return false; - } - /** Generated Token Manager. */ public QueryParserTokenManager token_source; /** Current token. */ diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 19ea38ce7dc6..fc5c7523f23c 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -21,6 +21,8 @@ import java.util.Collections; import java.util.EnumSet; import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -600,6 +602,29 @@ protected Query newRegexpQuery(Term regexp) { return query; } + /** + * Builds a new GraphQuery for multi-terms synonyms. + *

+ * This is intended for subclasses that wish to customize the generated queries. + * + * @return new Query instance + */ + @Override + protected Query newGraphSynonymQuery(Iterator sidePathQueriesIterator) { + switch (synonymQueryStyle) { + case PICK_BEST: { + List sidePathSynonymQueries = new LinkedList<>(); + sidePathQueriesIterator.forEachRemaining(sidePathSynonymQueries::add); + return new DisjunctionMaxQuery(sidePathSynonymQueries, 0.0f); + } + case AS_SAME_TERM: + case AS_DISTINCT_TERMS:{ + return super.newGraphSynonymQuery(sidePathQueriesIterator);} + default: + throw new AssertionError("unrecognized synonymQueryStyle passed when creating newSynonymQuery"); + } + } + @Override protected Query newSynonymQuery(TermAndBoost[] terms) { switch (synonymQueryStyle) { diff --git a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java index c3ff3f8b4946..de5700d9bd25 100644 --- a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java +++ b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java @@ -29,8 +29,6 @@ import java.util.Map; import java.util.Set; -import org.apache.commons.lang3.StringUtils; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.StopFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory; @@ -134,7 +132,7 @@ public Query parse() throws SyntaxError { parsedUserQuery = null; String userQuery = getString(); altUserQuery = null; - if (StringUtils.isBlank(userQuery)) { + if( userQuery == null || userQuery.trim().length() == 0 ) { // If no query is specified, we may have an alternate if (config.altQ != null) { QParser altQParser = subQuery(config.altQ, null); @@ -1087,7 +1085,7 @@ protected Query getPrefixQuery(String field, String val) throws SyntaxError { @Override protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, - boolean quoted, boolean 
fieldAutoGenPhraseQueries, boolean enableGraphQueries, boolean fieldSynonymsBoostByPayload, + boolean quoted, boolean fieldAutoGenPhraseQueries, boolean enableGraphQueries, SynonymQueryStyle synonymQueryStyle) throws SyntaxError { Analyzer actualAnalyzer; @@ -1102,7 +1100,7 @@ protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, } else { actualAnalyzer = parser.getReq().getSchema().getFieldType(field).getQueryAnalyzer(); } - return super.newFieldQuery(actualAnalyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, enableGraphQueries, fieldSynonymsBoostByPayload, synonymQueryStyle); + return super.newFieldQuery(actualAnalyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, enableGraphQueries, synonymQueryStyle); } @Override diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index 98bffcb3ed7e..ad72261e3d12 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -197,7 +197,7 @@ - + @@ -205,7 +205,7 @@ - + @@ -238,7 +238,7 @@ - + @@ -246,11 +246,11 @@ - + - + @@ -258,7 +258,7 @@ - + diff --git a/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java b/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java index e012453951cb..1f0b5690b90e 100644 --- a/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java @@ -241,12 +241,6 @@ public void testFocusQueryParser() { "q.alt",allq, "defType","edismax") ,allr); - - assertQ("ideographic space should be considered whitespace", - req("q","\u3000", - "q.alt",allq, - "defType","edismax") - ,allr); assertQ("expected doc is missing (using un-escaped edismax w/qf)", req("q", "literal:colon", @@ -2087,10 +2081,10 @@ protected Query getFieldQuery(String field, @Override protected Query 
newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted, boolean fieldAutoGenPhraseQueries, - boolean fieldEnableGraphQueries, boolean fieldSynonymsBoostByPayload, SynonymQueryStyle synonymQueryStyle) + boolean fieldEnableGraphQueries, SynonymQueryStyle synonymQueryStyle) throws SyntaxError { Query q = super.newFieldQuery - (analyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, fieldSynonymsBoostByPayload, synonymQueryStyle); + (analyzer, field, queryText, quoted, fieldAutoGenPhraseQueries, fieldEnableGraphQueries, synonymQueryStyle); if (q instanceof BooleanQuery) { boolean rewrittenSubQ = false; // dirty flag: rebuild the repacked query? BooleanQuery.Builder builder = newBooleanQuery(); From 71b5a43ba811740d737512468fa62bf8db724f4d Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Fri, 7 Feb 2020 15:25:44 +0000 Subject: [PATCH 24/36] [SOLR-12238] refinement of the boostAttribute approach --- .../DelimitedBoostTokenFilterFactory.java | 24 +++++------ .../apache/lucene/util/AttributeSource.java | 17 -------- .../org/apache/lucene/util/QueryBuilder.java | 7 ++-- .../org/apache/solr/parser/QueryParser.java | 40 ++++++++++--------- .../solr/parser/SolrQueryParserBase.java | 7 ---- .../org/apache/solr/schema/FieldType.java | 1 - .../org/apache/solr/schema/TextField.java | 11 ----- .../solr/search/ExtendedDismaxQParser.java | 4 +- .../solr/collection1/conf/schema12.xml | 27 +++++++------ .../solr/search/TestExtendedDismaxParser.java | 6 +++ .../solr/search/TestSolrQueryParser.java | 1 + ...field-type-definitions-and-properties.adoc | 4 -- 12 files changed, 58 insertions(+), 91 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java index b87fa52b051b..2f5be8fefc35 100644 --- 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java @@ -16,15 +16,7 @@ */ package org.apache.lucene.analysis.boost; - import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; -import org.apache.lucene.analysis.payloads.FloatEncoder; -import org.apache.lucene.analysis.payloads.IdentityEncoder; -import org.apache.lucene.analysis.payloads.IntegerEncoder; -import org.apache.lucene.analysis.payloads.PayloadEncoder; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.lucene.analysis.util.ResourceLoaderAware; import org.apache.lucene.analysis.util.TokenFilterFactory; import java.util.Map; @@ -39,19 +31,23 @@ * </analyzer> * </fieldType> * - * @since 3.1 * @lucene.spi {@value #NAME} + * @since 3.1 */ public class DelimitedBoostTokenFilterFactory extends TokenFilterFactory { - /** SPI name */ + /** + * SPI name + */ public static final String NAME = "delimitedBoost"; public static final String DELIMITER_ATTR = "delimiter"; public static final char DEFAULT_DELIMITER = '|'; - + private final char delimiter; - - /** Creates a new DelimitedPayloadTokenFilterFactory */ + + /** + * Creates a new DelimitedPayloadTokenFilterFactory + */ public DelimitedBoostTokenFilterFactory(Map args) { super(args); delimiter = getChar(args, DELIMITER_ATTR, DEFAULT_DELIMITER); @@ -64,5 +60,5 @@ public DelimitedBoostTokenFilterFactory(Map args) { public DelimitedBoostTokenFilter create(TokenStream input) { return new DelimitedBoostTokenFilter(input, delimiter); } - + } diff --git a/lucene/core/src/java/org/apache/lucene/util/AttributeSource.java b/lucene/core/src/java/org/apache/lucene/util/AttributeSource.java index 7e5f44341df8..e962fedc1deb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/AttributeSource.java +++ 
b/lucene/core/src/java/org/apache/lucene/util/AttributeSource.java @@ -171,23 +171,6 @@ static Class[] getAttributeInterfaces(final Class - * The caller must pass in a Class<? extends AttributeImpl> value. - * - * @return instance of the passed in AttributeImpl, or {@code null} if this AttributeSource - * does not contain the AttributeImpl. It is recommended to always use - * {@link #addAttributeImpl} even in consumers of TokenStreams, because you cannot - * know if a specific TokenStream really uses a specific AttributeImpl. - * {@link #addAttributeImpl} will automatically make the attribute impl available. - * If you want to only use the attribute , if it is available (to optimize - * consuming), use {@link #hasAttribute}. - */ - public final T getAttributeImpl(Class attClass) { - return attClass.cast(attributeImpls.get(attClass)); - } - /** Expert: Adds a custom AttributeImpl instance with one or more Attribute interfaces. *

NOTE: It is not guaranteed, that att is added to * the AttributeSource, because the provided attributes may already exist. diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index 81d0dc8b722c..e5718a2dcbd8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -379,11 +379,10 @@ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOExcep return null; } - List terms = new ArrayList<>(); + List terms = new ArrayList<>(); while (in.incrementToken()) { boost = boostAtt.getBoost(); - SpanQuery query = new SpanTermQuery(new Term(field, termAtt.getBytesRef())); - terms.add(query); + terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef()))); } if (terms.isEmpty()) { @@ -472,7 +471,7 @@ protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanCla protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException { PhraseQuery.Builder builder = new PhraseQuery.Builder(); builder.setSlop(slop); - + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); diff --git a/solr/core/src/java/org/apache/solr/parser/QueryParser.java b/solr/core/src/java/org/apache/solr/parser/QueryParser.java index 518cdefaa791..f2b792e46b0a 100644 --- a/solr/core/src/java/org/apache/solr/parser/QueryParser.java +++ b/solr/core/src/java/org/apache/solr/parser/QueryParser.java @@ -52,14 +52,16 @@ private static boolean allowedPostMultiTerm(int tokenKind) { @Override protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, - boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries, - SynonymQueryStyle 
synonymQueryStyle) + boolean quoted, boolean fieldAutoGenPhraseQueries, + boolean fieldEnableGraphQueries, + SynonymQueryStyle synonymQueryStyle) throws SyntaxError { setAutoGenerateMultiTermSynonymsPhraseQuery(fieldAutoGenPhraseQueries || getAutoGeneratePhraseQueries()); // Don't auto-quote graph-aware field queries boolean treatAsQuoted = getSplitOnWhitespace() ? (quoted || fieldAutoGenPhraseQueries || getAutoGeneratePhraseQueries()) : quoted; - return super.newFieldQuery(analyzer, field, queryText, treatAsQuoted, false, fieldEnableGraphQueries, synonymQueryStyle); + return super.newFieldQuery(analyzer, field, queryText, treatAsQuoted, false, + fieldEnableGraphQueries, synonymQueryStyle); } // * Query ::= ( Clause )* @@ -616,22 +618,6 @@ private boolean jj_2_3(int xla) { finally { jj_save(2, xla); } } - private boolean jj_3R_7() { - if (jj_scan_token(TERM)) return true; - return false; - } - - private boolean jj_3R_4() { - if (jj_scan_token(TERM)) return true; - if (jj_scan_token(COLON)) return true; - return false; - } - - private boolean jj_3_1() { - if (jj_3R_3()) return true; - return false; - } - private boolean jj_3R_6() { return false; } @@ -672,6 +658,22 @@ private boolean jj_3R_5() { return false; } + private boolean jj_3R_7() { + if (jj_scan_token(TERM)) return true; + return false; + } + + private boolean jj_3R_4() { + if (jj_scan_token(TERM)) return true; + if (jj_scan_token(COLON)) return true; + return false; + } + + private boolean jj_3_1() { + if (jj_3R_3()) return true; + return false; + } + /** Generated Token Manager. */ public QueryParserTokenManager token_source; /** Current token. 
*/ diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index fc5c7523f23c..a4084d1509de 100644 --- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -602,13 +602,6 @@ protected Query newRegexpQuery(Term regexp) { return query; } - /** - * Builds a new GraphQuery for multi-terms synonyms. - *

- * This is intended for subclasses that wish to customize the generated queries. - * - * @return new Query instance - */ @Override protected Query newGraphSynonymQuery(Iterator sidePathQueriesIterator) { switch (synonymQueryStyle) { diff --git a/solr/core/src/java/org/apache/solr/schema/FieldType.java b/solr/core/src/java/org/apache/solr/schema/FieldType.java index 937b68cb9a62..e344019ca3d8 100644 --- a/solr/core/src/java/org/apache/solr/schema/FieldType.java +++ b/solr/core/src/java/org/apache/solr/schema/FieldType.java @@ -1063,7 +1063,6 @@ protected void checkSupportsDocValues() { protected static final String AUTO_GENERATE_PHRASE_QUERIES = "autoGeneratePhraseQueries"; protected static final String ENABLE_GRAPH_QUERIES = "enableGraphQueries"; - protected static final String BOOST_BY_PAYLOAD = "boostByPayload"; private static final String ARGS = "args"; private static final String POSITION_INCREMENT_GAP = "positionIncrementGap"; protected static final String SYNONYM_QUERY_STYLE = "synonymQueryStyle"; diff --git a/solr/core/src/java/org/apache/solr/schema/TextField.java b/solr/core/src/java/org/apache/solr/schema/TextField.java index daaea9334378..bddaf00c760c 100644 --- a/solr/core/src/java/org/apache/solr/schema/TextField.java +++ b/solr/core/src/java/org/apache/solr/schema/TextField.java @@ -43,7 +43,6 @@ public class TextField extends FieldType { protected boolean autoGeneratePhraseQueries; protected boolean enableGraphQueries; - protected boolean synonymBoostByPayload; protected SolrQueryParserBase.SynonymQueryStyle synonymQueryStyle; /** @@ -88,12 +87,6 @@ protected void init(IndexSchema schema, Map args) { if (enableGraphQueriesStr != null) enableGraphQueries = Boolean.parseBoolean(enableGraphQueriesStr); - boolean boostByPayload = false; - String boostByPayloadStr = args.remove(BOOST_BY_PAYLOAD); - if (boostByPayloadStr != null) - boostByPayload = Boolean.parseBoolean(boostByPayloadStr); - this.synonymBoostByPayload = boostByPayload; - super.init(schema, 
args); } @@ -120,10 +113,6 @@ public boolean getEnableGraphQueries() { return enableGraphQueries; } - public boolean getSynonymBoostByPayload() { - return synonymBoostByPayload; - } - public SolrQueryParserBase.SynonymQueryStyle getSynonymQueryStyle() {return synonymQueryStyle;} @Override diff --git a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java index de5700d9bd25..93aaf28f1dd2 100644 --- a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java +++ b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java @@ -29,6 +29,8 @@ import java.util.Map; import java.util.Set; +import org.apache.commons.lang3.StringUtils; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.StopFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory; @@ -132,7 +134,7 @@ public Query parse() throws SyntaxError { parsedUserQuery = null; String userQuery = getString(); altUserQuery = null; - if( userQuery == null || userQuery.trim().length() == 0 ) { + if (StringUtils.isBlank(userQuery)) { // If no query is specified, we may have an alternate if (config.altQ != null) { QParser altQParser = subQuery(config.altQ, null); diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index ad72261e3d12..299f97949ae6 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -197,18 +197,7 @@ - - - - - - - - - - - - + @@ -238,6 +227,18 @@ + + + + + + + + + + + + @@ -689,8 +690,8 @@ - + diff --git a/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java b/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java index 1f0b5690b90e..e8d7dbd16e61 100644 --- a/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java +++ 
b/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java @@ -241,6 +241,12 @@ public void testFocusQueryParser() { "q.alt",allq, "defType","edismax") ,allr); + + assertQ("ideographic space should be considered whitespace", + req("q","\u3000", + "q.alt",allq, + "defType","edismax") + ,allr); assertQ("expected doc is missing (using un-escaped edismax w/qf)", req("q", "literal:colon", diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java index f639b33acd02..7d3a9cf6e8c6 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java @@ -1209,6 +1209,7 @@ public void testShingleQueries() throws Exception { public void testSynonymQueryStyle() throws Exception { + Query q = QParser.getParser("tabby", req(params("df", "t_pick_best_foo"))).getQuery(); assertEquals("(t_pick_best_foo:tabbi | t_pick_best_foo:cat | t_pick_best_foo:felin | t_pick_best_foo:anim)", q.toString()); diff --git a/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc b/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc index c6e516ec7ad0..487cd4c348a8 100644 --- a/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc +++ b/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc @@ -94,10 +94,6 @@ Use `as_same_term` (default) to blend terms, i.e., `SynonymQuery(tshirt,tee)` wh + `as_same_term` is appropriate when terms are true synonyms (television, tv). Use `pick_best` or `as_distinct_terms` when synonyms are expanding to hyponyms `(q=jeans w/ jeans\=>jeans,pants)` and you want exact to come before parent and sibling concepts. See this http://opensourceconnections.com/blog/2017/11/21/solr-synonyms-mea-culpa/[blog article]. -`boostByPayload`:: -This boolean parameter allow to apply an additional query time boost depending on the synonym payload. 
- - `enableGraphQueries`:: For text fields, applicable when querying with <> (which is the default for the `sow` parameter). Use `true`, the default, for field types with query analyzers including graph-aware filters, e.g., <> and <>. + From a75d6183cf1015653d027801138b49125e957f79 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Fri, 7 Feb 2020 16:02:31 +0000 Subject: [PATCH 25/36] [SOLR-12238] test for boost token filter --- .../boost/DelimitedBoostTokenFilterTest.java | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterTest.java diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterTest.java new file mode 100644 index 000000000000..8b9d69000af6 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.boost; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.search.BoostAttribute; + +public class DelimitedBoostTokenFilterTest extends BaseTokenStreamTestCase { + + public void testBoosts() throws Exception { + String test = "The quick|0.4 red|0.5 fox|0.2 jumped|0.1 over the lazy|0.8 brown|0.9 dogs|0.9"; + DelimitedBoostTokenFilter filter = new DelimitedBoostTokenFilter + (whitespaceMockTokenizer(test), + DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER); + CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class); + BoostAttribute boostAtt = filter.addAttribute(BoostAttribute.class); + filter.reset(); + assertTermEquals("The", filter, termAtt, boostAtt, 1.0f); + assertTermEquals("quick", filter, termAtt, boostAtt, 0.4f); + assertTermEquals("red", filter, termAtt, boostAtt, 0.5f); + assertTermEquals("fox", filter, termAtt, boostAtt, 0.2f); + assertTermEquals("jumped", filter, termAtt, boostAtt, 0.1f); + assertTermEquals("over", filter, termAtt, boostAtt, 1.0f); + assertTermEquals("the", filter, termAtt, boostAtt, 1.0f); + assertTermEquals("lazy", filter, termAtt, boostAtt, 0.8f); + assertTermEquals("brown", filter, termAtt, boostAtt, 0.9f); + assertTermEquals("dogs", filter, termAtt, boostAtt, 0.9f); + assertFalse(filter.incrementToken()); + filter.end(); + filter.close(); + } + + public void testNext() throws Exception { + String test = "The quick|0.1 red|0.2 fox|0.3 jumped|0.4 over the lazy|0.5 brown|0.6 dogs|0.6"; + DelimitedBoostTokenFilter filter = new DelimitedBoostTokenFilter + (whitespaceMockTokenizer(test), + DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER); + filter.reset(); + assertTermEquals("The", filter, 1.0f); + assertTermEquals("quick", filter, 0.1f); + assertTermEquals("red", filter, 0.2f); + assertTermEquals("fox", filter, 0.3f); + 
assertTermEquals("jumped", filter, 0.4f); + assertTermEquals("over", filter, 1.0f); + assertTermEquals("the", filter, 1.0f); + assertTermEquals("lazy", filter, 0.5f); + assertTermEquals("brown", filter, 0.6f); + assertTermEquals("dogs", filter, 0.6f); + assertFalse(filter.incrementToken()); + filter.end(); + filter.close(); + } + + void assertTermEquals(String expected, TokenStream stream, float expectedBoost) throws Exception { + CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class); + BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class); + assertTrue(stream.incrementToken()); + assertEquals(expected, termAtt.toString()); + float actualBoost = boostAtt.getBoost(); + assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost); + } + + void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, BoostAttribute boostAtt, float expectedBoost) throws Exception { + assertTrue(stream.incrementToken()); + assertEquals(expected, termAtt.toString()); + float actualBoost = boostAtt.getBoost(); + assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost); + } +} From 61995cdc48abd511eb065b9ebf48dbc0b4e4e587 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Fri, 7 Feb 2020 16:05:13 +0000 Subject: [PATCH 26/36] [SOLR-12238] package info fix --- .../lucene/analysis/boost/package-info.java | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java new file mode 100644 index 000000000000..9bae5dc4b235 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) 
under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Provides various convenience classes for creating boosts on Tokens. + */ +package org.apache.lucene.analysis.boost; From 52d848a9961e19b07ad05d5368aec8e89a3d9d4a Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Fri, 7 Feb 2020 17:29:47 +0000 Subject: [PATCH 27/36] [SOLR-12238] adjustments of the github PR feedback --- .../boost/DelimitedBoostTokenFilter.java | 2 - .../org/apache/lucene/util/QueryBuilder.java | 24 +- .../graph/GraphTokenStreamFiniteStrings.java | 11 + .../solr/collection1/conf/schema12.xml | 12 +- .../solr/collection1/conf/synonyms.txt | 6 +- .../solr/search/TestSolrQueryParser.java | 245 +++++++++++------- 6 files changed, 183 insertions(+), 117 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java index 21b1232fc238..b34b1c8a628d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java @@ -55,8 +55,6 @@ public boolean incrementToken() throws IOException { return true; } } - 
// we have not seen the delimiter - boostAtt.setBoost(1.0f); return true; } else return false; } diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index e5718a2dcbd8..6f5bfe494596 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -70,6 +70,7 @@ public class QueryBuilder { * Wraps a term and boost */ public static class TermAndBoost { + private static final float DEFAULT_BOOST = 1.0f; /** the term */ public final Term term; /** the boost */ @@ -374,14 +375,14 @@ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOExcep BoostAttribute boostAtt = in.addAttribute(BoostAttribute.class); SpanQuery result; - float boost = 1.0f; + float boost = TermAndBoost.DEFAULT_BOOST; if (termAtt == null) { return null; } List terms = new ArrayList<>(); while (in.incrementToken()) { - boost = boostAtt.getBoost(); + boost *= boostAtt.getBoost(); terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef()))); } @@ -393,7 +394,7 @@ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOExcep result = new SpanNearQuery(terms.toArray(new SpanQuery[0]), 0, true); } - if (boost != 1.0f) { + if (boost != TermAndBoost.DEFAULT_BOOST) { result = new SpanBoostQuery(result, boost); } return result; @@ -476,7 +477,7 @@ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); int position = -1; - float phraseBoost = 1.0f; + float phraseBoost = TermAndBoost.DEFAULT_BOOST; stream.reset(); while (stream.incrementToken()) { if (enablePositionIncrements) { @@ -485,10 +486,10 @@ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws position += 1; } builder.add(new 
Term(field, termAtt.getBytesRef()), position); - phraseBoost = boostAtt.getBoost(); + phraseBoost *= boostAtt.getBoost(); } PhraseQuery query = builder.build(); - if (phraseBoost == 1.0f) { + if (phraseBoost == TermAndBoost.DEFAULT_BOOST) { return query; } return new BoostQuery(query, phraseBoost); @@ -565,14 +566,7 @@ public Query next() { }; positionalQuery = newGraphSynonymQuery(queries); } else { - List attributes = graph.getTerms(start); - TermAndBoost[] terms = attributes.stream() - .map(s -> { - TermToBytesRefAttribute t = s.addAttribute(TermToBytesRefAttribute.class); - BoostAttribute b = s.addAttribute(BoostAttribute.class); - return new TermAndBoost(new Term(field, t.getBytesRef()), b.getBoost()); - }) - .toArray(TermAndBoost[]::new); + TermAndBoost[] terms = graph.getTermsAndBoosts(field,start); assert terms.length > 0; if (terms.length == 1) { positionalQuery = newTermQuery(terms[0].term, terms[0].boost); @@ -731,7 +725,7 @@ protected Query newGraphSynonymQuery(Iterator queries) { */ protected Query newTermQuery(Term term, float boost) { Query q = new TermQuery(term); - if (boost == 1.0f) { + if (boost == TermAndBoost.DEFAULT_BOOST) { return q; } return new BoostQuery(q, boost); diff --git a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java index b2b530d93afb..fef0fb221d8e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java +++ b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java @@ -30,9 +30,11 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.index.Term; +import org.apache.lucene.search.BoostAttribute; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.IntsRef; 
+import org.apache.lucene.util.QueryBuilder; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.FiniteStringsIterator; import org.apache.lucene.util.automaton.Operations; @@ -124,6 +126,15 @@ public Term[] getTerms(String field, int state) { .toArray(Term[]::new); } + /** + * Returns the list of terms that start at the provided state + */ + public QueryBuilder.TermAndBoost[] getTermsAndBoosts(String field, int state) { + return getTerms(state).stream() + .map(s -> new QueryBuilder.TermAndBoost(new Term(field, s.addAttribute(TermToBytesRefAttribute.class).getBytesRef()),s.addAttribute(BoostAttribute.class).getBoost())) + .toArray(QueryBuilder.TermAndBoost[]::new); + } + /** * Get all finite strings from the automaton. */ diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index 299f97949ae6..d4cb89e85f87 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -227,7 +227,7 @@ - + @@ -239,7 +239,7 @@ - + @@ -251,7 +251,7 @@ - + @@ -691,9 +691,9 @@ - - - + + + diff --git a/solr/core/src/test-files/solr/collection1/conf/synonyms.txt b/solr/core/src/test-files/solr/collection1/conf/synonyms.txt index a71ddc3ba718..1dad43d3a17c 100644 --- a/solr/core/src/test-files/solr/collection1/conf/synonyms.txt +++ b/solr/core/src/test-files/solr/collection1/conf/synonyms.txt @@ -46,5 +46,9 @@ lynx => lince|0.8, lynx_canadensis|0.9 leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 lion => panthera leo|0.9, simba leo|0.8, kimba|0.75 +panthera pardus, leopard|0.6 +panthera tigris => tiger|0.99 + snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 -panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65 \ No newline at end of file +panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65 +panthera blytheae, oldest|0.5 ancient|0.9 panthera \ No 
newline at end of file diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java index 7d3a9cf6e8c6..6593c9dcb4fa 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java @@ -1224,163 +1224,222 @@ public void testSynonymQueryStyle() throws Exception { assertEquals("(t_pick_best_foo:\"denim pant\" | t_pick_best_foo:jean)", q.toString()); } - public void testSynonymsBoostByPayload_singleTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception { + public void testSynonymsBoost_singleTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception { //tiger, tigre|0.9 - Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:tigre)^0.9 | t_pick_best_boost_by_payload_foo:tiger)", q.toString()); + Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)", q.toString()); - q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("(t_as_distinct_boost_by_payload_foo:tigre)^0.9 t_as_distinct_boost_by_payload_foo:tiger", q.toString()); + q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("(t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger", q.toString()); - q = QParser.getParser("tiger", req(params("df", "t_as_same_term_boost_by_payload_foo"))).getQuery(); - assertEquals("Synonym(t_as_same_term_boost_by_payload_foo:tiger t_as_same_term_boost_by_payload_foo:tigre^0.9)", q.toString()); + q = QParser.getParser("tiger", req(params("df", "t_as_same_term_boosted_foo"))).getQuery(); + 
assertEquals("Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)", q.toString()); //lynx => lince|0.8, lynx_canadensis|0.9 - q = QParser.getParser("lynx", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:lince)^0.8 | (t_pick_best_boost_by_payload_foo:lynx_canadensis)^0.9)", q.toString()); + q = QParser.getParser("lynx", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:lince)^0.8 | (t_pick_best_boosted_foo:lynx_canadensis)^0.9)", q.toString()); - q = QParser.getParser("lynx", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("(t_as_distinct_boost_by_payload_foo:lince)^0.8 (t_as_distinct_boost_by_payload_foo:lynx_canadensis)^0.9", q.toString()); + q = QParser.getParser("lynx", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("(t_as_distinct_boosted_foo:lince)^0.8 (t_as_distinct_boosted_foo:lynx_canadensis)^0.9", q.toString()); - q = QParser.getParser("lynx", req(params("df", "t_as_same_term_boost_by_payload_foo"))).getQuery(); - assertEquals("Synonym(t_as_same_term_boost_by_payload_foo:lince^0.8 t_as_same_term_boost_by_payload_foo:lynx_canadensis^0.9)", q.toString()); + q = QParser.getParser("lynx", req(params("df", "t_as_same_term_boosted_foo"))).getQuery(); + assertEquals("Synonym(t_as_same_term_boosted_foo:lince^0.8 t_as_same_term_boosted_foo:lynx_canadensis^0.9)", q.toString()); } - public void testSynonymsBoostByPayload_singleTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { + public void testSynonymsBoost_singleTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { //leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 - Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:\"big cat\")^0.8 | 
(t_pick_best_boost_by_payload_foo:bagheera)^0.9 | (t_pick_best_boost_by_payload_foo:\"panthera pardus\")^0.85 | t_pick_best_boost_by_payload_foo:leopard)", q.toString()); + Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)", q.toString()); - q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:bagheera)^0.9 (t_as_distinct_boost_by_payload_foo:\"panthera pardus\")^0.85 t_as_distinct_boost_by_payload_foo:leopard)", q.toString()); + q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)", q.toString()); - q = QParser.getParser("leopard", req(params("df", "t_as_same_term_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_as_same_term_boost_by_payload_foo:\"big cat\")^0.8 (t_as_same_term_boost_by_payload_foo:bagheera)^0.9 (t_as_same_term_boost_by_payload_foo:\"panthera pardus\")^0.85 t_as_same_term_boost_by_payload_foo:leopard)", q.toString()); + q = QParser.getParser("leopard", req(params("df", "t_as_same_term_boosted_foo"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:bagheera)^0.9 (t_as_same_term_boosted_foo:\"panthera pardus\")^0.85 t_as_same_term_boosted_foo:leopard)", q.toString()); //lion => panthera leo|0.9, simba leo|0.8, kimba|0.75 - q = QParser.getParser("lion", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:\"panthera 
leo\")^0.9 | (t_pick_best_boost_by_payload_foo:\"simba leo\")^0.8 | (t_pick_best_boost_by_payload_foo:kimba)^0.75)", q.toString()); + q = QParser.getParser("lion", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75)", q.toString()); - q = QParser.getParser("lion", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_distinct_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_distinct_boost_by_payload_foo:kimba)^0.75)", q.toString()); + q = QParser.getParser("lion", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)", q.toString()); - q = QParser.getParser("lion", req(params("df", "t_as_same_term_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_as_same_term_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_same_term_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_same_term_boost_by_payload_foo:kimba)^0.75)", q.toString()); + q = QParser.getParser("lion", req(params("df", "t_as_same_term_boosted_foo"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)", q.toString()); } - public void testSynonymsBoostByPayload_multiTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception { + public void testSynonymsBoost_multiTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception { //tiger, tigre|0.9 //lynx => lince|0.8, lynx_canadensis|0.9 - Query q = QParser.getParser("tiger lynx", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:tigre)^0.9 | 
t_pick_best_boost_by_payload_foo:tiger)" + - " ((t_pick_best_boost_by_payload_foo:lince)^0.8 | (t_pick_best_boost_by_payload_foo:lynx_canadensis)^0.9)", q.toString()); + Query q = QParser.getParser("tiger lynx", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)" + + " ((t_pick_best_boosted_foo:lince)^0.8 | (t_pick_best_boosted_foo:lynx_canadensis)^0.9)", q.toString()); - q = QParser.getParser("tiger lynx", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_as_distinct_boost_by_payload_foo:tigre)^0.9 t_as_distinct_boost_by_payload_foo:tiger)" + - " ((t_as_distinct_boost_by_payload_foo:lince)^0.8 (t_as_distinct_boost_by_payload_foo:lynx_canadensis)^0.9)", q.toString()); + q = QParser.getParser("tiger lynx", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger)" + + " ((t_as_distinct_boosted_foo:lince)^0.8 (t_as_distinct_boosted_foo:lynx_canadensis)^0.9)", q.toString()); - q = QParser.getParser("tiger lynx", req(params("df", "t_as_same_term_boost_by_payload_foo"))).getQuery(); - assertEquals("Synonym(t_as_same_term_boost_by_payload_foo:tiger t_as_same_term_boost_by_payload_foo:tigre^0.9)" + - " Synonym(t_as_same_term_boost_by_payload_foo:lince^0.8 t_as_same_term_boost_by_payload_foo:lynx_canadensis^0.9)", q.toString()); + q = QParser.getParser("tiger lynx", req(params("df", "t_as_same_term_boosted_foo"))).getQuery(); + assertEquals("Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)" + + " Synonym(t_as_same_term_boosted_foo:lince^0.8 t_as_same_term_boosted_foo:lynx_canadensis^0.9)", q.toString()); } - public void testSynonymsBoostByPayload_multiTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { + public void testSynonymsBoost_multiTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws 
Exception { //leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 //lion => panthera leo|0.9, simba leo|0.8, kimba|0.75 - Query q = QParser.getParser("leopard lion", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:\"big cat\")^0.8 | (t_pick_best_boost_by_payload_foo:bagheera)^0.9 | (t_pick_best_boost_by_payload_foo:\"panthera pardus\")^0.85 | t_pick_best_boost_by_payload_foo:leopard)" + - " ((t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.9 | (t_pick_best_boost_by_payload_foo:\"simba leo\")^0.8 | (t_pick_best_boost_by_payload_foo:kimba)^0.75)", q.toString()); + Query q = QParser.getParser("leopard lion", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)" + + " ((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75)", q.toString()); - q = QParser.getParser("leopard lion", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:bagheera)^0.9 (t_as_distinct_boost_by_payload_foo:\"panthera pardus\")^0.85 t_as_distinct_boost_by_payload_foo:leopard)" + - " ((t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_distinct_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_distinct_boost_by_payload_foo:kimba)^0.75)", q.toString()); + q = QParser.getParser("leopard lion", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)" + + " ((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 
(t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)", q.toString()); - q = QParser.getParser("leopard lion", req(params("df", "t_as_same_term_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_as_same_term_boost_by_payload_foo:\"big cat\")^0.8 (t_as_same_term_boost_by_payload_foo:bagheera)^0.9 (t_as_same_term_boost_by_payload_foo:\"panthera pardus\")^0.85 t_as_same_term_boost_by_payload_foo:leopard)" + - " ((t_as_same_term_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_same_term_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_same_term_boost_by_payload_foo:kimba)^0.75)", q.toString()); + q = QParser.getParser("leopard lion", req(params("df", "t_as_same_term_boosted_foo"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:bagheera)^0.9 (t_as_same_term_boosted_foo:\"panthera pardus\")^0.85 t_as_same_term_boosted_foo:leopard)" + + " ((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)", q.toString()); } - public void testSynonymsBoostByPayload_singleConceptQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { + public void testSynonymsBoost_singleConceptQuerySingleTermSynonym_shouldParseBoostedQuery() throws Exception { + //panthera pardus, leopard|0.6 + Query q = QParser.getParser("panthera pardus story",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:leopard)^0.6 | t_pick_best_boosted_foo:\"panthera pardus\") t_pick_best_boosted_foo:story", q.toString()); + + q = QParser.getParser("panthera pardus story", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:leopard)^0.6 t_as_distinct_boosted_foo:\"panthera pardus\") t_as_distinct_boosted_foo:story", q.toString()); + + q = QParser.getParser("panthera pardus story", req(params("df", 
"t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:leopard)^0.6 t_as_same_term_boosted_foo:\"panthera pardus\") t_as_same_term_boosted_foo:story", q.toString()); + + //panthera tigris => tiger|0.99 + q = QParser.getParser("panthera tigris story", req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("(t_pick_best_boosted_foo:tiger)^0.99 t_pick_best_boosted_foo:story", q.toString()); + + q = QParser.getParser("panthera tigris story", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("(t_as_distinct_boosted_foo:tiger)^0.99 t_as_distinct_boosted_foo:story", q.toString()); + + q = QParser.getParser("panthera tigris story", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("(t_as_same_term_boosted_foo:tiger)^0.99 t_as_same_term_boosted_foo:story", q.toString()); + } + + public void testSynonymsBoost_singleConceptQueryMultiTermSynonymWithMultipleBoost_shouldParseMultiplicativeBoostedQuery() throws Exception { + //panthera blytheae, oldest|0.5 ancient|0.9 panthera + Query q = QParser.getParser("panthera blytheae",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"oldest ancient panthera\")^0.45 | t_pick_best_boosted_foo:\"panthera blytheae\")", q.toString()); + + q = QParser.getParser("panthera blytheae", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"oldest ancient panthera\")^0.45 t_as_distinct_boosted_foo:\"panthera blytheae\")", q.toString()); + + q = QParser.getParser("panthera blytheae", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:\"oldest ancient panthera\")^0.45 t_as_same_term_boosted_foo:\"panthera blytheae\")", q.toString()); + } + + public void 
testSynonymsBoost_singleConceptQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { //snow leopard|1.0, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 - Query q = QParser.getParser("snow leopard",req(params("df", "t_pick_best_boost_by_payload_foo","sow", "false"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:\"panthera uncia\")^0.9 | (t_pick_best_boost_by_payload_foo:\"big cat\")^0.8 | (t_pick_best_boost_by_payload_foo:white_leopard)^0.6 | t_pick_best_boost_by_payload_foo:\"snow leopard\")", q.toString()); + Query q = QParser.getParser("snow leopard",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\")", q.toString()); - q = QParser.getParser("snow leopard", req(params("df", "t_as_distinct_boost_by_payload_foo","sow", "false"))).getQuery(); - assertEquals("((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:white_leopard)^0.6 t_as_distinct_boost_by_payload_foo:\"snow leopard\")", q.toString()); + q = QParser.getParser("snow leopard", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")", q.toString()); - q = QParser.getParser("snow leopard", req(params("df", "t_as_same_term_boost_by_payload_foo","sow", "false"))).getQuery(); - assertEquals("((t_as_same_term_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_same_term_boost_by_payload_foo:\"big cat\")^0.8 (t_as_same_term_boost_by_payload_foo:white_leopard)^0.6 t_as_same_term_boost_by_payload_foo:\"snow leopard\")", q.toString()); + q = 
QParser.getParser("snow leopard", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")", q.toString()); //panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65 - q = QParser.getParser("panthera onca", req(params("df", "t_pick_best_boost_by_payload_foo","sow", "false"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:jaguar)^0.95 | (t_pick_best_boost_by_payload_foo:\"big cat\")^0.85 | (t_pick_best_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); + q = QParser.getParser("panthera onca", req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:jaguar)^0.95 | (t_pick_best_boosted_foo:\"big cat\")^0.85 | (t_pick_best_boosted_foo:\"black panther\")^0.65)", q.toString()); - q = QParser.getParser("panthera onca", req(params("df", "t_as_distinct_boost_by_payload_foo","sow", "false"))).getQuery(); - assertEquals("((t_as_distinct_boost_by_payload_foo:jaguar)^0.95 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.85 (t_as_distinct_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); + q = QParser.getParser("panthera onca", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:jaguar)^0.95 (t_as_distinct_boosted_foo:\"big cat\")^0.85 (t_as_distinct_boosted_foo:\"black panther\")^0.65)", q.toString()); - q = QParser.getParser("panthera onca", req(params("df", "t_as_same_term_boost_by_payload_foo","sow", "false"))).getQuery(); - assertEquals("((t_as_same_term_boost_by_payload_foo:jaguar)^0.95 (t_as_same_term_boost_by_payload_foo:\"big cat\")^0.85 (t_as_same_term_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); + q = QParser.getParser("panthera onca", req(params("df", 
"t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:jaguar)^0.95 (t_as_same_term_boosted_foo:\"big cat\")^0.85 (t_as_same_term_boosted_foo:\"black panther\")^0.65)", q.toString()); } - public void testSynonymsBoostByPayload_multiConceptsQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { + public void testSynonymsBoost_multiConceptQuerySingleTermSynonym_shouldParseBoostedQuery() throws Exception { + //panthera pardus, leopard|0.6 + //tiger, tigre|0.9 + Query q = QParser.getParser("panthera pardus tiger",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:leopard)^0.6 | t_pick_best_boosted_foo:\"panthera pardus\") ((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)", q.toString()); + + q = QParser.getParser("panthera pardus tiger", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:leopard)^0.6 t_as_distinct_boosted_foo:\"panthera pardus\") ((t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger)", q.toString()); + + q = QParser.getParser("panthera pardus tiger", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:leopard)^0.6 t_as_same_term_boosted_foo:\"panthera pardus\") Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)", q.toString()); + } + + public void testSynonymsBoost_multiConceptsQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { //snow leopard|1.0, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 //panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65 - Query q = QParser.getParser("snow leopard panthera onca",req(params("df", "t_pick_best_boost_by_payload_foo","sow", "false"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:\"panthera uncia\")^0.9 | (t_pick_best_boost_by_payload_foo:\"big 
cat\")^0.8 | (t_pick_best_boost_by_payload_foo:white_leopard)^0.6 | t_pick_best_boost_by_payload_foo:\"snow leopard\")" + - " ((t_pick_best_boost_by_payload_foo:jaguar)^0.95 | (t_pick_best_boost_by_payload_foo:\"big cat\")^0.85 | (t_pick_best_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); + Query q = QParser.getParser("snow leopard panthera onca",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\")" + + " ((t_pick_best_boosted_foo:jaguar)^0.95 | (t_pick_best_boosted_foo:\"big cat\")^0.85 | (t_pick_best_boosted_foo:\"black panther\")^0.65)", q.toString()); - q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_distinct_boost_by_payload_foo","sow", "false"))).getQuery(); - assertEquals("((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:white_leopard)^0.6 t_as_distinct_boost_by_payload_foo:\"snow leopard\")" + - " ((t_as_distinct_boost_by_payload_foo:jaguar)^0.95 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.85 (t_as_distinct_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); + q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")" + + " ((t_as_distinct_boosted_foo:jaguar)^0.95 (t_as_distinct_boosted_foo:\"big cat\")^0.85 (t_as_distinct_boosted_foo:\"black panther\")^0.65)", q.toString()); - q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_same_term_boost_by_payload_foo","sow", "false"))).getQuery(); - 
assertEquals("((t_as_same_term_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_same_term_boost_by_payload_foo:\"big cat\")^0.8 (t_as_same_term_boost_by_payload_foo:white_leopard)^0.6 t_as_same_term_boost_by_payload_foo:\"snow leopard\")" + - " ((t_as_same_term_boost_by_payload_foo:jaguar)^0.95 (t_as_same_term_boost_by_payload_foo:\"big cat\")^0.85 (t_as_same_term_boost_by_payload_foo:\"black panther\")^0.65)", q.toString()); + q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")" + + " ((t_as_same_term_boosted_foo:jaguar)^0.95 (t_as_same_term_boosted_foo:\"big cat\")^0.85 (t_as_same_term_boosted_foo:\"black panther\")^0.65)", q.toString()); } - public void testSynonymsBoostByPayload_edismaxBoost_shouldParseBoostedPhraseQuery() throws Exception { - Query q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_pick_best_boost_by_payload_foo^10"))).getQuery(); + public void testSynonymsBoost_edismaxBoost_shouldParseBoostedPhraseQuery() throws Exception { + Query q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_pick_best_boosted_foo^10"))).getQuery(); assertEquals("+(" + - "((((t_pick_best_boost_by_payload_foo:\"panthera uncia\")^0.9 | (t_pick_best_boost_by_payload_foo:\"big cat\")^0.8 | (t_pick_best_boost_by_payload_foo:white_leopard)^0.6 | t_pick_best_boost_by_payload_foo:\"snow leopard\"))^10.0)" + - " ((((t_pick_best_boost_by_payload_foo:\"panthera leo\")^0.9 | (t_pick_best_boost_by_payload_foo:\"simba leo\")^0.8 | (t_pick_best_boost_by_payload_foo:kimba)^0.75))^10.0)" + + "((((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | 
(t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\"))^10.0)" + + " ((((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75))^10.0)" + ")", q.toString()); - q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_as_distinct_boost_by_payload_foo^10"))).getQuery(); + q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_as_distinct_boosted_foo^10"))).getQuery(); assertEquals("+(" + - "(((t_as_distinct_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:white_leopard)^0.6 t_as_distinct_boost_by_payload_foo:\"snow leopard\")^10.0)" + - " (((t_as_distinct_boost_by_payload_foo:\"panthera leo\")^0.9 (t_as_distinct_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_distinct_boost_by_payload_foo:kimba)^0.75)^10.0))", q.toString()); + "(((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")^10.0)" + + " (((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)^10.0))", q.toString()); - q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_as_same_term_boost_by_payload_foo^10"))).getQuery(); + q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_as_same_term_boosted_foo^10"))).getQuery(); assertEquals("+(" + - "(((t_as_same_term_boost_by_payload_foo:\"panthera uncia\")^0.9 (t_as_same_term_boost_by_payload_foo:\"big cat\")^0.8 (t_as_same_term_boost_by_payload_foo:white_leopard)^0.6 t_as_same_term_boost_by_payload_foo:\"snow leopard\")^10.0)" + - " (((t_as_same_term_boost_by_payload_foo:\"panthera leo\")^0.9 
(t_as_same_term_boost_by_payload_foo:\"simba leo\")^0.8 (t_as_same_term_boost_by_payload_foo:kimba)^0.75)^10.0))", q.toString()); + "(((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")^10.0)" + + " (((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)^10.0))", q.toString()); } - public void testSynonymsBoostByPayload_phraseQueryMultiTermSynonymsPayloadBoost_shouldParseBoostedSpanQuery() throws Exception { - Query q = QParser.getParser("\"snow leopard lion\"", req(params("df", "t_pick_best_boost_by_payload_foo", "sow", "false"))).getQuery(); + public void testSynonymsBoost_phraseQueryMultiTermSynonymsBoost_shouldParseBoostedSpanQuery() throws Exception { + Query q = QParser.getParser("\"snow leopard lion\"", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery(); assertEquals("spanNear([" + "spanOr([" + - "(spanNear([t_pick_best_boost_by_payload_foo:panthera, t_pick_best_boost_by_payload_foo:uncia], 0, true))^0.9," + - " (spanNear([t_pick_best_boost_by_payload_foo:big, t_pick_best_boost_by_payload_foo:cat], 0, true))^0.8," + - " (t_pick_best_boost_by_payload_foo:white_leopard)^0.6," + - " spanNear([t_pick_best_boost_by_payload_foo:snow, t_pick_best_boost_by_payload_foo:leopard], 0, true)])," + + "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:uncia], 0, true))^0.9," + + " (spanNear([t_pick_best_boosted_foo:big, t_pick_best_boosted_foo:cat], 0, true))^0.8," + + " (t_pick_best_boosted_foo:white_leopard)^0.6," + + " spanNear([t_pick_best_boosted_foo:snow, t_pick_best_boosted_foo:leopard], 0, true)])," + " spanOr([" + - "(spanNear([t_pick_best_boost_by_payload_foo:panthera, t_pick_best_boost_by_payload_foo:leo], 0, true))^0.9," + - " (spanNear([t_pick_best_boost_by_payload_foo:simba, 
t_pick_best_boost_by_payload_foo:leo], 0, true))^0.8," + - " (t_pick_best_boost_by_payload_foo:kimba)^0.75])], 0, true)", q.toString()); + "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:leo], 0, true))^0.9," + + " (spanNear([t_pick_best_boosted_foo:simba, t_pick_best_boosted_foo:leo], 0, true))^0.8," + + " (t_pick_best_boosted_foo:kimba)^0.75])], 0, true)", q.toString()); + } + + public void testSynonymsBoost_phraseQueryMultiTermSynonymsMultipleBoost_shouldParseMultiplicativeBoostedSpanQuery() throws Exception { + Query q = QParser.getParser("\"panthera blytheae lion\"", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery(); + assertEquals("spanNear([" + + "spanOr([" + + "(spanNear([t_pick_best_boosted_foo:oldest, t_pick_best_boosted_foo:ancient, t_pick_best_boosted_foo:panthera], 0, true))^0.45," + + " spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:blytheae], 0, true)])," + + " spanOr([" + + "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:leo], 0, true))^0.9," + + " (spanNear([t_pick_best_boosted_foo:simba, t_pick_best_boosted_foo:leo], 0, true))^0.8," + + " (t_pick_best_boosted_foo:kimba)^0.75])], 0, true)", q.toString()); } - public void testSynonymsBoostByPayload_PayloadBoostMissing_shouldAssignDefaultBoost() throws Exception { + public void testSynonymsBoost_BoostMissing_shouldAssignDefaultBoost() throws Exception { //leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 - Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_pick_best_boost_by_payload_foo:\"big cat\")^0.8 | (t_pick_best_boost_by_payload_foo:bagheera)^0.9 | (t_pick_best_boost_by_payload_foo:\"panthera pardus\")^0.85 | t_pick_best_boost_by_payload_foo:leopard)", q.toString()); + Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | 
(t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)", q.toString()); - q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boost_by_payload_foo"))).getQuery(); - assertEquals("((t_as_distinct_boost_by_payload_foo:\"big cat\")^0.8 (t_as_distinct_boost_by_payload_foo:bagheera)^0.9 (t_as_distinct_boost_by_payload_foo:\"panthera pardus\")^0.85 t_as_distinct_boost_by_payload_foo:leopard)", q.toString()); + q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)", q.toString()); } @Test From 6da4b8264862435f93e6cee0caac6f81f8c2784d Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Fri, 7 Feb 2020 17:32:47 +0000 Subject: [PATCH 28/36] [SOLR-12238] adjustments of the github PR feedback --- solr/core/src/test-files/solr/collection1/conf/synonyms.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/core/src/test-files/solr/collection1/conf/synonyms.txt b/solr/core/src/test-files/solr/collection1/conf/synonyms.txt index 1dad43d3a17c..d7feb34ee647 100644 --- a/solr/core/src/test-files/solr/collection1/conf/synonyms.txt +++ b/solr/core/src/test-files/solr/collection1/conf/synonyms.txt @@ -39,7 +39,7 @@ persian => persian, cat, feline, animal jeans, denim pants -# Synonyms used by Payload Boost +# Boosted Synonyms tiger, tigre|0.9 lynx => lince|0.8, lynx_canadensis|0.9 From b6cbace926168ea5aac04aa21679db9ec548cb18 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Mon, 10 Feb 2020 10:37:01 +0000 Subject: [PATCH 29/36] [SOLR-12238] adjustments of the github PR feedback --- .../boost/DelimitedBoostTokenFilter.java | 4 +++- .../apache/lucene/search/BoostAttribute.java | 1 + .../org/apache/lucene/util/QueryBuilder.java | 21 
++++++++++++------- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java index b34b1c8a628d..c37f7d716025 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java @@ -56,6 +56,8 @@ public boolean incrementToken() throws IOException { } } return true; - } else return false; + } else { + return false; + } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java b/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java index 2a99a0828ad7..9030b5728d71 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java +++ b/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java @@ -32,6 +32,7 @@ * @lucene.internal */ public interface BoostAttribute extends Attribute { + float DEFAULT_BOOST = 1.0f; /** Sets the boost in this attribute */ public void setBoost(float boost); /** Retrieves the boost, default is {@code 1.0f}. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index 6f5bfe494596..55ccbd1d06d8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -44,6 +44,7 @@ import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings; +import static org.apache.lucene.search.BoostAttribute.DEFAULT_BOOST; /** * Creates queries from the {@link Analyzer} chain. 
@@ -70,7 +71,6 @@ public class QueryBuilder { * Wraps a term and boost */ public static class TermAndBoost { - private static final float DEFAULT_BOOST = 1.0f; /** the term */ public final Term term; /** the boost */ @@ -375,7 +375,7 @@ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOExcep BoostAttribute boostAtt = in.addAttribute(BoostAttribute.class); SpanQuery result; - float boost = TermAndBoost.DEFAULT_BOOST; + float boost = DEFAULT_BOOST; if (termAtt == null) { return null; } @@ -394,7 +394,7 @@ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOExcep result = new SpanNearQuery(terms.toArray(new SpanQuery[0]), 0, true); } - if (boost != TermAndBoost.DEFAULT_BOOST) { + if (boost != DEFAULT_BOOST) { result = new SpanBoostQuery(result, boost); } return result; @@ -477,7 +477,7 @@ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); int position = -1; - float phraseBoost = TermAndBoost.DEFAULT_BOOST; + float phraseBoost = DEFAULT_BOOST; stream.reset(); while (stream.incrementToken()) { if (enablePositionIncrements) { @@ -489,7 +489,7 @@ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws phraseBoost *= boostAtt.getBoost(); } PhraseQuery query = builder.build(); - if (phraseBoost == TermAndBoost.DEFAULT_BOOST) { + if (phraseBoost == DEFAULT_BOOST) { return query; } return new BoostQuery(query, phraseBoost); @@ -566,7 +566,14 @@ public Query next() { }; positionalQuery = newGraphSynonymQuery(queries); } else { - TermAndBoost[] terms = graph.getTermsAndBoosts(field,start); + List attributes = graph.getTerms(start); + TermAndBoost[] terms = attributes.stream() + .map(s -> { + TermToBytesRefAttribute t = s.addAttribute(TermToBytesRefAttribute.class); + BoostAttribute b = 
s.addAttribute(BoostAttribute.class); + return new TermAndBoost(new Term(field, t.getBytesRef()), b.getBoost()); + }) + .toArray(TermAndBoost[]::new); assert terms.length > 0; if (terms.length == 1) { positionalQuery = newTermQuery(terms[0].term, terms[0].boost); @@ -725,7 +732,7 @@ protected Query newGraphSynonymQuery(Iterator queries) { */ protected Query newTermQuery(Term term, float boost) { Query q = new TermQuery(term); - if (boost == TermAndBoost.DEFAULT_BOOST) { + if (boost == DEFAULT_BOOST) { return q; } return new BoostQuery(q, boost); From 54225c817cf92095079fe16ee808873dc7399f7f Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Tue, 11 Feb 2020 09:53:07 +0000 Subject: [PATCH 30/36] [SOLR-12238] adjustments of the github PR feedback --- .../util/graph/GraphTokenStreamFiniteStrings.java | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java index fef0fb221d8e..b2b530d93afb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java +++ b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java @@ -30,11 +30,9 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.index.Term; -import org.apache.lucene.search.BoostAttribute; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.QueryBuilder; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.FiniteStringsIterator; import org.apache.lucene.util.automaton.Operations; @@ -126,15 +124,6 @@ public Term[] getTerms(String field, int state) { .toArray(Term[]::new); } - /** - * Returns the list of terms 
that start at the provided state - */ - public QueryBuilder.TermAndBoost[] getTermsAndBoosts(String field, int state) { - return getTerms(state).stream() - .map(s -> new QueryBuilder.TermAndBoost(new Term(field, s.addAttribute(TermToBytesRefAttribute.class).getBytesRef()),s.addAttribute(BoostAttribute.class).getBoost())) - .toArray(QueryBuilder.TermAndBoost[]::new); - } - /** * Get all finite strings from the automaton. */ From bfdaf9d39a9e7316da2dd00fbd481ad95fd7c08a Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Thu, 13 Feb 2020 11:42:03 +0000 Subject: [PATCH 31/36] [SOLR-12238] docs --- .../DelimitedBoostTokenFilterFactory.java | 2 - .../src/filter-descriptions.adoc | 101 ++++++++++++++++++ 2 files changed, 101 insertions(+), 2 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java index 2f5be8fefc35..b956a0870866 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java @@ -31,8 +31,6 @@ * </analyzer> * </fieldType> * - * @lucene.spi {@value #NAME} - * @since 3.1 */ public class DelimitedBoostTokenFilterFactory extends TokenFilterFactory { diff --git a/solr/solr-ref-guide/src/filter-descriptions.adoc b/solr/solr-ref-guide/src/filter-descriptions.adoc index daa1f85dfe16..18ef6dfc0a51 100644 --- a/solr/solr-ref-guide/src/filter-descriptions.adoc +++ b/solr/solr-ref-guide/src/filter-descriptions.adoc @@ -398,6 +398,72 @@ Discard original token (`inject="false"`). Note that "Kuczewski" has two encodings, which are added at the same position. +== Delimited Boost Filter + +This filter adds a numeric floating point boost value to tokens, splitting on a delimiter character. 
+ +*Factory class:* `solr.DelimitedBoostTokenFilterFactory` + +*Arguments:* + +`delimiter`:: The character used to separate the token and the boost. Defaults to '|'. + +*Example:* + +[.dynamic-tabs] +-- +[example.tab-pane#byname-filter-delimitedBoost] +==== +[.tab-label]*With name* +[source,xml] +---- + + + + +---- +==== +[example.tab-pane#byclass-filter-delimitedBoost] +==== +[.tab-label]*With class name (legacy)* +[source,xml] +---- + + + + +---- +==== +-- + +*In:* "leopard|0.5 panthera uncia|0.9" + +*Tokenizer to Filter:* "leopard|0.5"(1), "panthera"(2), "uncia|0.9"(3) + +*Out:* "leopard"(1)[0.5], "panthera"(2), "uncia"(3)[0.9] + +The numeric floating point in square brackets is a float token boost attribute. + +*Example:* + +Using a different delimiter (`delimiter="/"`). + +[source,xml] +---- + + + + +---- + +*In:* "leopard/0.5 panthera uncia/0.9" + +*Tokenizer to Filter:* "leopard/0.5"(1), "panthera"(2), "uncia/0.9"(3) + +*Out:* "leopard"(1)[0.5], "panthera"(2), "uncia"(3)[0.9] + +*N.B.* make sure the delimiter is compatible with the tokenizer you use + == Edge N-Gram Filter This filter generates edge n-gram tokens of sizes within the given range. @@ -2292,6 +2358,41 @@ small => tiny,teeny,weeny *Out:* "the"(1), "large"(2), "large"(3), "couch"(4), "sofa"(4), "divan"(4) +*Weighted Synonyms:* + +Combining the DelimitedBoostFilter with the Synonym Graph Filter you can achieve Weighted synonyms at query time. 
+For the following examples, assume a synonyms file named `boostedSynonyms.txt`: + +[source,text] +---- +leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 +lion => panthera leo|0.9, simba|0.8, kimba|0.75 +---- + +*Example:* + +[.dynamic-tabs] +-- +[example.tab-pane#byname-filter-stop-weightedsynonymgraph] +==== +[.tab-label]*With name* +[source,xml] +---- + + + + + +---- +==== +-- + +*In:* "lion" + +*Tokenizer to Filter:* "lion"(1) + +*Out:* "panthera"(1), "leo"(2)[0.9], "simba"(1)[0.8], "kimba"(1)[0.75] + == Token Offset Payload Filter This filter adds the numeric character offsets of the token as a payload value for that token. From 9d9f1c6289fc2d51f6ea4da8837a7fc5f43e4eb0 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Thu, 13 Feb 2020 11:47:54 +0000 Subject: [PATCH 32/36] [SOLR-12238] docs --- solr/solr-ref-guide/src/filter-descriptions.adoc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/solr/solr-ref-guide/src/filter-descriptions.adoc b/solr/solr-ref-guide/src/filter-descriptions.adoc index 18ef6dfc0a51..c2115fe865fc 100644 --- a/solr/solr-ref-guide/src/filter-descriptions.adoc +++ b/solr/solr-ref-guide/src/filter-descriptions.adoc @@ -2361,6 +2361,8 @@ small => tiny,teeny,weeny *Weighted Synonyms:* Combining the DelimitedBoostFilter with the Synonym Graph Filter you can achieve Weighted synonyms at query time. 
+For more information feel free to refer to: +https://sease.io/2020/02/introducing-weighted-synonyms-in-apache-lucene.html For the following examples, assume a synonyms file named `boostedSynonyms.txt`: [source,text] From 1881390196701b49e53d28bff49ccb2f251031be Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Thu, 13 Feb 2020 11:51:58 +0000 Subject: [PATCH 33/36] [SOLR-12238] minor spi name fix --- .../lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java | 1 + 1 file changed, 1 insertion(+) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java index b956a0870866..7436034b2830 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java @@ -31,6 +31,7 @@ * </analyzer> * </fieldType> * + * @lucene.spi {@value #NAME} */ public class DelimitedBoostTokenFilterFactory extends TokenFilterFactory { From 81a4217924aa30f9df0771b129e09acb2487b69f Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Thu, 13 Feb 2020 12:19:08 +0000 Subject: [PATCH 34/36] [SOLR-12238] adjustments of the github PR feedback --- solr/solr-ref-guide/src/filter-descriptions.adoc | 1 - 1 file changed, 1 deletion(-) diff --git a/solr/solr-ref-guide/src/filter-descriptions.adoc b/solr/solr-ref-guide/src/filter-descriptions.adoc index c2115fe865fc..83fc8d8d98b5 100644 --- a/solr/solr-ref-guide/src/filter-descriptions.adoc +++ b/solr/solr-ref-guide/src/filter-descriptions.adoc @@ -2373,7 +2373,6 @@ lion => panthera leo|0.9, simba|0.8, kimba|0.75 *Example:* -[.dynamic-tabs] -- [example.tab-pane#byname-filter-stop-weightedsynonymgraph] ==== From 8b8cb99bc637aefc3e3f185b00b8a9241e3f3a07 Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Thu, 13 Feb 2020 
13:12:24 +0000 Subject: [PATCH 35/36] [SOLR-12238] adjustments of the github PR feedback --- solr/solr-ref-guide/src/filter-descriptions.adoc | 3 --- 1 file changed, 3 deletions(-) diff --git a/solr/solr-ref-guide/src/filter-descriptions.adoc b/solr/solr-ref-guide/src/filter-descriptions.adoc index 83fc8d8d98b5..f4f6cb7a3008 100644 --- a/solr/solr-ref-guide/src/filter-descriptions.adoc +++ b/solr/solr-ref-guide/src/filter-descriptions.adoc @@ -2373,8 +2373,6 @@ lion => panthera leo|0.9, simba|0.8, kimba|0.75 *Example:* --- -[example.tab-pane#byname-filter-stop-weightedsynonymgraph] ==== [.tab-label]*With name* [source,xml] @@ -2386,7 +2384,6 @@ lion => panthera leo|0.9, simba|0.8, kimba|0.75 ---- ==== --- *In:* "lion" From e4da3fba20c5a8b6fbac950993d671ebf60a576f Mon Sep 17 00:00:00 2001 From: Alessandro Benedetti Date: Thu, 13 Feb 2020 16:38:11 +0000 Subject: [PATCH 36/36] [SOLR-12238] minor comment fix --- .../src/test/org/apache/solr/search/TestSolrQueryParser.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java index 6593c9dcb4fa..69d12bb7b92a 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java @@ -1336,7 +1336,7 @@ public void testSynonymsBoost_singleConceptQueryMultiTermSynonymWithMultipleBoos } public void testSynonymsBoost_singleConceptQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { - //snow leopard|1.0, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 + //snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 Query q = QParser.getParser("snow leopard",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); assertEquals("((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | 
(t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\")", q.toString()); @@ -1372,7 +1372,7 @@ public void testSynonymsBoost_multiConceptQuerySingleTermSynonym_shouldParseBoos } public void testSynonymsBoost_multiConceptsQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { - //snow leopard|1.0, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 + //snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 //panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65 Query q = QParser.getParser("snow leopard panthera onca",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); assertEquals("((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\")" +