diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java new file mode 100644 index 000000000000..c37f7d716025 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.boost; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.search.BoostAttribute; + +import java.io.IOException; + + +/** + * Characters before the delimiter are the "token", those after are the boost. + *

+ * For example, if the delimiter is '|', then for the string "foo|0.7", foo is the token + * and 0.7 is the boost. + *

+ * Note make sure your Tokenizer doesn't split on the delimiter, or this won't work + */ +public final class DelimitedBoostTokenFilter extends TokenFilter { + private final char delimiter; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final BoostAttribute boostAtt = addAttribute(BoostAttribute.class); + + public DelimitedBoostTokenFilter(TokenStream input, char delimiter) { + super(input); + this.delimiter = delimiter; + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + final char[] buffer = termAtt.buffer(); + final int length = termAtt.length(); + for (int i = 0; i < length; i++) { + if (buffer[i] == delimiter) { + float boost = Float.parseFloat(new String(buffer, i + 1, (length - (i + 1)))); + boostAtt.setBoost(boost); + termAtt.setLength(i); + return true; + } + } + return true; + } else { + return false; + } + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java new file mode 100644 index 000000000000..7436034b2830 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.boost; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; + +/** + * Factory for {@link DelimitedBoostTokenFilter}. + *

+ * <fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.DelimitedBoostTokenFilterFactory" delimiter="|"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * @lucene.spi {@value #NAME} + */ +public class DelimitedBoostTokenFilterFactory extends TokenFilterFactory { + + /** + * SPI name + */ + public static final String NAME = "delimitedBoost"; + public static final String DELIMITER_ATTR = "delimiter"; + public static final char DEFAULT_DELIMITER = '|'; + + private final char delimiter; + + /** + * Creates a new DelimitedPayloadTokenFilterFactory + */ + public DelimitedBoostTokenFilterFactory(Map args) { + super(args); + delimiter = getChar(args, DELIMITER_ATTR, DEFAULT_DELIMITER); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public DelimitedBoostTokenFilter create(TokenStream input) { + return new DelimitedBoostTokenFilter(input, delimiter); + } + +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java new file mode 100644 index 000000000000..9bae5dc4b235 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Provides various convenience classes for creating boosts on Tokens. + */ +package org.apache.lucene.analysis.boost; diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index 16fca20f84e7..fd13e6fc86ce 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -17,6 +17,7 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory org.apache.lucene.analysis.ar.ArabicStemFilterFactory org.apache.lucene.analysis.bg.BulgarianStemFilterFactory +org.apache.lucene.analysis.boost.DelimitedBoostTokenFilterFactory org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory org.apache.lucene.analysis.bn.BengaliStemFilterFactory org.apache.lucene.analysis.br.BrazilianStemFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterTest.java new file mode 100644 index 000000000000..8b9d69000af6 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.boost; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.search.BoostAttribute; + +public class DelimitedBoostTokenFilterTest extends BaseTokenStreamTestCase { + + public void testBoosts() throws Exception { + String test = "The quick|0.4 red|0.5 fox|0.2 jumped|0.1 over the lazy|0.8 brown|0.9 dogs|0.9"; + DelimitedBoostTokenFilter filter = new DelimitedBoostTokenFilter + (whitespaceMockTokenizer(test), + DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER); + CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class); + BoostAttribute boostAtt = filter.addAttribute(BoostAttribute.class); + filter.reset(); + assertTermEquals("The", filter, termAtt, boostAtt, 1.0f); + assertTermEquals("quick", filter, termAtt, boostAtt, 0.4f); + assertTermEquals("red", filter, termAtt, boostAtt, 0.5f); + assertTermEquals("fox", filter, termAtt, boostAtt, 0.2f); + assertTermEquals("jumped", filter, termAtt, boostAtt, 0.1f); + assertTermEquals("over", filter, termAtt, boostAtt, 1.0f); + assertTermEquals("the", filter, termAtt, boostAtt, 1.0f); + assertTermEquals("lazy", filter, termAtt, boostAtt, 0.8f); + assertTermEquals("brown", filter, termAtt, boostAtt, 0.9f); + assertTermEquals("dogs", filter, termAtt, boostAtt, 0.9f); + assertFalse(filter.incrementToken()); + filter.end(); + filter.close(); + } + + public void testNext() throws Exception { + String test = "The 
quick|0.1 red|0.2 fox|0.3 jumped|0.4 over the lazy|0.5 brown|0.6 dogs|0.6"; + DelimitedBoostTokenFilter filter = new DelimitedBoostTokenFilter + (whitespaceMockTokenizer(test), + DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER); + filter.reset(); + assertTermEquals("The", filter, 1.0f); + assertTermEquals("quick", filter, 0.1f); + assertTermEquals("red", filter, 0.2f); + assertTermEquals("fox", filter, 0.3f); + assertTermEquals("jumped", filter, 0.4f); + assertTermEquals("over", filter, 1.0f); + assertTermEquals("the", filter, 1.0f); + assertTermEquals("lazy", filter, 0.5f); + assertTermEquals("brown", filter, 0.6f); + assertTermEquals("dogs", filter, 0.6f); + assertFalse(filter.incrementToken()); + filter.end(); + filter.close(); + } + + void assertTermEquals(String expected, TokenStream stream, float expectedBoost) throws Exception { + CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class); + BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class); + assertTrue(stream.incrementToken()); + assertEquals(expected, termAtt.toString()); + float actualBoost = boostAtt.getBoost(); + assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost); + } + + void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, BoostAttribute boostAtt, float expectedBoost) throws Exception { + assertTrue(stream.incrementToken()); + assertEquals(expected, termAtt.toString()); + float actualBoost = boostAtt.getBoost(); + assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java b/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java index 2a99a0828ad7..9030b5728d71 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java +++ b/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java @@ -32,6 +32,7 @@ * @lucene.internal */ public interface 
BoostAttribute extends Attribute { + float DEFAULT_BOOST = 1.0f; /** Sets the boost in this attribute */ public void setBoost(float boost); /** Retrieves the boost, default is {@code 1.0f}. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java index 31da91256517..55ccbd1d06d8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java @@ -30,17 +30,21 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostAttribute; +import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanBoostQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings; +import static org.apache.lucene.search.BoostAttribute.DEFAULT_BOOST; /** * Creates queries from the {@link Analyzer} chain. 
@@ -63,6 +67,24 @@ public class QueryBuilder { protected boolean enableGraphQueries = true; protected boolean autoGenerateMultiTermSynonymsPhraseQuery = false; + /** + * Wraps a term and boost + */ + public static class TermAndBoost { + /** the term */ + public final Term term; + /** the boost */ + public final float boost; + + /** + * Creates a new TermAndBoost + */ + public TermAndBoost(Term term, float boost) { + this.term = term; + this.boost = boost; + } + } + /** Creates a new QueryBuilder using the given analyzer. */ public QueryBuilder(Analyzer analyzer) { this.analyzer = analyzer; @@ -350,22 +372,32 @@ protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operato */ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOException { TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class); + BoostAttribute boostAtt = in.addAttribute(BoostAttribute.class); + + SpanQuery result; + float boost = DEFAULT_BOOST; if (termAtt == null) { return null; } List terms = new ArrayList<>(); while (in.incrementToken()) { + boost *= boostAtt.getBoost(); terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef()))); } if (terms.isEmpty()) { return null; } else if (terms.size() == 1) { - return terms.get(0); + result = terms.get(0); } else { - return new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true); + result = new SpanNearQuery(terms.toArray(new SpanQuery[0]), 0, true); } + + if (boost != DEFAULT_BOOST) { + result = new SpanBoostQuery(result, boost); + } + return result; } /** @@ -373,13 +405,14 @@ protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOExcep */ protected Query analyzeTerm(String field, TokenStream stream) throws IOException { TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class); stream.reset(); if (!stream.incrementToken()) { throw new AssertionError(); } 
- return newTermQuery(new Term(field, termAtt.getBytesRef())); + return newTermQuery(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()); } /** @@ -387,24 +420,25 @@ protected Query analyzeTerm(String field, TokenStream stream) throws IOException */ protected Query analyzeBoolean(String field, TokenStream stream) throws IOException { TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class); stream.reset(); - List terms = new ArrayList<>(); + List terms = new ArrayList<>(); while (stream.incrementToken()) { - terms.add(new Term(field, termAtt.getBytesRef())); + terms.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost())); } - return newSynonymQuery(terms.toArray(new Term[terms.size()])); + return newSynonymQuery(terms.toArray(new TermAndBoost[0])); } - protected void add(BooleanQuery.Builder q, List current, BooleanClause.Occur operator) { + protected void add(BooleanQuery.Builder q, List current, BooleanClause.Occur operator) { if (current.isEmpty()) { return; } if (current.size() == 1) { - q.add(newTermQuery(current.get(0)), operator); + q.add(newTermQuery(current.get(0).term, current.get(0).boost), operator); } else { - q.add(newSynonymQuery(current.toArray(new Term[current.size()])), operator); + q.add(newSynonymQuery(current.toArray(new TermAndBoost[0])), operator); } } @@ -413,10 +447,11 @@ protected void add(BooleanQuery.Builder q, List current, BooleanClause.Occ */ protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException { BooleanQuery.Builder q = newBooleanQuery(); - List currentQuery = new ArrayList<>(); + List currentQuery = new ArrayList<>(); TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); + BoostAttribute boostAtt = 
stream.addAttribute(BoostAttribute.class); stream.reset(); while (stream.incrementToken()) { @@ -424,7 +459,7 @@ protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanCla add(q, currentQuery, operator); currentQuery.clear(); } - currentQuery.add(new Term(field, termAtt.getBytesRef())); + currentQuery.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost())); } add(q, currentQuery, operator); @@ -439,9 +474,10 @@ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws builder.setSlop(slop); TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); - int position = -1; - + int position = -1; + float phraseBoost = DEFAULT_BOOST; stream.reset(); while (stream.incrementToken()) { if (enablePositionIncrements) { @@ -450,9 +486,13 @@ protected Query analyzePhrase(String field, TokenStream stream, int slop) throws position += 1; } builder.add(new Term(field, termAtt.getBytesRef()), position); + phraseBoost *= boostAtt.getBoost(); } - - return builder.build(); + PhraseQuery query = builder.build(); + if (phraseBoost == DEFAULT_BOOST) { + return query; + } + return new BoostQuery(query, phraseBoost); } /** @@ -509,33 +549,40 @@ protected Query analyzeGraphBoolean(String field, TokenStream source, BooleanCla end = articulationPoints[i]; } lastState = end; - final Query queryPos; + final Query positionalQuery; if (graph.hasSidePath(start)) { - final Iterator it = graph.getFiniteStrings(start, end); + final Iterator sidePathsIterator = graph.getFiniteStrings(start, end); Iterator queries = new Iterator() { @Override public boolean hasNext() { - return it.hasNext(); + return sidePathsIterator.hasNext(); } @Override public Query next() { - TokenStream ts = it.next(); - return createFieldQuery(ts, 
BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0); + TokenStream sidePath = sidePathsIterator.next(); + return createFieldQuery(sidePath, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0); } }; - queryPos = newGraphSynonymQuery(queries); + positionalQuery = newGraphSynonymQuery(queries); } else { - Term[] terms = graph.getTerms(field, start); + List attributes = graph.getTerms(start); + TermAndBoost[] terms = attributes.stream() + .map(s -> { + TermToBytesRefAttribute t = s.addAttribute(TermToBytesRefAttribute.class); + BoostAttribute b = s.addAttribute(BoostAttribute.class); + return new TermAndBoost(new Term(field, t.getBytesRef()), b.getBoost()); + }) + .toArray(TermAndBoost[]::new); assert terms.length > 0; if (terms.length == 1) { - queryPos = newTermQuery(terms[0]); + positionalQuery = newTermQuery(terms[0].term, terms[0].boost); } else { - queryPos = newSynonymQuery(terms); + positionalQuery = newSynonymQuery(terms); } } - if (queryPos != null) { - builder.add(queryPos, operator); + if (positionalQuery != null) { + builder.add(positionalQuery, operator); } } return builder.build(); @@ -650,10 +697,10 @@ protected BooleanQuery.Builder newBooleanQuery() { * This is intended for subclasses that wish to customize the generated queries. 
* @return new Query instance */ - protected Query newSynonymQuery(Term terms[]) { - SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].field()); - for (Term term : terms) { - builder.addTerm(term); + protected Query newSynonymQuery(TermAndBoost[] terms) { + SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].term.field()); + for (TermAndBoost t : terms) { + builder.addTerm(t.term, t.boost); } return builder.build(); } @@ -683,10 +730,15 @@ protected Query newGraphSynonymQuery(Iterator queries) { * @param term term * @return new TermQuery instance */ - protected Query newTermQuery(Term term) { - return new TermQuery(term); + protected Query newTermQuery(Term term, float boost) { + Query q = new TermQuery(term); + if (boost == DEFAULT_BOOST) { + return q; + } + return new BoostQuery(q, boost); } + /** * Builds a new MultiPhraseQuery instance. *

diff --git a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java index 7289ead38eff..927dfd4080f9 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java @@ -20,6 +20,7 @@ import java.io.IOException; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.AnalyzerWrapper; import org.apache.lucene.analysis.CannedBinaryTokenStream; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockSynonymFilter; @@ -32,6 +33,8 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostAttribute; +import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; @@ -507,4 +510,51 @@ public void testMaxBooleanClause() throws Exception { expectThrows(IndexSearcher.TooManyClauses.class, () -> qb.analyzeGraphPhrase(ts, "", 0)); } } + + private static final class MockBoostTokenFilter extends TokenFilter { + + final BoostAttribute boostAtt = addAttribute(BoostAttribute.class); + final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + protected MockBoostTokenFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken() == false) { + return false; + } + if (termAtt.length() == 3) { + boostAtt.setBoost(0.5f); + } + return true; + } + } + + public void testTokenStreamBoosts() { + Analyzer msa = new MockSynonymAnalyzer(); + Analyzer a = new AnalyzerWrapper(msa.getReuseStrategy()) { + @Override + protected Analyzer getWrappedAnalyzer(String fieldName) { + return msa; + } + @Override + protected TokenStreamComponents 
wrapComponents(String fieldName, TokenStreamComponents components) { + return new TokenStreamComponents(components.getSource(), new MockBoostTokenFilter(components.getTokenStream())); + } + }; + + QueryBuilder builder = new QueryBuilder(a); + Query q = builder.createBooleanQuery("field", "hot dogs"); + Query expected = new BooleanQuery.Builder() + .add(new BoostQuery(new TermQuery(new Term("field", "hot")), 0.5f), BooleanClause.Occur.SHOULD) + .add(new SynonymQuery.Builder("field") + .addTerm(new Term("field", "dogs")) + .addTerm(new Term("field", "dog"), 0.5f) + .build(), BooleanClause.Occur.SHOULD) + .build(); + + assertEquals(expected, q); + } } diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java index 9a4043d1d8a3..d552aef9fe75 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java @@ -147,7 +147,7 @@ public Query parse(String query) throws ParseException { // to throw a runtime exception here if a term for another field is embedded // in phrase query @Override - protected Query newTermQuery(Term term) { + protected Query newTermQuery(Term term, float boost) { if (isPass2ResolvingPhrases) { try { checkPhraseClauseIsForSameField(term.field()); @@ -155,7 +155,7 @@ protected Query newTermQuery(Term term) { throw new RuntimeException("Error parsing complex phrase", pe); } } - return super.newTermQuery(term); + return super.newTermQuery(term, boost); } // Helper method used to report on any clauses that appear in query syntax diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java index 469da7f1c16c..a4084d1509de 100644 --- 
a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java +++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java @@ -21,6 +21,8 @@ import java.util.Collections; import java.util.EnumSet; import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -601,19 +603,35 @@ protected Query newRegexpQuery(Term regexp) { } @Override - protected Query newSynonymQuery(Term terms[]) { + protected Query newGraphSynonymQuery(Iterator sidePathQueriesIterator) { + switch (synonymQueryStyle) { + case PICK_BEST: { + List sidePathSynonymQueries = new LinkedList<>(); + sidePathQueriesIterator.forEachRemaining(sidePathSynonymQueries::add); + return new DisjunctionMaxQuery(sidePathSynonymQueries, 0.0f); + } + case AS_SAME_TERM: + case AS_DISTINCT_TERMS:{ + return super.newGraphSynonymQuery(sidePathQueriesIterator);} + default: + throw new AssertionError("unrecognized synonymQueryStyle passed when creating newSynonymQuery"); + } + } + + @Override + protected Query newSynonymQuery(TermAndBoost[] terms) { switch (synonymQueryStyle) { case PICK_BEST: List currPosnClauses = new ArrayList(terms.length); - for (Term term : terms) { - currPosnClauses.add(newTermQuery(term)); + for (TermAndBoost term : terms) { + currPosnClauses.add(newTermQuery(term.term, term.boost)); } DisjunctionMaxQuery dm = new DisjunctionMaxQuery(currPosnClauses, 0.0f); return dm; case AS_DISTINCT_TERMS: BooleanQuery.Builder builder = new BooleanQuery.Builder(); - for (Term term : terms) { - builder.add(newTermQuery(term), BooleanClause.Occur.SHOULD); + for (TermAndBoost term : terms) { + builder.add(newTermQuery(term.term, term.boost), BooleanClause.Occur.SHOULD); } return builder.build(); case AS_SAME_TERM: diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index 1368e6b04fc4..d4cb89e85f87 100644 --- 
a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -227,6 +227,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -656,6 +691,9 @@ + + + diff --git a/solr/core/src/test-files/solr/collection1/conf/synonyms.txt b/solr/core/src/test-files/solr/collection1/conf/synonyms.txt index 68dbf0bf62b1..d7feb34ee647 100644 --- a/solr/core/src/test-files/solr/collection1/conf/synonyms.txt +++ b/solr/core/src/test-files/solr/collection1/conf/synonyms.txt @@ -37,4 +37,18 @@ crow blackbird, grackle tabby => tabby, cat, feline, animal persian => persian, cat, feline, animal -jeans, denim pants \ No newline at end of file +jeans, denim pants + +# Boosted Synonyms +tiger, tigre|0.9 +lynx => lince|0.8, lynx_canadensis|0.9 + +leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 +lion => panthera leo|0.9, simba leo|0.8, kimba|0.75 + +panthera pardus, leopard|0.6 +panthera tigris => tiger|0.99 + +snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 +panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65 +panthera blytheae, oldest|0.5 ancient|0.9 panthera \ No newline at end of file diff --git a/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java b/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java index fc1e735b355a..66e9efe5fce4 100644 --- a/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java +++ b/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java @@ -300,4 +300,80 @@ public void testCanHandleDecodingAndEncodingForSynonyms() throws Exception { assertJDelete(endpoint+"/fröhlich", "/error/code==404"); } + + /** + * Can we add and single term synonyms with weight + */ + @Test + public void testManagedSynonyms_singleTermWithWeight_shouldHandleSynonym() throws Exception { + 
String endpoint = "/schema/analysis/synonyms/englishgraph"; + + assertJQ(endpoint, + "/synonymMappings/initArgs/ignoreCase==false", + "/synonymMappings/managedMap=={}"); + + // does not exist + assertJQ(endpoint+"/tiger", + "/error/code==404"); + + Map> syns = new HashMap<>(); + + // now put a synonym + syns.put("tiger", Arrays.asList("tiger|1.0")); + assertJPut(endpoint, + toJSONString(syns), + "/responseHeader/status==0"); + + // and check if it exists + assertJQ(endpoint, + "/synonymMappings/managedMap/tiger==['tiger|1.0']"); + + // verify delete works + assertJDelete(endpoint+"/tiger", + "/responseHeader/status==0"); + + + // was it really deleted? + assertJDelete(endpoint+"/tiger", + "/error/code==404"); + } + + /** + * Can we add multi term synonyms with weight + */ + @Test + public void testManagedSynonyms_multiTermWithWeight_shouldHandleSynonym() throws Exception { + String endpoint = "/schema/analysis/synonyms/englishgraph"; + + assertJQ(endpoint, + "/synonymMappings/initArgs/ignoreCase==false", + "/synonymMappings/managedMap=={}"); + + // does not exist + assertJQ(endpoint+"/tiger", + "/error/code==404"); + + Map> syns = new HashMap<>(); + + // now put a synonym + List tigerSyonyms = Arrays.asList("tiger|1.0", "panthera tigris|0.9", "Shere Kan|0.8"); + syns.put("tiger", tigerSyonyms); + String jsonTigerSynonyms = toJSONString(syns); + assertJPut(endpoint, + jsonTigerSynonyms, + "/responseHeader/status==0"); + + // and check if it exists + assertJQ(endpoint, + "/synonymMappings/managedMap/tiger==[\"Shere Kan|0.8\",\"panthera tigris|0.9\",\"tiger|1.0\"]"); + + // verify delete works + assertJDelete(endpoint+"/tiger", + "/responseHeader/status==0"); + + + // was it really deleted? 
+ assertJDelete(endpoint+"/tiger", + "/error/code==404"); + } } diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java index 9fb2598e2eae..69d12bb7b92a 100644 --- a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java +++ b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java @@ -1221,8 +1221,225 @@ public void testSynonymQueryStyle() throws Exception { assertEquals("(t_as_distinct_foo:\"denim pant\" t_as_distinct_foo:jean)", q.toString()); q = QParser.getParser("jeans", req(params("df", "t_pick_best_foo", "sow", "false"))).getQuery(); - assertEquals("(t_pick_best_foo:\"denim pant\" t_pick_best_foo:jean)", q.toString()); + assertEquals("(t_pick_best_foo:\"denim pant\" | t_pick_best_foo:jean)", q.toString()); + } + + public void testSynonymsBoost_singleTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception { + //tiger, tigre|0.9 + Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)", q.toString()); + + q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("(t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger", q.toString()); + + q = QParser.getParser("tiger", req(params("df", "t_as_same_term_boosted_foo"))).getQuery(); + assertEquals("Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)", q.toString()); + + //lynx => lince|0.8, lynx_canadensis|0.9 + q = QParser.getParser("lynx", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:lince)^0.8 | (t_pick_best_boosted_foo:lynx_canadensis)^0.9)", q.toString()); + + q = QParser.getParser("lynx", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("(t_as_distinct_boosted_foo:lince)^0.8 
(t_as_distinct_boosted_foo:lynx_canadensis)^0.9", q.toString()); + + q = QParser.getParser("lynx", req(params("df", "t_as_same_term_boosted_foo"))).getQuery(); + assertEquals("Synonym(t_as_same_term_boosted_foo:lince^0.8 t_as_same_term_boosted_foo:lynx_canadensis^0.9)", q.toString()); + } + + public void testSynonymsBoost_singleTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { + //leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 + Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)", q.toString()); + + q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)", q.toString()); + + q = QParser.getParser("leopard", req(params("df", "t_as_same_term_boosted_foo"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:bagheera)^0.9 (t_as_same_term_boosted_foo:\"panthera pardus\")^0.85 t_as_same_term_boosted_foo:leopard)", q.toString()); + + //lion => panthera leo|0.9, simba leo|0.8, kimba|0.75 + q = QParser.getParser("lion", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75)", q.toString()); + + q = QParser.getParser("lion", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)", q.toString()); + + q = QParser.getParser("lion", 
req(params("df", "t_as_same_term_boosted_foo"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)", q.toString()); + } + + public void testSynonymsBoost_multiTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception { + //tiger, tigre|0.9 + //lynx => lince|0.8, lynx_canadensis|0.9 + Query q = QParser.getParser("tiger lynx", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)" + + " ((t_pick_best_boosted_foo:lince)^0.8 | (t_pick_best_boosted_foo:lynx_canadensis)^0.9)", q.toString()); + + q = QParser.getParser("tiger lynx", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger)" + + " ((t_as_distinct_boosted_foo:lince)^0.8 (t_as_distinct_boosted_foo:lynx_canadensis)^0.9)", q.toString()); + + q = QParser.getParser("tiger lynx", req(params("df", "t_as_same_term_boosted_foo"))).getQuery(); + assertEquals("Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)" + + " Synonym(t_as_same_term_boosted_foo:lince^0.8 t_as_same_term_boosted_foo:lynx_canadensis^0.9)", q.toString()); + } + + public void testSynonymsBoost_multiTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { + //leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 + //lion => panthera leo|0.9, simba leo|0.8, kimba|0.75 + Query q = QParser.getParser("leopard lion", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)" + + " ((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75)", 
q.toString()); + + q = QParser.getParser("leopard lion", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)" + + " ((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)", q.toString()); + + q = QParser.getParser("leopard lion", req(params("df", "t_as_same_term_boosted_foo"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:bagheera)^0.9 (t_as_same_term_boosted_foo:\"panthera pardus\")^0.85 t_as_same_term_boosted_foo:leopard)" + + " ((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)", q.toString()); + + } + + public void testSynonymsBoost_singleConceptQuerySingleTermSynonym_shouldParseBoostedQuery() throws Exception { + //panthera pardus, leopard|0.6 + Query q = QParser.getParser("panthera pardus story",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:leopard)^0.6 | t_pick_best_boosted_foo:\"panthera pardus\") t_pick_best_boosted_foo:story", q.toString()); + + q = QParser.getParser("panthera pardus story", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:leopard)^0.6 t_as_distinct_boosted_foo:\"panthera pardus\") t_as_distinct_boosted_foo:story", q.toString()); + + q = QParser.getParser("panthera pardus story", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:leopard)^0.6 t_as_same_term_boosted_foo:\"panthera pardus\") t_as_same_term_boosted_foo:story", q.toString()); + + //panthera tigris => tiger|0.99 + q = QParser.getParser("panthera tigris 
story", req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("(t_pick_best_boosted_foo:tiger)^0.99 t_pick_best_boosted_foo:story", q.toString()); + + q = QParser.getParser("panthera tigris story", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("(t_as_distinct_boosted_foo:tiger)^0.99 t_as_distinct_boosted_foo:story", q.toString()); + + q = QParser.getParser("panthera tigris story", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("(t_as_same_term_boosted_foo:tiger)^0.99 t_as_same_term_boosted_foo:story", q.toString()); + } + + public void testSynonymsBoost_singleConceptQueryMultiTermSynonymWithMultipleBoost_shouldParseMultiplicativeBoostedQuery() throws Exception { + //panthera blytheae, oldest|0.5 ancient|0.9 panthera + Query q = QParser.getParser("panthera blytheae",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"oldest ancient panthera\")^0.45 | t_pick_best_boosted_foo:\"panthera blytheae\")", q.toString()); + + q = QParser.getParser("panthera blytheae", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"oldest ancient panthera\")^0.45 t_as_distinct_boosted_foo:\"panthera blytheae\")", q.toString()); + + q = QParser.getParser("panthera blytheae", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:\"oldest ancient panthera\")^0.45 t_as_same_term_boosted_foo:\"panthera blytheae\")", q.toString()); + } + + public void testSynonymsBoost_singleConceptQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { + //snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 + Query q = QParser.getParser("snow leopard",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + 
assertEquals("((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\")", q.toString()); + + q = QParser.getParser("snow leopard", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")", q.toString()); + + q = QParser.getParser("snow leopard", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")", q.toString()); + + //panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65 + q = QParser.getParser("panthera onca", req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:jaguar)^0.95 | (t_pick_best_boosted_foo:\"big cat\")^0.85 | (t_pick_best_boosted_foo:\"black panther\")^0.65)", q.toString()); + + q = QParser.getParser("panthera onca", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:jaguar)^0.95 (t_as_distinct_boosted_foo:\"big cat\")^0.85 (t_as_distinct_boosted_foo:\"black panther\")^0.65)", q.toString()); + + q = QParser.getParser("panthera onca", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:jaguar)^0.95 (t_as_same_term_boosted_foo:\"big cat\")^0.85 (t_as_same_term_boosted_foo:\"black panther\")^0.65)", q.toString()); + + } + + public void testSynonymsBoost_multiConceptQuerySingleTermSynonym_shouldParseBoostedQuery() throws Exception { + //panthera pardus, leopard|0.6 + //tiger, tigre|0.9 
+ Query q = QParser.getParser("panthera pardus tiger",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:leopard)^0.6 | t_pick_best_boosted_foo:\"panthera pardus\") ((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)", q.toString()); + + q = QParser.getParser("panthera pardus tiger", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:leopard)^0.6 t_as_distinct_boosted_foo:\"panthera pardus\") ((t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger)", q.toString()); + + q = QParser.getParser("panthera pardus tiger", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:leopard)^0.6 t_as_same_term_boosted_foo:\"panthera pardus\") Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)", q.toString()); + } + + public void testSynonymsBoost_multiConceptsQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception { + //snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6 + //panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65 + Query q = QParser.getParser("snow leopard panthera onca",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\")" + + " ((t_pick_best_boosted_foo:jaguar)^0.95 | (t_pick_best_boosted_foo:\"big cat\")^0.85 | (t_pick_best_boosted_foo:\"black panther\")^0.65)", q.toString()); + + q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 
t_as_distinct_boosted_foo:\"snow leopard\")" + + " ((t_as_distinct_boosted_foo:jaguar)^0.95 (t_as_distinct_boosted_foo:\"big cat\")^0.85 (t_as_distinct_boosted_foo:\"black panther\")^0.65)", q.toString()); + + q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery(); + assertEquals("((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")" + + " ((t_as_same_term_boosted_foo:jaguar)^0.95 (t_as_same_term_boosted_foo:\"big cat\")^0.85 (t_as_same_term_boosted_foo:\"black panther\")^0.65)", q.toString()); + + } + + public void testSynonymsBoost_edismaxBoost_shouldParseBoostedPhraseQuery() throws Exception { + Query q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_pick_best_boosted_foo^10"))).getQuery(); + assertEquals("+(" + + "((((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\"))^10.0)" + + " ((((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75))^10.0)" + + ")", q.toString()); + + q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_as_distinct_boosted_foo^10"))).getQuery(); + assertEquals("+(" + + "(((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")^10.0)" + + " (((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)^10.0))", q.toString()); + + q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", 
"t_as_same_term_boosted_foo^10"))).getQuery(); + assertEquals("+(" + + "(((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")^10.0)" + + " (((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)^10.0))", q.toString()); + + } + + public void testSynonymsBoost_phraseQueryMultiTermSynonymsBoost_shouldParseBoostedSpanQuery() throws Exception { + Query q = QParser.getParser("\"snow leopard lion\"", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery(); + assertEquals("spanNear([" + + "spanOr([" + + "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:uncia], 0, true))^0.9," + + " (spanNear([t_pick_best_boosted_foo:big, t_pick_best_boosted_foo:cat], 0, true))^0.8," + + " (t_pick_best_boosted_foo:white_leopard)^0.6," + + " spanNear([t_pick_best_boosted_foo:snow, t_pick_best_boosted_foo:leopard], 0, true)])," + + " spanOr([" + + "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:leo], 0, true))^0.9," + + " (spanNear([t_pick_best_boosted_foo:simba, t_pick_best_boosted_foo:leo], 0, true))^0.8," + + " (t_pick_best_boosted_foo:kimba)^0.75])], 0, true)", q.toString()); + } + + public void testSynonymsBoost_phraseQueryMultiTermSynonymsMultipleBoost_shouldParseMultiplicativeBoostedSpanQuery() throws Exception { + Query q = QParser.getParser("\"panthera blytheae lion\"", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery(); + assertEquals("spanNear([" + + "spanOr([" + + "(spanNear([t_pick_best_boosted_foo:oldest, t_pick_best_boosted_foo:ancient, t_pick_best_boosted_foo:panthera], 0, true))^0.45," + + " spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:blytheae], 0, true)])," + + " spanOr([" + + "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:leo], 0, 
true))^0.9," + + " (spanNear([t_pick_best_boosted_foo:simba, t_pick_best_boosted_foo:leo], 0, true))^0.8," + + " (t_pick_best_boosted_foo:kimba)^0.75])], 0, true)", q.toString()); + } + + public void testSynonymsBoost_BoostMissing_shouldAssignDefaultBoost() throws Exception { + //leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 + Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boosted_foo"))).getQuery(); + assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)", q.toString()); + q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boosted_foo"))).getQuery(); + assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)", q.toString()); } @Test diff --git a/solr/solr-ref-guide/src/filter-descriptions.adoc b/solr/solr-ref-guide/src/filter-descriptions.adoc index daa1f85dfe16..f4f6cb7a3008 100644 --- a/solr/solr-ref-guide/src/filter-descriptions.adoc +++ b/solr/solr-ref-guide/src/filter-descriptions.adoc @@ -398,6 +398,72 @@ Discard original token (`inject="false"`). Note that "Kuczewski" has two encodings, which are added at the same position. +== Delimited Boost Filter + +This filter adds a numeric floating point boost value to tokens, splitting on a delimiter character. + +*Factory class:* `solr.DelimitedBoostTokenFilterFactory` + +*Arguments:* + +`delimiter`:: The character used to separate the token and the boost. Defaults to '|'. 
+ +*Example:* + +[.dynamic-tabs] +-- +[example.tab-pane#byname-filter-delimitedBoost] +==== +[.tab-label]*With name* +[source,xml] +---- + + + + +---- +==== +[example.tab-pane#byclass-filter-delimitedBoost] +==== +[.tab-label]*With class name (legacy)* +[source,xml] +---- + + + + +---- +==== +-- + +*In:* "leopard|0.5 panthera uncia|0.9" + +*Tokenizer to Filter:* "leopard|0.5"(1), "panthera"(2), "uncia|0.9"(3) + +*Out:* "leopard"(1)[0.5], "panthera"(2), "uncia"(3)[0.9] + +The numeric floating point value in square brackets is the token's float boost attribute. + +*Example:* + +Using a different delimiter (`delimiter="/"`). + +[source,xml] +---- + + + + +---- + +*In:* "leopard/0.5 panthera uncia/0.9" + +*Tokenizer to Filter:* "leopard/0.5"(1), "panthera"(2), "uncia/0.9"(3) + +*Out:* "leopard"(1)[0.5], "panthera"(2), "uncia"(3)[0.9] + +*N.B.* Make sure the delimiter is compatible with the tokenizer you use. + == Edge N-Gram Filter   This filter generates edge n-gram tokens of sizes within the given range. @@ -2292,6 +2358,39 @@ small => tiny,teeny,weeny  *Out:* "the"(1), "large"(2), "large"(3), "couch"(4), "sofa"(4), "divan"(4)  +*Weighted Synonyms:* + +Combining the Delimited Boost Filter with the Synonym Graph Filter, you can achieve weighted synonyms at query time. +For more information, see: +https://sease.io/2020/02/introducing-weighted-synonyms-in-apache-lucene.html +For the following examples, assume a synonyms file named `boostedSynonyms.txt`: + +[source,text] +---- +leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85 +lion => panthera leo|0.9, simba|0.8, kimba|0.75 +---- + +*Example:* + +==== +[.tab-label]*With name* +[source,xml] +---- + + + + + +---- +==== + +*In:* "lion" + +*Tokenizer to Filter:* "lion"(1) + +*Out:* "panthera"(1), "leo"(2)[0.9], "simba"(1)[0.8], "kimba"(1)[0.75] + == Token Offset Payload Filter   This filter adds the numeric character offsets of the token as a payload value for that token.