[SOLR-12238] Synonym Queries boost #357

Merged: 41 commits, Feb 24, 2020
Commits (41)
74bbc02
[SOLR-12238] Synonym Queries boost by payload + tests
alessandrobenedetti Apr 19, 2018
cacf48a
[SOLR-12238] query builder style revert
alessandrobenedetti Apr 19, 2018
ed3aa61
[SOLR-12238] query edismax boost test
alessandrobenedetti Apr 20, 2018
5c1e50c
[SOLR-12238] comments correction
alessandrobenedetti Apr 20, 2018
3d7ad62
[SOLR-12238] use PayloadHelper to decode the payload
alessandrobenedetti Apr 26, 2018
3d77474
[SOLR-12238] tests to verify managed synonym handle weighted synonyms…
alessandrobenedetti Apr 27, 2018
e02ed0d
[SOLR-12238] minor style
alessandrobenedetti Apr 27, 2018
2e64120
[SOLR-12238] precommit fixes
alessandrobenedetti May 3, 2018
9abd7ab
Merge remote-tracking branch 'upstream/master' into SOLR-12238
alessandrobenedetti May 4, 2018
3258a1c
Merge branch 'upstreamMaster' into SOLR-12238
alessandrobenedetti Jan 24, 2020
42f9a99
[SOLR-12238] merge conflicts fixed
alessandrobenedetti Jan 24, 2020
6113658
[SOLR-12238] temp commit
alessandrobenedetti Jan 24, 2020
b44be68
[SOLR-12238] re-design
alessandrobenedetti Jan 24, 2020
f5567a6
[SOLR-12238] re-design
alessandrobenedetti Jan 24, 2020
89e55d1
[SOLR-12238] re-design and refinement
alessandrobenedetti Jan 25, 2020
5321a93
[SOLR-12238] re-design and refinement
alessandrobenedetti Jan 25, 2020
60551b4
[SOLR-12238] minor fix for a pre-commit
alessandrobenedetti Jan 25, 2020
e0f23b1
[SOLR-12238] schema simplification
alessandrobenedetti Jan 25, 2020
e715fb1
[SOLR-12238] Refactor based on PR comments
alessandrobenedetti Jan 27, 2020
6d2d291
[SOLR-12238] Extract the functionality to Solr classes keeping Lucene…
alessandrobenedetti Jan 27, 2020
6d9f1ee
[SOLR-12238] documentation improvement
alessandrobenedetti Jan 27, 2020
7f70c00
LUCENE-9171: Add BoostAttribute handling to QueryBuilder
romseygeek Feb 3, 2020
96e1ed3
imports
romseygeek Feb 3, 2020
8f349f0
javadocs
romseygeek Feb 3, 2020
edcdb8a
Merge remote-tracking branch 'AlanCommitter/queryparser/boosts' into …
alessandrobenedetti Feb 7, 2020
1d8cd9a
Merge remote-tracking branch 'upstream/master' into SOLR-12238
alessandrobenedetti Feb 7, 2020
9928a4a
[SOLR-12238] first implementation of the boostAttribute approach
alessandrobenedetti Feb 7, 2020
71b5a43
[SOLR-12238] refinement of the boostAttribute approach
alessandrobenedetti Feb 7, 2020
a75d618
[SOLR-12238] test for boost token filter
alessandrobenedetti Feb 7, 2020
61995cd
[SOLR-12238] package info fix
alessandrobenedetti Feb 7, 2020
52d848a
[SOLR-12238] adjustments of the github PR feedback
alessandrobenedetti Feb 7, 2020
6da4b82
[SOLR-12238] adjustments of the github PR feedback
alessandrobenedetti Feb 7, 2020
b6cbace
[SOLR-12238] adjustments of the github PR feedback
alessandrobenedetti Feb 10, 2020
54225c8
[SOLR-12238] adjustments of the github PR feedback
alessandrobenedetti Feb 11, 2020
bfdaf9d
[SOLR-12238] docs
alessandrobenedetti Feb 13, 2020
9d9f1c6
[SOLR-12238] docs
alessandrobenedetti Feb 13, 2020
1881390
[SOLR-12238] minor spi name fix
alessandrobenedetti Feb 13, 2020
81a4217
[SOLR-12238] adjustments of the github PR feedback
alessandrobenedetti Feb 13, 2020
8b8cb99
[SOLR-12238] adjustments of the github PR feedback
alessandrobenedetti Feb 13, 2020
899553b
Merge remote-tracking branch 'upstream/master' into SOLR-12238
alessandrobenedetti Feb 13, 2020
e4da3fb
[SOLR-12238] minor comment fix
alessandrobenedetti Feb 13, 2020
DelimitedBoostTokenFilter.java (new file)
@@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.boost;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.BoostAttribute;

import java.io.IOException;


/**
 * Characters before the delimiter are the "token"; those after are the boost.
 * <p>
 * For example, if the delimiter is '|', then for the string "foo|0.7", foo is the token
 * and 0.7 is the boost.
 * <p>
 * Note: make sure your Tokenizer doesn't split on the delimiter, or this won't work.
 */
public final class DelimitedBoostTokenFilter extends TokenFilter {
  private final char delimiter;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final BoostAttribute boostAtt = addAttribute(BoostAttribute.class);

  public DelimitedBoostTokenFilter(TokenStream input, char delimiter) {
    super(input);
    this.delimiter = delimiter;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      final char[] buffer = termAtt.buffer();
      final int length = termAtt.length();
      for (int i = 0; i < length; i++) {
        if (buffer[i] == delimiter) {
          float boost = Float.parseFloat(new String(buffer, i + 1, (length - (i + 1))));
          boostAtt.setBoost(boost);
          termAtt.setLength(i);
          return true;
        }
      }
      return true;
    } else {
      return false;
    }
  }
}
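
Editor's sketch (not part of the patch): a minimal standalone consumer of the new filter, assuming a WhitespaceTokenizer so the '|' delimiter survives tokenization; the class name and sample input are illustrative.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.boost.DelimitedBoostTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.BoostAttribute;

public class DelimitedBoostDemo {
  public static void main(String[] args) throws IOException {
    // Whitespace tokenization keeps the '|' delimiter inside each token.
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("wifi wireless|0.8"));
    try (TokenStream stream = new DelimitedBoostTokenFilter(tokenizer, '|')) {
      CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
      BoostAttribute boost = stream.addAttribute(BoostAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        // Prints "wifi -> 1.0" (no delimiter, default boost) and "wireless -> 0.8".
        System.out.println(term + " -> " + boost.getBoost());
      }
      stream.end();
    }
  }
}

Tokens without a delimiter keep the default boost of 1.0f, as the tests further down confirm.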
DelimitedBoostTokenFilterFactory.java (new file)
@@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.boost;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

import java.util.Map;

/**
* Factory for {@link DelimitedBoostTokenFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.DelimitedBoostTokenFilterFactory" delimiter="|"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* @lucene.spi {@value #NAME}
*/
public class DelimitedBoostTokenFilterFactory extends TokenFilterFactory {

  /** SPI name */
  public static final String NAME = "delimitedBoost";
  public static final String DELIMITER_ATTR = "delimiter";
  public static final char DEFAULT_DELIMITER = '|';

  private final char delimiter;

  /** Creates a new DelimitedBoostTokenFilterFactory */
  public DelimitedBoostTokenFilterFactory(Map<String, String> args) {
    super(args);
    delimiter = getChar(args, DELIMITER_ATTR, DEFAULT_DELIMITER);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public DelimitedBoostTokenFilter create(TokenStream input) {
    return new DelimitedBoostTokenFilter(input, delimiter);
  }

}
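
Editor's sketch (not part of the patch): the factory can also be wired into a programmatic Analyzer; the anonymous Analyzer and the args map below are assumptions that mirror the schema example in the class javadoc.

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.boost.DelimitedBoostTokenFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

public class BoostedAnalyzerSketch {
  public static Analyzer create() {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        // The "delimiter" key matches DELIMITER_ATTR; the factory consumes it
        // and rejects any leftover, unknown parameters.
        Map<String, String> args = new HashMap<>();
        args.put("delimiter", "|");
        DelimitedBoostTokenFilterFactory factory = new DelimitedBoostTokenFilterFactory(args);
        return new TokenStreamComponents(source, factory.create(source));
      }
    };
  }
}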
package-info.java for org.apache.lucene.analysis.boost (new file)
@@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* Provides various convenience classes for creating boosts on Tokens.
*/
package org.apache.lucene.analysis.boost;
Contributor:

While I can see why you chose a new "boost" sub-package, because the payload-based filter from which you drew inspiration was in a "payload" sub-package, I lean towards the "miscellaneous" package. Note that DelimitedTermFrequencyTokenFilter is in "miscellaneous" too. WDYT @romseygeek? Or maybe we need a new "delimited" sub-package for all these to go; I dunno.

Contributor:

I like the boost package - I'm already thinking about a TypeToBoostTokenFilter that would automatically boost tokens marked with a SYNONYM type for example, and there are probably other boosting filters we can come up with, so a package to collect them all makes sense to me. I prefer to group packages by functionality rather than implementation.

Contributor Author:

So let's keep the boost package then? No strong opinion on my side.

org.apache.lucene.analysis.util.TokenFilterFactory SPI registration file
@@ -17,6 +17,7 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory
org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
org.apache.lucene.analysis.ar.ArabicStemFilterFactory
org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
org.apache.lucene.analysis.boost.DelimitedBoostTokenFilterFactory
org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory
org.apache.lucene.analysis.bn.BengaliStemFilterFactory
org.apache.lucene.analysis.br.BrazilianStemFilterFactory
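
Editor's sketch (not part of the patch): the SPI entry added above is what allows lookup by the short name "delimitedBoost"; the helper class below is hypothetical and only illustrates that lookup.

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class SpiLookupSketch {
  public static TokenStream buildBoostedStream(Tokenizer source) {
    Map<String, String> args = new HashMap<>();
    args.put("delimiter", "|");
    // Resolves DelimitedBoostTokenFilterFactory through the SPI entry above.
    TokenFilterFactory factory = TokenFilterFactory.forName("delimitedBoost", args);
    return factory.create(source);
  }
}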
DelimitedBoostTokenFilterTest.java (new file)
@@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.boost;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.BoostAttribute;

public class DelimitedBoostTokenFilterTest extends BaseTokenStreamTestCase {

  public void testBoosts() throws Exception {
    String test = "The quick|0.4 red|0.5 fox|0.2 jumped|0.1 over the lazy|0.8 brown|0.9 dogs|0.9";
    DelimitedBoostTokenFilter filter = new DelimitedBoostTokenFilter(
        whitespaceMockTokenizer(test), DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER);
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    BoostAttribute boostAtt = filter.addAttribute(BoostAttribute.class);
    filter.reset();
    assertTermEquals("The", filter, termAtt, boostAtt, 1.0f);
    assertTermEquals("quick", filter, termAtt, boostAtt, 0.4f);
    assertTermEquals("red", filter, termAtt, boostAtt, 0.5f);
    assertTermEquals("fox", filter, termAtt, boostAtt, 0.2f);
    assertTermEquals("jumped", filter, termAtt, boostAtt, 0.1f);
    assertTermEquals("over", filter, termAtt, boostAtt, 1.0f);
    assertTermEquals("the", filter, termAtt, boostAtt, 1.0f);
    assertTermEquals("lazy", filter, termAtt, boostAtt, 0.8f);
    assertTermEquals("brown", filter, termAtt, boostAtt, 0.9f);
    assertTermEquals("dogs", filter, termAtt, boostAtt, 0.9f);
    assertFalse(filter.incrementToken());
    filter.end();
    filter.close();
  }

  public void testNext() throws Exception {
    String test = "The quick|0.1 red|0.2 fox|0.3 jumped|0.4 over the lazy|0.5 brown|0.6 dogs|0.6";
    DelimitedBoostTokenFilter filter = new DelimitedBoostTokenFilter(
        whitespaceMockTokenizer(test), DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER);
    filter.reset();
    assertTermEquals("The", filter, 1.0f);
    assertTermEquals("quick", filter, 0.1f);
    assertTermEquals("red", filter, 0.2f);
    assertTermEquals("fox", filter, 0.3f);
    assertTermEquals("jumped", filter, 0.4f);
    assertTermEquals("over", filter, 1.0f);
    assertTermEquals("the", filter, 1.0f);
    assertTermEquals("lazy", filter, 0.5f);
    assertTermEquals("brown", filter, 0.6f);
    assertTermEquals("dogs", filter, 0.6f);
    assertFalse(filter.incrementToken());
    filter.end();
    filter.close();
  }

  void assertTermEquals(String expected, TokenStream stream, float expectedBoost) throws Exception {
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
    assertTrue(stream.incrementToken());
    assertEquals(expected, termAtt.toString());
    float actualBoost = boostAtt.getBoost();
    assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost);
  }

  void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, BoostAttribute boostAtt, float expectedBoost) throws Exception {
    assertTrue(stream.incrementToken());
    assertEquals(expected, termAtt.toString());
    float actualBoost = boostAtt.getBoost();
    assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost);
  }
}
BoostAttribute.java
@@ -32,6 +32,7 @@
* @lucene.internal
*/
public interface BoostAttribute extends Attribute {
float DEFAULT_BOOST = 1.0f;
/** Sets the boost in this attribute */
public void setBoost(float boost);
/** Retrieves the boost, default is {@code 1.0f}. */
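
Editor's sketch (not part of the patch): the new DEFAULT_BOOST constant lets a query builder distinguish boosted tokens from unboosted ones, which is roughly the pattern QueryBuilder follows after LUCENE-9171; the class, field name, and analyzer below are illustrative assumptions.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;

public class BoostAwareTermCollector {
  // Builds one TermQuery per token, wrapping it in a BoostQuery only when the
  // token stream reports a boost different from BoostAttribute.DEFAULT_BOOST.
  public static List<Query> collect(Analyzer analyzer, String field, String text) throws IOException {
    List<Query> clauses = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(field, text)) {
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        Query q = new TermQuery(new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef())));
        float boost = boostAtt.getBoost();
        if (boost != BoostAttribute.DEFAULT_BOOST) {
          q = new BoostQuery(q, boost);
        }
        clauses.add(q);
      }
      stream.end();
    }
    return clauses;
  }
}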