apache · romseygeek · Feb 24, 2020 · Apr 19, 2018 · Apr 19, 2018 · Apr 20, 2018
diff --git a/.../analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java b/.../analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.boost;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.search.BoostAttribute;
+
+import java.io.IOException;
+
+
+/**
+ * Token filter that reads a per-token boost encoded after a delimiter character.
+ * <p>
+ * Each term is split at the first delimiter: the text before it stays as the term, the text after
+ * it is parsed as a float and stored in the {@link BoostAttribute}. With '|' as delimiter,
+ * "foo|0.7" becomes the term "foo" with boost 0.7. Terms without a delimiter are not modified.
+ * The upstream Tokenizer must not split on the delimiter, or this filter never sees the boost.
+ */
+public final class DelimitedBoostTokenFilter extends TokenFilter {
+  private final char delimiter;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final BoostAttribute boostAtt = addAttribute(BoostAttribute.class);
+
+  /** Wraps {@code input}, using {@code delimiter} to separate a term from its boost. */
+  public DelimitedBoostTokenFilter(TokenStream input, char delimiter) {
+    super(input);
+    this.delimiter = delimiter;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    final char[] chars = termAtt.buffer();
+    final int end = termAtt.length();
+    int split = 0;
+    while (split < end && chars[split] != delimiter) {
+      split++;
+    }
+    if (split < end) {
+      boostAtt.setBoost(Float.parseFloat(new String(chars, split + 1, end - (split + 1))));
+      termAtt.setLength(split);
+    }
+    return true;
+  }
+}
diff --git a/...is/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java b/...is/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.boost;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link DelimitedBoostTokenFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.DelimitedBoostTokenFilterFactory" delimiter="|"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ * @lucene.spi {@value #NAME}
+ */
+public class DelimitedBoostTokenFilterFactory extends TokenFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "delimitedBoost";
+  /** Name of the argument that configures the delimiter character. */
+  public static final String DELIMITER_ATTR = "delimiter";
+  /** Fallback delimiter handed to {@code getChar} for the {@link #DELIMITER_ATTR} argument. */
+  public static final char DEFAULT_DELIMITER = '|';
+
+  private final char delimiter;
+
+  /**
+   * Creates a new DelimitedBoostTokenFilterFactory; fails if {@code args} holds unknown keys.
+   */
+  public DelimitedBoostTokenFilterFactory(Map<String, String> args) {
+    super(args);
+    delimiter = getChar(args, DELIMITER_ATTR, DEFAULT_DELIMITER);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public DelimitedBoostTokenFilter create(TokenStream input) {
+    return new DelimitedBoostTokenFilter(input, delimiter);
+  }
+
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Provides various convenience classes for creating boosts on Tokens.
+ */
+package org.apache.lucene.analysis.boost;
diff --git a/...common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/...common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -17,6 +17,7 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory
 org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
 org.apache.lucene.analysis.ar.ArabicStemFilterFactory
 org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
+org.apache.lucene.analysis.boost.DelimitedBoostTokenFilterFactory
 org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory
 org.apache.lucene.analysis.bn.BengaliStemFilterFactory
 org.apache.lucene.analysis.br.BrazilianStemFilterFactory

diff --git a/...lysis/common/src/test/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterTest.java b/...lysis/common/src/test/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.boost;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.search.BoostAttribute;
+
+public class DelimitedBoostTokenFilterTest extends BaseTokenStreamTestCase {
+
+  /** Checks terms and boosts using attribute instances fetched once before iterating. */
+  public void testBoosts() throws Exception {
+    String text = "The quick|0.4 red|0.5 fox|0.2 jumped|0.1 over the lazy|0.8 brown|0.9 dogs|0.9";
+    // Expected output in stream order; tokens without a delimiter are expected at boost 1.0f.
+    String[] terms = {"The", "quick", "red", "fox", "jumped", "over", "the", "lazy", "brown", "dogs"};
+    float[] boosts = {1.0f, 0.4f, 0.5f, 0.2f, 0.1f, 1.0f, 1.0f, 0.8f, 0.9f, 0.9f};
+
+    DelimitedBoostTokenFilter filter = new DelimitedBoostTokenFilter(
+        whitespaceMockTokenizer(text), DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER);
+    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+    BoostAttribute boostAtt = filter.addAttribute(BoostAttribute.class);
+    filter.reset();
+    for (int i = 0; i < terms.length; i++) {
+      assertTermEquals(terms[i], filter, termAtt, boostAtt, boosts[i]);
+    }
+    assertFalse(filter.incrementToken());
+    filter.end();
+    filter.close();
+  }
+
+  /** Checks terms and boosts when the attributes are looked up again for every token. */
+  public void testNext() throws Exception {
+    String text = "The quick|0.1 red|0.2 fox|0.3 jumped|0.4 over the lazy|0.5 brown|0.6 dogs|0.6";
+    // Expected output in stream order; tokens without a delimiter are expected at boost 1.0f.
+    String[] terms = {"The", "quick", "red", "fox", "jumped", "over", "the", "lazy", "brown", "dogs"};
+    float[] boosts = {1.0f, 0.1f, 0.2f, 0.3f, 0.4f, 1.0f, 1.0f, 0.5f, 0.6f, 0.6f};
+
+    DelimitedBoostTokenFilter filter = new DelimitedBoostTokenFilter(
+        whitespaceMockTokenizer(text), DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER);
+    filter.reset();
+    for (int i = 0; i < terms.length; i++) {
+      assertTermEquals(terms[i], filter, boosts[i]);
+    }
+    assertFalse(filter.incrementToken());
+    filter.end();
+    filter.close();
+  }
+
+  /**
+   * Looks up the term and boost attributes on {@code stream}, then advances it by one token and
+   * checks that token's term text and boost.
+   */
+  void assertTermEquals(String expected, TokenStream stream, float expectedBoost) throws Exception {
+    CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
+    BoostAttribute boost = stream.addAttribute(BoostAttribute.class);
+    assertTermEquals(expected, stream, term, boost, expectedBoost);
+  }
+
+  /**
+   * Advances {@code stream} by one token and checks the term text and boost read from the given
+   * attributes. The boost is compared with {@code ==}, i.e. it must match exactly.
+   */
+  void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, BoostAttribute boostAtt, float expectedBoost) throws Exception {
+    assertTrue(stream.incrementToken());
+    assertEquals(expected, termAtt.toString());
+    final float found = boostAtt.getBoost();
+    final boolean sameBoost = found == expectedBoost;
+    assertTrue(found + " does not equal: " + expectedBoost, sameBoost);
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java b/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java
@@ -32,6 +32,7 @@
  * @lucene.internal
  */
 public interface BoostAttribute extends Attribute {
+  float DEFAULT_BOOST = 1.0f;
   /** Sets the boost in this attribute */
   public void setBoost(float boost);
   /** Retrieves the boost, default is {@code 1.0f}. */