LUCENE-10312: Add PersianStemmer (#540)

Co-authored-by: Tomoko Uchida <tomoko.uchida.1111@gmail.com>
apache · May 7, 2022 · 9f04771 · 9f04771
1 parent 749bfb8
commit 9f04771
Show file tree

Hide file tree

Showing 9 changed files with 350 additions and 1 deletion.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -22,6 +22,8 @@ API Changes
 New Features
 ---------------------
 
+* LUCENE-10312: Add PersianStemmer based on the Arabic stemmer. (Ramin Alirezaee)
+
 * LUCENE-10539: Return a stream of completions from FSTCompletion. (Dawid Weiss)
 
 * LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery

diff --git a/lucene/analysis/common/src/java/module-info.java b/lucene/analysis/common/src/java/module-info.java
@@ -188,6 +188,7 @@
       org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory,
       org.apache.lucene.analysis.es.SpanishPluralStemFilterFactory,
       org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory,
+      org.apache.lucene.analysis.fa.PersianStemFilterFactory,
       org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory,
       org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory,
       org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory,

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
@@ -29,6 +29,7 @@
 import org.apache.lucene.analysis.WordlistLoader;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
 import org.apache.lucene.analysis.core.DecimalDigitFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.util.IOUtils;
 
@@ -86,6 +87,8 @@ private static class DefaultSetHolder {
     }
   }
 
+  private final CharArraySet stemExclusionSet;
+
   /** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
   public PersianAnalyzer() {
     this(DefaultSetHolder.DEFAULT_STOP_SET);
@@ -97,7 +100,19 @@ public PersianAnalyzer() {
    * @param stopwords a stopword set
    */
   public PersianAnalyzer(CharArraySet stopwords) {
+    this(stopwords, CharArraySet.EMPTY_SET);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided
+   * this analyzer will add a {@link SetKeywordMarkerFilter} before {@link PersianStemFilter}.
+   *
+   * @param stopwords a stopword set
+   * @param stemExclusionSet a set of terms not to be stemmed
+   */
+  public PersianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
     super(stopwords);
+    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
   }
 
   /**
@@ -121,7 +136,11 @@ protected TokenStreamComponents createComponents(String fieldName) {
      * the order here is important: the stopword list is normalized with the
      * above!
      */
-    return new TokenStreamComponents(source, new StopFilter(result, stopwords));
+    result = new StopFilter(result, stopwords);
+    if (!stemExclusionSet.isEmpty()) {
+      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+    }
+    return new TokenStreamComponents(source, new PersianStemFilter(result));
   }
 
   @Override

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianStemFilter.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.fa;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link PersianStemmer} to stem Persian words.
+ *
+ * <p>Tokens whose {@link KeywordAttribute} is set are not stemmed. To protect terms, place a
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets the attribute before
+ * this {@link TokenStream}.
+ *
+ * @see SetKeywordMarkerFilter
+ */
+public final class PersianStemFilter extends TokenFilter {
+  private final PersianStemmer stemmer = new PersianStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public PersianStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      if (!keywordAttr.isKeyword()) { // keyword-flagged tokens pass through unchanged
+        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+        termAtt.setLength(newlen); // the stemmer edits the buffer in place; adopt its new length
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianStemFilterFactory.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.fa;
+
+import java.util.Map;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for {@link PersianStemFilter}.
+ *
+ * @since 9.2
+ * @lucene.spi {@value #NAME}
+ */
+public class PersianStemFilterFactory extends TokenFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "persianStem";
+
+  /** Creates a new PersianStemFilterFactory; leftover entries in {@code args} are rejected. */
+  public PersianStemFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Default ctor for compatibility with SPI; always throws {@code defaultCtorException()}. */
+  public PersianStemFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  @Override
+  public PersianStemFilter create(TokenStream input) {
+    return new PersianStemFilter(input);
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianStemmer.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.fa;
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+import java.util.Arrays;
+
+/**
+ * Stemmer for Persian.
+ *
+ * <p>Stemming is done in-place for efficiency, operating on a termbuffer.
+ *
+ * <p>Stemming is defined as:
+ *
+ * <ul>
+ *   <li>Removal of common suffixes, each tried once in the order they are listed.
+ *   <li>A suffix is only removed when at least two characters remain afterwards.
+ * </ul>
+ */
+public class PersianStemmer {
+  private static final char ALEF = '\u0627';
+  private static final char HEH = '\u0647';
+  private static final char TEH = '\u062A';
+  private static final char REH = '\u0631';
+  private static final char NOON = '\u0646';
+  private static final char YEH = '\u064A'; // U+064A ARABIC LETTER YEH, not U+06CC FARSI YEH
+  private static final char ZWNJ = '\u200c'; // ZERO WIDTH NON-JOINER character
+
+  private static final char[][] suffixes = { // tested in this order by stemSuffix
+    ("" + ALEF + TEH).toCharArray(),
+    ("" + ALEF + NOON).toCharArray(),
+    ("" + TEH + REH + YEH + NOON).toCharArray(),
+    ("" + TEH + REH).toCharArray(),
+    ("" + YEH + YEH).toCharArray(),
+    ("" + YEH).toCharArray(),
+    ("" + HEH + ALEF).toCharArray(),
+    ("" + ZWNJ).toCharArray(),
+  };
+
+  /**
+   * Stem an input buffer of Persian text.
+   *
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return length of input buffer after stemming
+   */
+  public int stem(char[] s, int len) {
+    len = stemSuffix(s, len);
+
+    return len;
+  }
+
+  /**
+   * Stem suffix(es) off a Persian word; every table entry is tested once, in table order.
+   *
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return new length of input buffer after stemming
+   */
+  private int stemSuffix(char[] s, int len) {
+    for (char[] suffix : suffixes) {
+      if (endsWithCheckLength(s, len, suffix)) {
+        len = deleteN(s, len - suffix.length, len, suffix.length);
+      }
+    }
+
+    return len;
+  }
+
+  /**
+   * Returns true if the buffer ends with the suffix and at least 2 characters would remain
+   *
+   * @param s input buffer
+   * @param len length of input buffer
+   * @param suffix suffix to check
+   * @return true if the suffix matches and can be stemmed
+   */
+  private boolean endsWithCheckLength(char[] s, int len, char[] suffix) {
+    if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming
+      return false;
+    }
+
+    return Arrays.equals(s, len - suffix.length, len, suffix, 0, suffix.length);
+  }
+}
diff --git a/...ysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/...ysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@@ -50,6 +50,7 @@ org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
 org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory
 org.apache.lucene.analysis.es.SpanishPluralStemFilterFactory
 org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
+org.apache.lucene.analysis.fa.PersianStemFilterFactory
 org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
 org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
 org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory

diff --git a/.../common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilterFactory.java b/.../common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilterFactory.java
@@ -19,6 +19,7 @@
 import java.io.Reader;
 import java.io.StringReader;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;
 
 /** Simple tests to ensure the Persian normalization factory is working. */
@@ -31,6 +32,15 @@ public void testNormalization() throws Exception {
     assertTokenStreamContents(stream, new String[] {"هاي"});
   }
 
+  /** Test PersianStemFilterFactory applied after PersianNormalization on whitespace tokens */
+  public void testStemmer() throws Exception {
+    Reader reader = new StringReader("کتابها بهترین دوستان");
+    Tokenizer tokenizer = whitespaceMockTokenizer(reader);
+    TokenStream stream = tokenFilterFactory("PersianNormalization").create(tokenizer);
+    stream = tokenFilterFactory("PersianStem").create(stream);
+    assertTokenStreamContents(stream, new String[] {"كتاب", "به", "دوست"});
+  }
+
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     IllegalArgumentException expected =
@@ -40,5 +50,13 @@ public void testBogusArguments() throws Exception {
               tokenFilterFactory("PersianNormalization", "bogusArg", "bogusValue");
             });
     assertTrue(expected.getMessage().contains("Unknown parameters"));
+
+    expected =
+        expectThrows(
+            IllegalArgumentException.class,
+            () -> {
+              tokenFilterFactory("PersianStem", "bogusArg", "bogusValue");
+            });
+    assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }