Skip to content

Commit

Permalink
LUCENE-10312: Add PersianStemmer (#540)
Browse files Browse the repository at this point in the history
Co-authored-by: Tomoko Uchida <tomoko.uchida.1111@gmail.com>
  • Loading branch information
raminmjj and mocobeta committed May 7, 2022
1 parent 749bfb8 commit 9f04771
Show file tree
Hide file tree
Showing 9 changed files with 350 additions and 1 deletion.
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ API Changes
New Features
---------------------

* LUCENE-10312: Add PersianStemmer based on the Arabic stemmer. (Ramin Alirezaee)

* LUCENE-10539: Return a stream of completions from FSTCompletion. (Dawid Weiss)

* LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
Expand Down
1 change: 1 addition & 0 deletions lucene/analysis/common/src/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@
org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory,
org.apache.lucene.analysis.es.SpanishPluralStemFilterFactory,
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory,
org.apache.lucene.analysis.fa.PersianStemFilterFactory,
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory,
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory,
org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.IOUtils;

Expand Down Expand Up @@ -86,6 +87,8 @@ private static class DefaultSetHolder {
}
}

private final CharArraySet stemExclusionSet;

/** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
public PersianAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET);
Expand All @@ -97,7 +100,19 @@ public PersianAnalyzer() {
* @param stopwords a stopword set
*/
public PersianAnalyzer(CharArraySet stopwords) {
// Delegate to the two-argument constructor with an empty stem exclusion set.
this(stopwords, CharArraySet.EMPTY_SET);
}

/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is provided,
* this analyzer will add a {@link SetKeywordMarkerFilter} before {@link PersianStemFilter}.
*
* @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed
*/
public PersianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords);
// Store an unmodifiable copy of the supplied exclusion set.
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
}

/**
Expand All @@ -121,7 +136,11 @@ protected TokenStreamComponents createComponents(String fieldName) {
* the order here is important: the stopword list is normalized with the
* above!
*/
return new TokenStreamComponents(source, new StopFilter(result, stopwords));
result = new StopFilter(result, stopwords);
if (!stemExclusionSet.isEmpty()) {
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
}
return new TokenStreamComponents(source, new PersianStemFilter(result));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.fa;

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

/**
 * A {@link TokenFilter} that runs every non-keyword token through {@link PersianStemmer}.
 *
 * <p>Tokens whose {@link KeywordAttribute} is set are passed through untouched. To protect
 * specific terms from stemming, place a {@link SetKeywordMarkerFilter} (or any custom {@link
 * TokenFilter} that sets the {@link KeywordAttribute}) ahead of this {@link TokenStream}.
 *
 * @see SetKeywordMarkerFilter
 */
public final class PersianStemFilter extends TokenFilter {
  private final PersianStemmer stemmer = new PersianStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Wraps the given stream with Persian stemming.
   *
   * @param input the upstream token stream
   */
  public PersianStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream and stems the current term in place unless it is marked as a
   * keyword.
   *
   * @return {@code true} if a token is available, {@code false} once the input is exhausted
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAttr.isKeyword()) {
      // Keyword-marked terms are never stemmed.
      return true;
    }
    // The stemmer edits the term buffer in place and reports the shortened length.
    termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
    return true;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.fa;

import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;

/**
 * Factory that creates {@link PersianStemFilter} instances. It accepts no configuration
 * parameters.
 *
 * @since 9.2
 * @lucene.spi {@value #NAME}
 */
public class PersianStemFilterFactory extends TokenFilterFactory {

  /** SPI name */
  public static final String NAME = "persianStem";

  /**
   * Creates a new PersianStemFilterFactory.
   *
   * @param args factory arguments; any entry still present after the superclass constructor has
   *     run is rejected
   * @throws IllegalArgumentException if {@code args} is not empty after the superclass constructor
   */
  public PersianStemFilterFactory(Map<String, String> args) {
    super(args);
    if (args.isEmpty()) {
      return;
    }
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }

  /** Default ctor for compatibility with SPI; always throws. */
  public PersianStemFilterFactory() {
    throw defaultCtorException();
  }

  /**
   * Wraps the given stream in a {@link PersianStemFilter}.
   *
   * @param input the stream to stem
   * @return a new stem filter reading from {@code input}
   */
  @Override
  public PersianStemFilter create(TokenStream input) {
    return new PersianStemFilter(input);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.fa;

import static org.apache.lucene.analysis.util.StemmerUtil.*;

import java.util.Arrays;

/**
 * Suffix-stripping stemmer for Persian.
 *
 * <p>Stemming is done in-place for efficiency, operating on a term buffer.
 *
 * <p>The stemmer walks a fixed, ordered table of suffixes exactly once. Each suffix that matches
 * the current end of the term is removed, provided at least two characters remain afterwards.
 * Because the table is traversed in order, several suffixes may be stripped from one term. No
 * prefixes are removed.
 */
public class PersianStemmer {
  private static final char ALEF = '\u0627';
  private static final char HEH = '\u0647';
  private static final char TEH = '\u062A';
  private static final char REH = '\u0631';
  private static final char NOON = '\u0646';
  private static final char YEH = '\u064A';
  private static final char ZWNJ = '\u200c'; // ZERO WIDTH NON-JOINER character

  /** Suffixes to strip, in the order they are tried. */
  private static final char[][] suffixes = {
    {ALEF, TEH},
    {ALEF, NOON},
    {TEH, REH, YEH, NOON},
    {TEH, REH},
    {YEH, YEH},
    {YEH},
    {HEH, ALEF},
    {ZWNJ},
  };

  /**
   * Stem an input buffer of Persian text.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after stemming
   */
  public int stem(char[] s, int len) {
    return stemSuffix(s, len);
  }

  /**
   * Strips every matching suffix from the end of the term, trying each table entry once in table
   * order.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return new length of input buffer after stemming
   */
  private int stemSuffix(char[] s, int len) {
    int remaining = len;
    for (int i = 0; i < suffixes.length; i++) {
      final char[] candidate = suffixes[i];
      if (!endsWithCheckLength(s, remaining, candidate)) {
        continue;
      }
      remaining = deleteN(s, remaining - candidate.length, remaining, candidate.length);
    }
    return remaining;
  }

  /**
   * Tells whether the term ends with {@code suffix} and is long enough to lose it.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @param suffix suffix to check
   * @return true if the suffix matches and can be stemmed
   */
  private boolean endsWithCheckLength(char[] s, int len, char[] suffix) {
    // At least two characters must be left once the suffix is gone.
    final boolean tooShort = suffix.length + 2 > len;
    if (tooShort) {
      return false;
    }

    final int suffixStart = len - suffix.length;
    return Arrays.equals(s, suffixStart, len, suffix, 0, suffix.length);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory
org.apache.lucene.analysis.es.SpanishPluralStemFilterFactory
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
org.apache.lucene.analysis.fa.PersianStemFilterFactory
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;

/** Simple tests to ensure the Persian normalization factory is working. */
Expand All @@ -31,6 +32,15 @@ public void testNormalization() throws Exception {
assertTokenStreamContents(stream, new String[] {"هاي"});
}

/** Checks that the PersianStem factory, chained after PersianNormalization, stems tokens. */
public void testStemmer() throws Exception {
  Reader input = new StringReader("کتابها بهترین دوستان");
  Tokenizer source = whitespaceMockTokenizer(input);
  // Normalize first, then stem the normalized tokens.
  TokenStream stemmed =
      tokenFilterFactory("PersianStem")
          .create(tokenFilterFactory("PersianNormalization").create(source));
  assertTokenStreamContents(stemmed, new String[] {"كتاب", "به", "دوست"});
}

/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected =
Expand All @@ -40,5 +50,13 @@ public void testBogusArguments() throws Exception {
tokenFilterFactory("PersianNormalization", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));

expected =
expectThrows(
IllegalArgumentException.class,
() -> {
tokenFilterFactory("PersianStem", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
Loading

0 comments on commit 9f04771

Please sign in to comment.