Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LUCENE-10312: Add PersianStemmer #540

Merged
merged 12 commits into from
May 7, 2022
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ API Changes
New Features
---------------------

* LUCENE-10312: Add PersianStemmer based on the Arabic stemmer. (Ramin Alirezaee)

* LUCENE-10539: Return a stream of completions from FSTCompletion. (Dawid Weiss)

* LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
Expand Down
1 change: 1 addition & 0 deletions lucene/analysis/common/src/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@
org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory,
org.apache.lucene.analysis.es.SpanishPluralStemFilterFactory,
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory,
org.apache.lucene.analysis.fa.PersianStemFilterFactory,
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory,
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory,
org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.IOUtils;

Expand Down Expand Up @@ -86,6 +87,8 @@ private static class DefaultSetHolder {
}
}

private final CharArraySet stemExclusionSet;

/** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
public PersianAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET);
Expand All @@ -97,7 +100,19 @@ public PersianAnalyzer() {
* @param stopwords a stopword set
*/
public PersianAnalyzer(CharArraySet stopwords) {
// Delegate to the two-argument constructor with an empty stem exclusion set.
this(stopwords, CharArraySet.EMPTY_SET);
}

/**
* Builds an analyzer with the given stop word. If a none-empty stem exclusion set is provided
* this analyzer will add a {@link SetKeywordMarkerFilter} before {@link PersianStemFilter}.
*
* @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed
*/
public PersianAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
}

/**
Expand All @@ -121,7 +136,11 @@ protected TokenStreamComponents createComponents(String fieldName) {
* the order here is important: the stopword list is normalized with the
* above!
*/
return new TokenStreamComponents(source, new StopFilter(result, stopwords));
result = new StopFilter(result, stopwords);
if (!stemExclusionSet.isEmpty()) {
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
}
return new TokenStreamComponents(source, new PersianStemFilter(result));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.fa;

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

/**
 * A {@link TokenFilter} that stems Persian tokens in place using {@link PersianStemmer}.
 *
 * <p>Tokens whose {@link KeywordAttribute} is set are passed through unchanged. To prevent terms
 * from being stemmed, place a {@link SetKeywordMarkerFilter} (or a custom {@link TokenFilter}
 * that sets the {@link KeywordAttribute}) before this {@link TokenStream}.
 *
 * @see SetKeywordMarkerFilter
 */
public final class PersianStemFilter extends TokenFilter {
  private final PersianStemmer stemmer = new PersianStemmer();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Creates a stem filter that wraps the given stream.
   *
   * @param input the token stream whose terms are stemmed
   */
  public PersianStemFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances the wrapped stream and, unless the token is flagged as a keyword, shortens the term
   * to the length returned by the stemmer.
   *
   * @return {@code true} if a token is available, {@code false} once the wrapped stream is
   *     exhausted
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (keywordAttr.isKeyword()) {
      // Keyword tokens are excluded from stemming.
      return true;
    }
    final char[] termBuffer = termAtt.buffer();
    final int stemmedLength = stemmer.stem(termBuffer, termAtt.length());
    termAtt.setLength(stemmedLength);
    return true;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.fa;

import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;

/**
 * Factory for {@link PersianStemFilter}.
 *
 * @since 9.2
 * @lucene.spi {@value #NAME}
 */
public class PersianStemFilterFactory extends TokenFilterFactory {

  /** SPI name */
  public static final String NAME = "persianStem";

  /**
   * Creates a new PersianStemFilterFactory.
   *
   * @param args factory arguments
   * @throws IllegalArgumentException if {@code args} is not empty once the superclass constructor
   *     has returned
   */
  public PersianStemFilterFactory(Map<String, String> args) {
    super(args);
    final boolean hasUnknownArgs = !args.isEmpty();
    if (hasUnknownArgs) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /** Default ctor for compatibility with SPI; always throws. */
  public PersianStemFilterFactory() {
    throw defaultCtorException();
  }

  /**
   * Wraps the given stream in a new {@link PersianStemFilter}.
   *
   * @param input the stream to wrap
   * @return a stem filter reading from {@code input}
   */
  @Override
  public PersianStemFilter create(TokenStream input) {
    return new PersianStemFilter(input);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.fa;

import static org.apache.lucene.analysis.util.StemmerUtil.*;

import java.util.Arrays;

/**
* Stemmer for Persian.
*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

@mocobeta mocobeta May 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found that the ArabicStemmer does not mention the algorithms or rules it is based on. As @NightOwl888 told me, this PersianStemmer is a derivative of it, so I'm fine with the javadocs as is.

* <p>Stemming is done in-place for efficiency, operating on a termbuffer.
*
* <p>Stemming is defined as:
*
* <ul>
* <li>Stemming of common suffixes. Prefixes (attached definite article, conjunction, and
* prepositions) are not removed by this stemmer.
* </ul>
*/
public class PersianStemmer {
  private static final char ALEF = '\u0627';
  private static final char HEH = '\u0647';
  private static final char TEH = '\u062A';
  private static final char REH = '\u0631';
  private static final char NOON = '\u0646';
  // NOTE(review): this is U+064A; presumably input is normalized before stemming -- confirm.
  private static final char YEH = '\u064A';
  private static final char ZWNJ = '\u200c'; // ZERO WIDTH NON-JOINER character

  /**
   * Suffixes that may be stripped. Each entry is tried once, in this order, so the order of the
   * entries determines which suffixes can be removed together from a single word.
   */
  private static final char[][] suffixes = {
    {ALEF, TEH},
    {ALEF, NOON},
    {TEH, REH, YEH, NOON},
    {TEH, REH},
    {YEH, YEH},
    {YEH},
    {HEH, ALEF},
    {ZWNJ},
  };

  /**
   * Stem an input buffer of Persian text.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after stemming
   */
  public int stem(char[] s, int len) {
    return stemSuffix(s, len);
  }

  /**
   * Strips every matching suffix from the end of the word, walking the suffix table once from
   * first to last entry. More than one suffix can be removed in a single call.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return new length of input buffer after stemming
   */
  private int stemSuffix(char[] s, int len) {
    int currentLength = len;
    for (int i = 0; i < suffixes.length; i++) {
      final char[] candidate = suffixes[i];
      if (endsWithCheckLength(s, currentLength, candidate)) {
        final int suffixStart = currentLength - candidate.length;
        currentLength = deleteN(s, suffixStart, currentLength, candidate.length);
      }
    }
    return currentLength;
  }

  /**
   * Tells whether the word ends with the suffix and is long enough for the suffix to be removed.
   *
   * @param s input buffer
   * @param len length of input buffer
   * @param suffix suffix to check
   * @return true if the suffix matches and can be stemmed
   */
  private boolean endsWithCheckLength(char[] s, int len, char[] suffix) {
    final int stemLength = len - suffix.length;
    if (stemLength < 2) { // at least 2 characters must remain after stemming
      return false;
    }
    return Arrays.equals(s, stemLength, len, suffix, 0, suffix.length);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
org.apache.lucene.analysis.es.SpanishMinimalStemFilterFactory
org.apache.lucene.analysis.es.SpanishPluralStemFilterFactory
org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
org.apache.lucene.analysis.fa.PersianStemFilterFactory
org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;

/** Simple tests to ensure the Persian normalization factory is working. */
Expand All @@ -31,6 +32,15 @@ public void testNormalization() throws Exception {
assertTokenStreamContents(stream, new String[] {"هاي"});
}

/** Checks that the PersianStem factory stems tokens that went through PersianNormalization. */
public void testStemmer() throws Exception {
  Tokenizer source = whitespaceMockTokenizer(new StringReader("کتابها بهترین دوستان"));
  TokenStream normalized = tokenFilterFactory("PersianNormalization").create(source);
  TokenStream stemmed = tokenFilterFactory("PersianStem").create(normalized);
  assertTokenStreamContents(stemmed, new String[] {"كتاب", "به", "دوست"});
}

/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected =
Expand All @@ -40,5 +50,13 @@ public void testBogusArguments() throws Exception {
tokenFilterFactory("PersianNormalization", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));

expected =
expectThrows(
IllegalArgumentException.class,
() -> {
tokenFilterFactory("PersianStem", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
Loading