-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
LUCENE-10312: Add PersianStemmer (#540)
Co-authored-by: Tomoko Uchida <tomoko.uchida.1111@gmail.com>
- Loading branch information
Showing 9 changed files with 350 additions and 1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
56 changes: 56 additions & 0 deletions
56
lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianStemFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.fa; | ||
|
||
import java.io.IOException; | ||
import org.apache.lucene.analysis.TokenFilter; | ||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; | ||
|
||
/** | ||
* A {@link TokenFilter} that applies {@link PersianStemmer} to stem Persian words. | ||
* | ||
* <p>To prevent terms from being stemmed use an instance of {@link SetKeywordMarkerFilter} or a | ||
* custom {@link TokenFilter} that sets the {@link KeywordAttribute} before this {@link | ||
* TokenStream}. | ||
* | ||
* @see SetKeywordMarkerFilter | ||
*/ | ||
public final class PersianStemFilter extends TokenFilter { | ||
private final PersianStemmer stemmer = new PersianStemmer(); | ||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); | ||
|
||
public PersianStemFilter(TokenStream input) { | ||
super(input); | ||
} | ||
|
||
@Override | ||
public boolean incrementToken() throws IOException { | ||
if (input.incrementToken()) { | ||
if (!keywordAttr.isKeyword()) { | ||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); | ||
termAtt.setLength(newlen); | ||
} | ||
return true; | ||
} else { | ||
return false; | ||
} | ||
} | ||
} |
51 changes: 51 additions & 0 deletions
51
lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianStemFilterFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.fa; | ||
|
||
import java.util.Map; | ||
import org.apache.lucene.analysis.TokenFilterFactory; | ||
import org.apache.lucene.analysis.TokenStream; | ||
|
||
/** | ||
* Factory for {@link PersianStemFilter}. | ||
* | ||
* @since 9.2 | ||
* @lucene.spi {@value #NAME} | ||
*/ | ||
public class PersianStemFilterFactory extends TokenFilterFactory { | ||
|
||
/** SPI name */ | ||
public static final String NAME = "persianStem"; | ||
|
||
/** Creates a new PersianStemFilterFactory */ | ||
public PersianStemFilterFactory(Map<String, String> args) { | ||
super(args); | ||
if (!args.isEmpty()) { | ||
throw new IllegalArgumentException("Unknown parameters: " + args); | ||
} | ||
} | ||
|
||
/** Default ctor for compatibility with SPI */ | ||
public PersianStemFilterFactory() { | ||
throw defaultCtorException(); | ||
} | ||
|
||
@Override | ||
public PersianStemFilter create(TokenStream input) { | ||
return new PersianStemFilter(input); | ||
} | ||
} |
100 changes: 100 additions & 0 deletions
100
lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianStemmer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.fa; | ||
|
||
import static org.apache.lucene.analysis.util.StemmerUtil.*; | ||
|
||
import java.util.Arrays; | ||
|
||
/** | ||
* Stemmer for Persian. | ||
* | ||
* <p>Stemming is done in-place for efficiency, operating on a termbuffer. | ||
* | ||
* <p>Stemming is defined as: | ||
* | ||
* <ul> | ||
* <li>Removal of attached definite article, conjunction, and prepositions. | ||
* <li>Stemming of common suffixes. | ||
* </ul> | ||
*/ | ||
public class PersianStemmer { | ||
private static final char ALEF = '\u0627'; | ||
private static final char HEH = '\u0647'; | ||
private static final char TEH = '\u062A'; | ||
private static final char REH = '\u0631'; | ||
private static final char NOON = '\u0646'; | ||
private static final char YEH = '\u064A'; | ||
private static final char ZWNJ = '\u200c'; // ZERO WIDTH NON-JOINER character | ||
|
||
private static final char[][] suffixes = { | ||
("" + ALEF + TEH).toCharArray(), | ||
("" + ALEF + NOON).toCharArray(), | ||
("" + TEH + REH + YEH + NOON).toCharArray(), | ||
("" + TEH + REH).toCharArray(), | ||
("" + YEH + YEH).toCharArray(), | ||
("" + YEH).toCharArray(), | ||
("" + HEH + ALEF).toCharArray(), | ||
("" + ZWNJ).toCharArray(), | ||
}; | ||
|
||
/** | ||
* Stem an input buffer of Persian text. | ||
* | ||
* @param s input buffer | ||
* @param len length of input buffer | ||
* @return length of input buffer after normalization | ||
*/ | ||
public int stem(char[] s, int len) { | ||
len = stemSuffix(s, len); | ||
|
||
return len; | ||
} | ||
|
||
/** | ||
* Stem suffix(es) off a Persian word. | ||
* | ||
* @param s input buffer | ||
* @param len length of input buffer | ||
* @return new length of input buffer after stemming | ||
*/ | ||
private int stemSuffix(char[] s, int len) { | ||
for (char[] suffix : suffixes) { | ||
if (endsWithCheckLength(s, len, suffix)) { | ||
len = deleteN(s, len - suffix.length, len, suffix.length); | ||
} | ||
} | ||
|
||
return len; | ||
} | ||
|
||
/** | ||
* Returns true if the suffix matches and can be stemmed | ||
* | ||
* @param s input buffer | ||
* @param len length of input buffer | ||
* @param suffix suffix to check | ||
* @return true if the suffix matches and can be stemmed | ||
*/ | ||
private boolean endsWithCheckLength(char[] s, int len, char[] suffix) { | ||
if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming | ||
return false; | ||
} | ||
|
||
return Arrays.equals(s, len - suffix.length, len, suffix, 0, suffix.length); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.