Skip to content

Commit

Permalink
GITHUB-12291: Skip blank lines from stopwords list. (#12299)
Browse files: browse the repository at this point in the history
  • Loading branch information
JerryChin authored and uschindler committed May 18, 2023
1 parent 59110fc commit d1db558
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 13 deletions.
3 changes: 2 additions & 1 deletion lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ Optimizations

Bug Fixes
---------------------
(No changes)

* GITHUB#12291: Skip blank lines from stopwords list. (Jerry Chin)

Other
---------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,5 @@ $
// the line below contains an IDEOGRAPHIC SPACE character (Used as a space in Chinese)


//////////////// English Stop Words ////////////////

//////////////// Chinese Stop Words ////////////////
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ public class WordlistLoader {
private WordlistLoader() {}

/**
* Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting leading
* and trailing whitespace). Every line of the Reader should contain only one word. The words need
* to be in lowercase if you make use of an Analyzer which uses LowerCaseFilter (like
* Reads lines from a Reader and adds every non-blank line as an entry to a CharArraySet (omitting
* leading and trailing whitespace). Every line of the Reader should contain only one word. The
* words need to be in lowercase if you make use of an Analyzer which uses LowerCaseFilter (like
* StandardAnalyzer).
*
* @param reader Reader containing the wordlist
Expand All @@ -53,7 +53,10 @@ public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws
try (BufferedReader br = getBufferedReader(reader)) {
String word = null;
while ((word = br.readLine()) != null) {
result.add(word.trim());
word = word.trim();
// skip blank lines
if (word.isEmpty()) continue;
result.add(word);
}
}
return result;
Expand Down Expand Up @@ -101,10 +104,10 @@ public static CharArraySet getWordSet(InputStream stream, Charset charset) throw
}

/**
* Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet
* (omitting leading and trailing whitespace). Every line of the Reader should contain only one
* word. The words need to be in lowercase if you make use of an Analyzer which uses
* LowerCaseFilter (like StandardAnalyzer).
* Reads lines from a Reader and adds every non-blank non-comment line as an entry to a
* CharArraySet (omitting leading and trailing whitespace). Every line of the Reader should
* contain only one word. The words need to be in lowercase if you make use of an Analyzer which
* uses LowerCaseFilter (like StandardAnalyzer).
*
* @param reader Reader containing the wordlist
* @param comment The string representing a comment.
Expand All @@ -117,7 +120,10 @@ public static CharArraySet getWordSet(Reader reader, String comment, CharArraySe
String word = null;
while ((word = br.readLine()) != null) {
if (word.startsWith(comment) == false) {
result.add(word.trim());
word = word.trim();
// skip blank lines
if (word.isEmpty()) continue;
result.add(word);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
public class TestWordlistLoader extends LuceneTestCase {

public void testWordlistLoading() throws IOException {
String s = "ONE\n two \nthree";
String s = "ONE\n two \nthree\n\n";
CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s));
checkSet(wordSet1);
CharArraySet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
Expand Down

0 comments on commit d1db558

Please sign in to comment.