GITHUB-12291: Skip blank lines from stopwords list. (#12299)

apache · May 18, 2023 · d1db558 · d1db558
1 parent 59110fc
commit d1db558
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 13 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -35,7 +35,8 @@ Optimizations
 
 Bug Fixes
 ---------------------
-(No changes)
+
+* GITHUB#12291: Skip blank lines from stopwords list. (Jerry Chin)
 
 Other
 ---------------------

diff --git a/lucene/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt b/lucene/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt
@@ -53,7 +53,5 @@ $
 ●
 // the line below contains an IDEOGRAPHIC SPACE character (Used as a space in Chinese)
 
-
 //////////////// English Stop Words ////////////////
-
 //////////////// Chinese Stop Words ////////////////
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java
@@ -40,9 +40,9 @@ public class WordlistLoader {
   private WordlistLoader() {}
 
   /**
-   * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting leading
-   * and trailing whitespace). Every line of the Reader should contain only one word. The words need
-   * to be in lowercase if you make use of an Analyzer which uses LowerCaseFilter (like
+   * Reads lines from a Reader and adds every non-blank line as an entry to a CharArraySet (omitting
+   * leading and trailing whitespace). Every line of the Reader should contain only one word. The
+   * words need to be in lowercase if you make use of an Analyzer which uses LowerCaseFilter (like
    * StandardAnalyzer).
    *
    * @param reader Reader containing the wordlist
@@ -53,7 +53,10 @@ public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws
     try (BufferedReader br = getBufferedReader(reader)) {
       String word = null;
       while ((word = br.readLine()) != null) {
-        result.add(word.trim());
+        word = word.trim();
+        // skip blank lines
+        if (word.isEmpty()) continue;
+        result.add(word);
       }
     }
     return result;
@@ -101,10 +104,10 @@ public static CharArraySet getWordSet(InputStream stream, Charset charset) throw
   }
 
   /**
-   * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet
-   * (omitting leading and trailing whitespace). Every line of the Reader should contain only one
-   * word. The words need to be in lowercase if you make use of an Analyzer which uses
-   * LowerCaseFilter (like StandardAnalyzer).
+   * Reads lines from a Reader and adds every non-blank non-comment line as an entry to a
+   * CharArraySet (omitting leading and trailing whitespace). Every line of the Reader should
+   * contain only one word. The words need to be in lowercase if you make use of an Analyzer which
+   * uses LowerCaseFilter (like StandardAnalyzer).
    *
    * @param reader Reader containing the wordlist
    * @param comment The string representing a comment.
@@ -117,7 +120,10 @@ public static CharArraySet getWordSet(Reader reader, String comment, CharArraySe
       String word = null;
       while ((word = br.readLine()) != null) {
         if (word.startsWith(comment) == false) {
-          result.add(word.trim());
+          word = word.trim();
+          // skip blank lines
+          if (word.isEmpty()) continue;
+          result.add(word);
         }
       }
     }

diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java b/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java
@@ -24,7 +24,7 @@
 public class TestWordlistLoader extends LuceneTestCase {
 
   public void testWordlistLoading() throws IOException {
-    String s = "ONE\n  two \nthree";
+    String s = "ONE\n  two \nthree\n\n";
     CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s));
     checkSet(wordSet1);
     CharArraySet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -35,7 +35,8 @@ Optimizations
     Bug Fixes
     ---------------------
-    (No changes)
+    * GITHUB#12291: Skip blank lines from stopwords list. (Jerry Chin)
     Other
     ---------------------
@@ Expand Down @@