diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c022616c709..4f981b13cc3 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -138,7 +138,8 @@ Optimizations Bug Fixes --------------------- -(No changes) + +* GITHUB#12291: Skip blank lines from stopwords list. (Jerry Chin) Other --------------------- diff --git a/lucene/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt b/lucene/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt index fb0d71ad7d2..65bcfd4e1b6 100644 --- a/lucene/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt +++ b/lucene/analysis/smartcn/src/resources/org/apache/lucene/analysis/cn/smart/stopwords.txt @@ -53,7 +53,5 @@ $ ● // the line below contains an IDEOGRAPHIC SPACE character (Used as a space in Chinese)   - //////////////// English Stop Words //////////////// - //////////////// Chinese Stop Words //////////////// diff --git a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java index 30ada92eb39..8e18f4ad76d 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java @@ -40,9 +40,9 @@ public class WordlistLoader { private WordlistLoader() {} /** - * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting leading - * and trailing whitespace). Every line of the Reader should contain only one word. The words need - * to be in lowercase if you make use of an Analyzer which uses LowerCaseFilter (like + * Reads lines from a Reader and adds every non-blank line as an entry to a CharArraySet (omitting + * leading and trailing whitespace). Every line of the Reader should contain only one word. The + * words need to be in lowercase if you make use of an Analyzer which uses LowerCaseFilter (like * StandardAnalyzer). * * @param reader Reader containing the wordlist @@ -53,7 +53,10 @@ public class WordlistLoader { try (BufferedReader br = getBufferedReader(reader)) { String word = null; while ((word = br.readLine()) != null) { - result.add(word.trim()); + word = word.trim(); + // skip blank lines + if (word.isEmpty()) continue; + result.add(word); } } return result; @@ -101,10 +104,10 @@ public class WordlistLoader { } /** - * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet - * (omitting leading and trailing whitespace). Every line of the Reader should contain only one - * word. The words need to be in lowercase if you make use of an Analyzer which uses - * LowerCaseFilter (like StandardAnalyzer). + * Reads lines from a Reader and adds every non-blank non-comment line as an entry to a + * CharArraySet (omitting leading and trailing whitespace). Every line of the Reader should + * contain only one word. The words need to be in lowercase if you make use of an Analyzer which + * uses LowerCaseFilter (like StandardAnalyzer). * * @param reader Reader containing the wordlist * @param comment The string representing a comment. @@ -117,7 +120,10 @@ public class WordlistLoader { String word = null; while ((word = br.readLine()) != null) { if (word.startsWith(comment) == false) { - result.add(word.trim()); + word = word.trim(); + // skip blank lines + if (word.isEmpty()) continue; + result.add(word); } } } diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java b/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java index 7af64c0011e..4747c86834e 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java @@ -24,7 +24,7 @@ import org.apache.lucene.tests.util.LuceneTestCase; public class TestWordlistLoader extends LuceneTestCase { public void testWordlistLoading() throws IOException { - String s = "ONE\n two \nthree"; + String s = "ONE\n two \nthree\n\n"; CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s)); checkSet(wordSet1); CharArraySet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));