LUCENE-10349: Cleanup WordlistLoader to use try-with-resources and make the default stop words unmodifiable (#577)

This commit is contained in:
Uwe Schindler 2022-01-03 15:07:44 +01:00 committed by GitHub
parent 835e821287
commit 305d9ebb86
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 39 additions and 58 deletions

View File

@ -55,6 +55,8 @@ API Changes
are caller sensitive in Java 11). Instead add utility method IOUtils#requireResourceNonNull(T)
to test existence of resource based on null return value. (Uwe Schindler, Dawid Weiss)
* LUCENE-10349: WordlistLoader methods now return unmodifiable CharArraySets. (Uwe Schindler)
New Features
---------------------
@ -148,6 +150,9 @@ Bug Fixes
* LUCENE-10236: Stop duplicating norms when scoring in CombinedFieldQuery.
(Zach Chen, Jim Ferenczi, Julie Tibshirani)
* LUCENE-10349: Fix all analyzers to behave according to their documentation:
getDefaultStopSet() methods now return unmodifiable CharArraySets. (Uwe Schindler)
Other
---------------------

View File

@ -20,8 +20,8 @@ import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
@ -80,24 +80,24 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET =
WordlistLoader.getWordSet(
IOUtils.getDecodingReader(
IOUtils.requireResourceNonNull(
JapaneseAnalyzer.class.getResourceAsStream("stopwords.txt"),
"stopwords.txt"),
StandardCharsets.UTF_8),
"#",
new CharArraySet(16, true)); // ignore case
CharArraySet.unmodifiableSet(
WordlistLoader.getWordSet(
IOUtils.getDecodingReader(
IOUtils.requireResourceNonNull(
JapaneseAnalyzer.class.getResourceAsStream("stopwords.txt"),
"stopwords.txt"),
StandardCharsets.UTF_8),
"#",
new CharArraySet(16, true))); // ignore case
final CharArraySet tagset =
WordlistLoader.getWordSet(
IOUtils.requireResourceNonNull(
JapaneseAnalyzer.class.getResourceAsStream("stoptags.txt"), "stoptags.txt"),
"#");
DEFAULT_STOP_TAGS = new HashSet<>();
for (Object element : tagset) {
char[] chars = (char[]) element;
DEFAULT_STOP_TAGS.add(new String(chars));
}
DEFAULT_STOP_TAGS =
tagset.stream()
.map(ca -> new String((char[]) ca))
.collect(Collectors.toUnmodifiableSet());
} catch (IOException ex) {
// default set should always be present as it is part of the distribution (JAR)
throw new UncheckedIOException("Unable to load default stopword or stoptag set", ex);

View File

@ -50,15 +50,11 @@ public class WordlistLoader {
* @return the given {@link CharArraySet} with the reader's words
*/
public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
BufferedReader br = null;
try {
br = getBufferedReader(reader);
try (BufferedReader br = getBufferedReader(reader)) {
String word = null;
while ((word = br.readLine()) != null) {
result.add(word.trim());
}
} finally {
IOUtils.close(br);
}
return result;
}
@ -70,10 +66,11 @@ public class WordlistLoader {
* StandardAnalyzer).
*
* @param reader Reader containing the wordlist
* @return A {@link CharArraySet} with the reader's words
* @return An unmodifiable {@link CharArraySet} with the reader's words
*/
public static CharArraySet getWordSet(Reader reader) throws IOException {
return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
return CharArraySet.unmodifiableSet(
getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false)));
}
/**
@ -83,7 +80,7 @@ public class WordlistLoader {
* uses LowerCaseFilter (like StandardAnalyzer).
*
* @param stream InputStream containing the wordlist
* @return A {@link CharArraySet} with the reader's words
* @return An unmodifiable {@link CharArraySet} with the reader's words
*/
public static CharArraySet getWordSet(InputStream stream) throws IOException {
return getWordSet(stream, StandardCharsets.UTF_8);
@ -97,7 +94,7 @@ public class WordlistLoader {
*
* @param stream InputStream containing the wordlist
* @param charset Charset of the wordlist
* @return A {@link CharArraySet} with the reader's words
* @return An unmodifiable {@link CharArraySet} with the reader's words
*/
public static CharArraySet getWordSet(InputStream stream, Charset charset) throws IOException {
return getWordSet(IOUtils.getDecodingReader(stream, charset));
@ -116,17 +113,13 @@ public class WordlistLoader {
*/
public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result)
throws IOException {
BufferedReader br = null;
try {
br = getBufferedReader(reader);
try (BufferedReader br = getBufferedReader(reader)) {
String word = null;
while ((word = br.readLine()) != null) {
if (word.startsWith(comment) == false) {
result.add(word.trim());
}
}
} finally {
IOUtils.close(br);
}
return result;
}
@ -139,10 +132,11 @@ public class WordlistLoader {
*
* @param reader Reader containing the wordlist
* @param comment The string representing a comment.
* @return A CharArraySet with the reader's words
* @return An unmodifiable CharArraySet with the reader's words
*/
public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false));
return CharArraySet.unmodifiableSet(
getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false)));
}
/**
@ -153,7 +147,7 @@ public class WordlistLoader {
*
* @param stream InputStream in UTF-8 encoding containing the wordlist
* @param comment The string representing a comment.
* @return A CharArraySet with the reader's words
* @return An unmodifiable CharArraySet with the reader's words
*/
public static CharArraySet getWordSet(InputStream stream, String comment) throws IOException {
return getWordSet(stream, StandardCharsets.UTF_8, comment);
@ -168,7 +162,7 @@ public class WordlistLoader {
* @param stream InputStream containing the wordlist
* @param charset Charset of the wordlist
* @param comment The string representing a comment.
* @return A CharArraySet with the reader's words
* @return An unmodifiable CharArraySet with the reader's words
*/
public static CharArraySet getWordSet(InputStream stream, Charset charset, String comment)
throws IOException {
@ -192,9 +186,7 @@ public class WordlistLoader {
*/
public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
throws IOException {
BufferedReader br = null;
try {
br = getBufferedReader(reader);
try (BufferedReader br = getBufferedReader(reader)) {
String line = null;
while ((line = br.readLine()) != null) {
int comment = line.indexOf('|');
@ -204,8 +196,6 @@ public class WordlistLoader {
if (words[i].length() > 0) result.add(words[i]);
}
}
} finally {
IOUtils.close(br);
}
return result;
}
@ -222,10 +212,11 @@ public class WordlistLoader {
* </ul>
*
* @param reader Reader containing a Snowball stopword list
* @return A {@link CharArraySet} with the reader's words
* @return An unmodifiable {@link CharArraySet} with the reader's words
*/
public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
return CharArraySet.unmodifiableSet(
getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false)));
}
/**
@ -240,7 +231,7 @@ public class WordlistLoader {
* </ul>
*
* @param stream InputStream in UTF-8 encoding containing a Snowball stopword list
* @return A {@link CharArraySet} with the reader's words
* @return An unmodifiable {@link CharArraySet} with the reader's words
*/
public static CharArraySet getSnowballWordSet(InputStream stream) throws IOException {
return getSnowballWordSet(stream, StandardCharsets.UTF_8);
@ -259,7 +250,7 @@ public class WordlistLoader {
*
* @param stream InputStream containing a Snowball stopword list
* @param charset Charset of the stopword list
* @return A {@link CharArraySet} with the reader's words
* @return An unmodifiable {@link CharArraySet} with the reader's words
*/
public static CharArraySet getSnowballWordSet(InputStream stream, Charset charset)
throws IOException {
@ -278,16 +269,12 @@ public class WordlistLoader {
*/
public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result)
throws IOException {
BufferedReader br = null;
try {
br = getBufferedReader(reader);
try (BufferedReader br = getBufferedReader(reader)) {
String line;
while ((line = br.readLine()) != null) {
String[] wordstem = line.split("\t", 2);
result.put(wordstem[0], wordstem[1]);
}
} finally {
IOUtils.close(br);
}
return result;
}
@ -302,12 +289,8 @@ public class WordlistLoader {
* @throws IOException If there is a low-level I/O error.
*/
public static List<String> getLines(InputStream stream, Charset charset) throws IOException {
BufferedReader input = null;
ArrayList<String> lines;
boolean success = false;
try {
input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
try (BufferedReader input = getBufferedReader(IOUtils.getDecodingReader(stream, charset))) {
lines = new ArrayList<>();
for (String word = null; (word = input.readLine()) != null; ) {
// skip initial bom marker
@ -320,14 +303,7 @@ public class WordlistLoader {
if (word.length() == 0) continue;
lines.add(word);
}
success = true;
return lines;
} finally {
if (success) {
IOUtils.close(input);
} else {
IOUtils.closeWhileHandlingException(input);
}
}
}