mirror of https://github.com/apache/lucene.git
LUCENE-10349: Clean up WordlistLoader to use try-with-resources and make the default stop words unmodifiable (#577)
This commit is contained in:
parent
835e821287
commit
305d9ebb86
|
@ -55,6 +55,8 @@ API Changes
|
|||
are caller sensitive in Java 11). Instead add utility method IOUtils#requireResourceNonNull(T)
|
||||
to test existence of resource based on null return value. (Uwe Schindler, Dawid Weiss)
|
||||
|
||||
* LUCENE-10349: WordlistLoader methods that create their own result set now return unmodifiable CharArraySets; overloads that take a caller-supplied set still return that set. (Uwe Schindler)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
||||
|
@ -148,6 +150,9 @@ Bug Fixes
|
|||
* LUCENE-10236: Stop duplicating norms when scoring in CombinedFieldQuery.
|
||||
(Zach Chen, Jim Ferenczi, Julie Tibshirani)
|
||||
|
||||
* LUCENE-10349: Fix all analyzers to behave according to their documentation:
|
||||
getDefaultStopSet() methods now return unmodifiable CharArraySets. (Uwe Schindler)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -20,8 +20,8 @@ import java.io.IOException;
|
|||
import java.io.Reader;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
|
@ -80,24 +80,24 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
|
|||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET =
|
||||
WordlistLoader.getWordSet(
|
||||
IOUtils.getDecodingReader(
|
||||
IOUtils.requireResourceNonNull(
|
||||
JapaneseAnalyzer.class.getResourceAsStream("stopwords.txt"),
|
||||
"stopwords.txt"),
|
||||
StandardCharsets.UTF_8),
|
||||
"#",
|
||||
new CharArraySet(16, true)); // ignore case
|
||||
CharArraySet.unmodifiableSet(
|
||||
WordlistLoader.getWordSet(
|
||||
IOUtils.getDecodingReader(
|
||||
IOUtils.requireResourceNonNull(
|
||||
JapaneseAnalyzer.class.getResourceAsStream("stopwords.txt"),
|
||||
"stopwords.txt"),
|
||||
StandardCharsets.UTF_8),
|
||||
"#",
|
||||
new CharArraySet(16, true))); // ignore case
|
||||
final CharArraySet tagset =
|
||||
WordlistLoader.getWordSet(
|
||||
IOUtils.requireResourceNonNull(
|
||||
JapaneseAnalyzer.class.getResourceAsStream("stoptags.txt"), "stoptags.txt"),
|
||||
"#");
|
||||
DEFAULT_STOP_TAGS = new HashSet<>();
|
||||
for (Object element : tagset) {
|
||||
char[] chars = (char[]) element;
|
||||
DEFAULT_STOP_TAGS.add(new String(chars));
|
||||
}
|
||||
DEFAULT_STOP_TAGS =
|
||||
tagset.stream()
|
||||
.map(ca -> new String((char[]) ca))
|
||||
.collect(Collectors.toUnmodifiableSet());
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the distribution (JAR)
|
||||
throw new UncheckedIOException("Unable to load default stopword or stoptag set", ex);
|
||||
|
|
|
@ -50,15 +50,11 @@ public class WordlistLoader {
|
|||
* @return the given {@link CharArraySet} with the reader's words
|
||||
*/
|
||||
public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
|
||||
BufferedReader br = null;
|
||||
try {
|
||||
br = getBufferedReader(reader);
|
||||
try (BufferedReader br = getBufferedReader(reader)) {
|
||||
String word = null;
|
||||
while ((word = br.readLine()) != null) {
|
||||
result.add(word.trim());
|
||||
}
|
||||
} finally {
|
||||
IOUtils.close(br);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
@ -70,10 +66,11 @@ public class WordlistLoader {
|
|||
* StandardAnalyzer).
|
||||
*
|
||||
* @param reader Reader containing the wordlist
|
||||
* @return A {@link CharArraySet} with the reader's words
|
||||
* @return An unmodifiable {@link CharArraySet} with the reader's words
|
||||
*/
|
||||
public static CharArraySet getWordSet(Reader reader) throws IOException {
|
||||
return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
|
||||
return CharArraySet.unmodifiableSet(
|
||||
getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false)));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -83,7 +80,7 @@ public class WordlistLoader {
|
|||
* uses LowerCaseFilter (like StandardAnalyzer).
|
||||
*
|
||||
* @param stream InputStream containing the wordlist
|
||||
* @return A {@link CharArraySet} with the reader's words
|
||||
* @return An unmodifiable {@link CharArraySet} with the reader's words
|
||||
*/
|
||||
public static CharArraySet getWordSet(InputStream stream) throws IOException {
|
||||
return getWordSet(stream, StandardCharsets.UTF_8);
|
||||
|
@ -97,7 +94,7 @@ public class WordlistLoader {
|
|||
*
|
||||
* @param stream InputStream containing the wordlist
|
||||
* @param charset Charset of the wordlist
|
||||
* @return A {@link CharArraySet} with the reader's words
|
||||
* @return An unmodifiable {@link CharArraySet} with the reader's words
|
||||
*/
|
||||
public static CharArraySet getWordSet(InputStream stream, Charset charset) throws IOException {
|
||||
return getWordSet(IOUtils.getDecodingReader(stream, charset));
|
||||
|
@ -116,17 +113,13 @@ public class WordlistLoader {
|
|||
*/
|
||||
public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result)
|
||||
throws IOException {
|
||||
BufferedReader br = null;
|
||||
try {
|
||||
br = getBufferedReader(reader);
|
||||
try (BufferedReader br = getBufferedReader(reader)) {
|
||||
String word = null;
|
||||
while ((word = br.readLine()) != null) {
|
||||
if (word.startsWith(comment) == false) {
|
||||
result.add(word.trim());
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
IOUtils.close(br);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
@ -139,10 +132,11 @@ public class WordlistLoader {
|
|||
*
|
||||
* @param reader Reader containing the wordlist
|
||||
* @param comment The string representing a comment.
|
||||
* @return A CharArraySet with the reader's words
|
||||
* @return An unmodifiable CharArraySet with the reader's words
|
||||
*/
|
||||
public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
|
||||
return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false));
|
||||
return CharArraySet.unmodifiableSet(
|
||||
getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false)));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -153,7 +147,7 @@ public class WordlistLoader {
|
|||
*
|
||||
* @param stream InputStream in UTF-8 encoding containing the wordlist
|
||||
* @param comment The string representing a comment.
|
||||
* @return A CharArraySet with the reader's words
|
||||
* @return An unmodifiable CharArraySet with the reader's words
|
||||
*/
|
||||
public static CharArraySet getWordSet(InputStream stream, String comment) throws IOException {
|
||||
return getWordSet(stream, StandardCharsets.UTF_8, comment);
|
||||
|
@ -168,7 +162,7 @@ public class WordlistLoader {
|
|||
* @param stream InputStream containing the wordlist
|
||||
* @param charset Charset of the wordlist
|
||||
* @param comment The string representing a comment.
|
||||
* @return A CharArraySet with the reader's words
|
||||
* @return An unmodifiable CharArraySet with the reader's words
|
||||
*/
|
||||
public static CharArraySet getWordSet(InputStream stream, Charset charset, String comment)
|
||||
throws IOException {
|
||||
|
@ -192,9 +186,7 @@ public class WordlistLoader {
|
|||
*/
|
||||
public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
|
||||
throws IOException {
|
||||
BufferedReader br = null;
|
||||
try {
|
||||
br = getBufferedReader(reader);
|
||||
try (BufferedReader br = getBufferedReader(reader)) {
|
||||
String line = null;
|
||||
while ((line = br.readLine()) != null) {
|
||||
int comment = line.indexOf('|');
|
||||
|
@ -204,8 +196,6 @@ public class WordlistLoader {
|
|||
if (words[i].length() > 0) result.add(words[i]);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
IOUtils.close(br);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
@ -222,10 +212,11 @@ public class WordlistLoader {
|
|||
* </ul>
|
||||
*
|
||||
* @param reader Reader containing a Snowball stopword list
|
||||
* @return A {@link CharArraySet} with the reader's words
|
||||
* @return An unmodifiable {@link CharArraySet} with the reader's words
|
||||
*/
|
||||
public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
|
||||
return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
|
||||
return CharArraySet.unmodifiableSet(
|
||||
getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false)));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -240,7 +231,7 @@ public class WordlistLoader {
|
|||
* </ul>
|
||||
*
|
||||
* @param stream InputStream in UTF-8 encoding containing a Snowball stopword list
|
||||
* @return A {@link CharArraySet} with the reader's words
|
||||
* @return An unmodifiable {@link CharArraySet} with the reader's words
|
||||
*/
|
||||
public static CharArraySet getSnowballWordSet(InputStream stream) throws IOException {
|
||||
return getSnowballWordSet(stream, StandardCharsets.UTF_8);
|
||||
|
@ -259,7 +250,7 @@ public class WordlistLoader {
|
|||
*
|
||||
* @param stream InputStream containing a Snowball stopword list
|
||||
* @param charset Charset of the stopword list
|
||||
* @return A {@link CharArraySet} with the reader's words
|
||||
* @return An unmodifiable {@link CharArraySet} with the reader's words
|
||||
*/
|
||||
public static CharArraySet getSnowballWordSet(InputStream stream, Charset charset)
|
||||
throws IOException {
|
||||
|
@ -278,16 +269,12 @@ public class WordlistLoader {
|
|||
*/
|
||||
public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result)
|
||||
throws IOException {
|
||||
BufferedReader br = null;
|
||||
try {
|
||||
br = getBufferedReader(reader);
|
||||
try (BufferedReader br = getBufferedReader(reader)) {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
String[] wordstem = line.split("\t", 2);
|
||||
result.put(wordstem[0], wordstem[1]);
|
||||
}
|
||||
} finally {
|
||||
IOUtils.close(br);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
@ -302,12 +289,8 @@ public class WordlistLoader {
|
|||
* @throws IOException If there is a low-level I/O error.
|
||||
*/
|
||||
public static List<String> getLines(InputStream stream, Charset charset) throws IOException {
|
||||
BufferedReader input = null;
|
||||
ArrayList<String> lines;
|
||||
boolean success = false;
|
||||
try {
|
||||
input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
|
||||
|
||||
try (BufferedReader input = getBufferedReader(IOUtils.getDecodingReader(stream, charset))) {
|
||||
lines = new ArrayList<>();
|
||||
for (String word = null; (word = input.readLine()) != null; ) {
|
||||
// skip initial bom marker
|
||||
|
@ -320,14 +303,7 @@ public class WordlistLoader {
|
|||
if (word.length() == 0) continue;
|
||||
lines.add(word);
|
||||
}
|
||||
success = true;
|
||||
return lines;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(input);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(input);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue