diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 634eb978377..2c879192c50 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -23,6 +23,8 @@ API Changes are caller sensitive in Java 11). Instead add utility method IOUtils#requireResourceNonNull(T) to test existence of resource based on null return value. (Uwe Schindler, Dawid Weiss) +* LUCENE-10349: WordlistLoader methods now return unmodifiable CharArraySets. (Uwe Schindler) + New Features --------------------- @@ -111,6 +113,9 @@ Bug Fixes * LUCENE-10279: Fix equals in MultiRangeQuery. (Ignacio Vera) +* LUCENE-10349: Fix all analyzers to behave according to their documentation: + getDefaultStopSet() methods now return unmodifiable CharArraySets. (Uwe Schindler) + Other --------------------- diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java index 8c79a2d3d76..9411902a582 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java @@ -20,8 +20,8 @@ import java.io.IOException; import java.io.Reader; import java.io.UncheckedIOException; import java.nio.charset.StandardCharsets; -import java.util.HashSet; import java.util.Set; +import java.util.stream.Collectors; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; @@ -80,24 +80,24 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase { static { try { DEFAULT_STOP_SET = - WordlistLoader.getWordSet( - IOUtils.getDecodingReader( - IOUtils.requireResourceNonNull( - JapaneseAnalyzer.class.getResourceAsStream("stopwords.txt"), - "stopwords.txt"), - StandardCharsets.UTF_8), - "#", - new CharArraySet(16, true)); // ignore case + CharArraySet.unmodifiableSet( + WordlistLoader.getWordSet( + 
IOUtils.getDecodingReader( + IOUtils.requireResourceNonNull( + JapaneseAnalyzer.class.getResourceAsStream("stopwords.txt"), + "stopwords.txt"), + StandardCharsets.UTF_8), + "#", + new CharArraySet(16, true))); // ignore case final CharArraySet tagset = WordlistLoader.getWordSet( IOUtils.requireResourceNonNull( JapaneseAnalyzer.class.getResourceAsStream("stoptags.txt"), "stoptags.txt"), "#"); - DEFAULT_STOP_TAGS = new HashSet<>(); - for (Object element : tagset) { - char[] chars = (char[]) element; - DEFAULT_STOP_TAGS.add(new String(chars)); - } + DEFAULT_STOP_TAGS = + tagset.stream() + .map(ca -> new String((char[]) ca)) + .collect(Collectors.toUnmodifiableSet()); } catch (IOException ex) { // default set should always be present as it is part of the distribution (JAR) throw new UncheckedIOException("Unable to load default stopword or stoptag set", ex); diff --git a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java index 7437fc1d984..30ada92eb39 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java @@ -50,15 +50,11 @@ public class WordlistLoader { * @return the given {@link CharArraySet} with the reader's words */ public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException { - BufferedReader br = null; - try { - br = getBufferedReader(reader); + try (BufferedReader br = getBufferedReader(reader)) { String word = null; while ((word = br.readLine()) != null) { result.add(word.trim()); } - } finally { - IOUtils.close(br); } return result; } @@ -70,10 +66,11 @@ public class WordlistLoader { * StandardAnalyzer). 
* * @param reader Reader containing the wordlist - * @return A {@link CharArraySet} with the reader's words + * @return An unmodifiable {@link CharArraySet} with the reader's words */ public static CharArraySet getWordSet(Reader reader) throws IOException { - return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false)); + return CharArraySet.unmodifiableSet( + getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false))); } /** @@ -83,7 +80,7 @@ public class WordlistLoader { * uses LowerCaseFilter (like StandardAnalyzer). * * @param stream InputStream containing the wordlist - * @return A {@link CharArraySet} with the reader's words + * @return An unmodifiable {@link CharArraySet} with the reader's words */ public static CharArraySet getWordSet(InputStream stream) throws IOException { return getWordSet(stream, StandardCharsets.UTF_8); @@ -97,7 +94,7 @@ public class WordlistLoader { * * @param stream InputStream containing the wordlist * @param charset Charset of the wordlist - * @return A {@link CharArraySet} with the reader's words + * @return An unmodifiable {@link CharArraySet} with the reader's words */ public static CharArraySet getWordSet(InputStream stream, Charset charset) throws IOException { return getWordSet(IOUtils.getDecodingReader(stream, charset)); @@ -116,17 +113,13 @@ public class WordlistLoader { */ public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException { - BufferedReader br = null; - try { - br = getBufferedReader(reader); + try (BufferedReader br = getBufferedReader(reader)) { String word = null; while ((word = br.readLine()) != null) { if (word.startsWith(comment) == false) { result.add(word.trim()); } } - } finally { - IOUtils.close(br); } return result; } @@ -139,10 +132,11 @@ public class WordlistLoader { * * @param reader Reader containing the wordlist * @param comment The string representing a comment. 
- * @return A CharArraySet with the reader's words + * @return An unmodifiable CharArraySet with the reader's words */ public static CharArraySet getWordSet(Reader reader, String comment) throws IOException { - return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false)); + return CharArraySet.unmodifiableSet( + getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false))); } /** @@ -153,7 +147,7 @@ public class WordlistLoader { * * @param stream InputStream in UTF-8 encoding containing the wordlist * @param comment The string representing a comment. - * @return A CharArraySet with the reader's words + * @return An unmodifiable CharArraySet with the reader's words */ public static CharArraySet getWordSet(InputStream stream, String comment) throws IOException { return getWordSet(stream, StandardCharsets.UTF_8, comment); @@ -168,7 +162,7 @@ public class WordlistLoader { * @param stream InputStream containing the wordlist * @param charset Charset of the wordlist * @param comment The string representing a comment. 
- * @return A CharArraySet with the reader's words + * @return An unmodifiable CharArraySet with the reader's words */ public static CharArraySet getWordSet(InputStream stream, Charset charset, String comment) throws IOException { @@ -192,9 +186,7 @@ public class WordlistLoader { */ public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result) throws IOException { - BufferedReader br = null; - try { - br = getBufferedReader(reader); + try (BufferedReader br = getBufferedReader(reader)) { String line = null; while ((line = br.readLine()) != null) { int comment = line.indexOf('|'); @@ -204,8 +196,6 @@ public class WordlistLoader { if (words[i].length() > 0) result.add(words[i]); } } - } finally { - IOUtils.close(br); } return result; } @@ -222,10 +212,11 @@ public class WordlistLoader { * * * @param reader Reader containing a Snowball stopword list - * @return A {@link CharArraySet} with the reader's words + * @return An unmodifiable {@link CharArraySet} with the reader's words */ public static CharArraySet getSnowballWordSet(Reader reader) throws IOException { - return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false)); + return CharArraySet.unmodifiableSet( + getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false))); } /** @@ -240,7 +231,7 @@ public class WordlistLoader { * * * @param stream InputStream in UTF-8 encoding containing a Snowball stopword list - * @return A {@link CharArraySet} with the reader's words + * @return An unmodifiable {@link CharArraySet} with the reader's words */ public static CharArraySet getSnowballWordSet(InputStream stream) throws IOException { return getSnowballWordSet(stream, StandardCharsets.UTF_8); @@ -259,7 +250,7 @@ public class WordlistLoader { * * @param stream InputStream containing a Snowball stopword list * @param charset Charset of the stopword list - * @return A {@link CharArraySet} with the reader's words + * @return An unmodifiable {@link CharArraySet} with the reader's 
words */ public static CharArraySet getSnowballWordSet(InputStream stream, Charset charset) throws IOException { @@ -278,16 +269,12 @@ public class WordlistLoader { */ public static CharArrayMap getStemDict(Reader reader, CharArrayMap result) throws IOException { - BufferedReader br = null; - try { - br = getBufferedReader(reader); + try (BufferedReader br = getBufferedReader(reader)) { String line; while ((line = br.readLine()) != null) { String[] wordstem = line.split("\t", 2); result.put(wordstem[0], wordstem[1]); } - } finally { - IOUtils.close(br); } return result; } @@ -302,12 +289,8 @@ public class WordlistLoader { * @throws IOException If there is a low-level I/O error. */ public static List getLines(InputStream stream, Charset charset) throws IOException { - BufferedReader input = null; ArrayList lines; - boolean success = false; - try { - input = getBufferedReader(IOUtils.getDecodingReader(stream, charset)); - + try (BufferedReader input = getBufferedReader(IOUtils.getDecodingReader(stream, charset))) { lines = new ArrayList<>(); for (String word = null; (word = input.readLine()) != null; ) { // skip initial bom marker @@ -320,14 +303,7 @@ public class WordlistLoader { if (word.length() == 0) continue; lines.add(word); } - success = true; return lines; - } finally { - if (success) { - IOUtils.close(input); - } else { - IOUtils.closeWhileHandlingException(input); - } } }