LUCENE-2564: Cut over WordlistLoader to CharArrayMap/Set and use CharsetDecoder to detect encoding problems early

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1200080 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2011-11-10 01:21:25 +00:00
parent c0c36d00d6
commit dc6b4b6533
27 changed files with 327 additions and 236 deletions

View File

@ -99,6 +99,11 @@ Changes in backwards compatibility policy
* LUCENE-3558: Moved NRTManager & NRTManagerReopenThread into lucene core
o.a.l.search. (Simon Willnauer)
* LUCENE-2564: WordlistLoader is now flagged as @lucene.internal. All methods in
WordlistLoader now return CharArraySet/Map and expect Reader instances for
efficiency. Utilities to open Readers from Files, InputStreams or Java
resources were added to IOUtils. (Simon Willnauer, Robert Muir)
New Features

View File

@ -17,15 +17,35 @@ package org.apache.lucene.util;
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.lang.reflect.Method;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
/** This class emulates the new Java 7 "Try-With-Resources" statement.
* Remove once Lucene is on Java 7.
* @lucene.internal */
public final class IOUtils {
/**
* UTF-8 charset string
* @see Charset#forName(String)
*/
public static final String UTF_8 = "UTF-8";
/**
* UTF-8 {@link Charset} instance to prevent repeated
* {@link Charset#forName(String)} lookups
*/
public static final Charset CHARSET_UTF_8 = Charset.forName("UTF-8");
private IOUtils() {} // no instance
/**
@ -220,5 +240,84 @@ public final class IOUtils {
}
}
}
/**
 * Wraps the given {@link InputStream} in a reader backed by a {@link CharsetDecoder}.
 * Unlike Java's defaults, this reader throws an exception if it detects that the
 * bytes being read do not match the expected {@link Charset}.
 * <p>
 * Decoding readers are useful for loading configuration files, stopword lists or
 * synonym files, where character-set problems should be detected early. They are
 * not recommended as general-purpose readers.
 *
 * @param stream the stream to wrap in a reader
 * @param charSet the expected charset
 * @return a wrapping reader
 */
public static Reader getDecodingReader(InputStream stream, Charset charSet) {
  // REPORT makes decoding fail loudly instead of silently replacing bad bytes.
  final CharsetDecoder decoder = charSet.newDecoder()
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT);
  return new BufferedReader(new InputStreamReader(stream, decoder));
}
/**
 * Opens a {@link Reader} for the given {@link File} using a {@link CharsetDecoder}.
 * Unlike Java's defaults, this reader throws an exception if it detects that the
 * bytes being read do not match the expected {@link Charset}.
 * <p>
 * Decoding readers are useful for loading configuration files, stopword lists or
 * synonym files, where character-set problems should be detected early. They are
 * not recommended as general-purpose readers.
 *
 * @param file the file to open a reader on
 * @param charSet the expected charset
 * @return a reader to read the given file
 */
public static Reader getDecodingReader(File file, Charset charSet) throws IOException {
  FileInputStream in = null;
  boolean opened = false;
  try {
    in = new FileInputStream(file);
    final Reader result = getDecodingReader(in, charSet);
    opened = true;
    return result;
  } finally {
    // Only close the stream on failure; on success the caller owns the reader.
    if (!opened) {
      IOUtils.close(in);
    }
  }
}
/**
 * Opens a {@link Reader} for the given resource using a {@link CharsetDecoder}.
 * Unlike Java's defaults, this reader throws an exception if it detects that the
 * bytes being read do not match the expected {@link Charset}.
 * <p>
 * Decoding readers are useful for loading configuration files, stopword lists or
 * synonym files, where character-set problems should be detected early. They are
 * not recommended as general-purpose readers.
 *
 * @param clazz the class used to locate the resource
 * @param resource the resource name to load
 * @param charSet the expected charset
 * @return a reader to read the given resource
 */
public static Reader getDecodingReader(Class<?> clazz, String resource, Charset charSet) throws IOException {
  InputStream in = null;
  boolean opened = false;
  try {
    // NOTE(review): getResourceAsStream returns null for a missing resource,
    // which would surface as an NPE from the reader — confirm callers rely on this.
    in = clazz.getResourceAsStream(resource);
    final Reader result = getDecodingReader(in, charSet);
    opened = true;
    return result;
  } finally {
    // Only close the stream on failure; on success the caller owns the reader.
    if (!opened) {
      IOUtils.close(in);
    }
  }
}
}

View File

@ -34,6 +34,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@ -64,9 +65,8 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = CharArraySet.unmodifiableSet(new CharArraySet(
Version.LUCENE_CURRENT, WordlistLoader.getWordSet(BrazilianAnalyzer.class,
DEFAULT_STOPWORD_FILE, "#"), false));
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -57,8 +57,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
"they", "this", "to", "was", "will", "with"
);
final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT,
stopWords.size(), false);
stopSet.addAll(stopWords);
stopWords, false);
ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
}
@ -82,7 +81,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion See <a href="#version">above</a>
* @param stopwordsFile File to load stop words from */
public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
this(matchVersion, WordlistLoader.getWordSet(stopwordsFile));
this(matchVersion, loadStopwordSet(stopwordsFile, matchVersion));
}
/** Builds an analyzer with the stop words from the given reader.
@ -90,7 +89,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
* @param matchVersion See <a href="#version">above</a>
* @param stopwords Reader to load stop words from */
public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
this(matchVersion, WordlistLoader.getWordSet(stopwords));
this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/**

View File

@ -28,6 +28,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.*;
@ -70,9 +71,8 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
Version.LUCENE_CURRENT, WordlistLoader.getWordSet(CzechAnalyzer.class,
DEFAULT_STOPWORD_FILE, "#"), false));
DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.DanishStemmer;
@ -62,8 +63,8 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -36,6 +36,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.German2Stemmer;
@ -100,8 +101,8 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
private static final Set<?> DEFAULT_SET;
static {
try {
DEFAULT_SET =
WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.SpanishStemmer;
@ -62,8 +63,8 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.FinnishStemmer;
@ -62,8 +63,8 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.IOException;
@ -118,8 +119,8 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
static final Set<?> DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET =
WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -32,6 +32,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@ -60,12 +61,12 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(GalicianAnalyzer.class,
DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(GalicianAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
throw new RuntimeException("Unable to load default stopword set", ex);
}
}
}

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.HungarianStemmer;
@ -62,8 +63,8 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -35,6 +35,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.ItalianStemmer;
@ -79,8 +80,8 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -27,11 +27,13 @@ import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@ -60,8 +62,8 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class,
DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(LatvianAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.File;
@ -83,8 +84,8 @@ public final class DutchAnalyzer extends Analyzer {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.NorwegianStemmer;
@ -62,8 +63,8 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.PortugueseStemmer;
@ -62,8 +63,8 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -34,6 +34,7 @@ import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@ -84,12 +85,12 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
static {
try {
DEFAULT_STOP_SET =
WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
throw new RuntimeException("Unable to load default stopword set", ex);
}
}
}

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.File;
@ -85,7 +86,7 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
* <a href="#version">above</a>}
* @param stopwords File to read stop words from */
public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException {
this(matchVersion, WordlistLoader.getWordSet(stopwords));
this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/** Builds an analyzer with the stop words from the given reader.
@ -94,7 +95,7 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
* <a href="#version">above</a>}
* @param stopwords Reader to read stop words from */
public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
this(matchVersion, WordlistLoader.getWordSet(stopwords));
this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/**

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.File;
@ -86,7 +87,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
* <a href="#version">above</a>}
* @param stopwords File to read stop words from */
public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
this(matchVersion, WordlistLoader.getWordSet(stopwords));
this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/** Builds an analyzer with the stop words from the given reader.
@ -95,7 +96,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
* <a href="#version">above</a>}
* @param stopwords Reader to read stop words from */
public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
this(matchVersion, WordlistLoader.getWordSet(stopwords));
this(matchVersion, loadStopwordSet(stopwords, matchVersion));
}
/**

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.SwedishStemmer;
@ -62,8 +63,8 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)

View File

@ -17,10 +17,13 @@
package org.apache.lucene.analysis.util;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
@ -93,11 +96,59 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
/**
 * Creates a CharArraySet from a UTF-8 encoded stopword resource on the
 * classpath, ignoring lines that start with the given comment string.
 *
 * @param ignoreCase whether the returned set should ignore case
 * @param aClass the analyzer class used to locate the resource
 * @param resource the resource name to load
 * @param comment the prefix marking comment lines to skip
 * @return a CharArraySet containing the stopwords from the resource
 * @throws IOException if reading the resource fails
 */
protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
    final Class<? extends Analyzer> aClass, final String resource,
    final String comment) throws IOException {
  Reader reader = null;
  try {
    reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource),
        IOUtils.CHARSET_UTF_8);
    final CharArraySet target = new CharArraySet(Version.LUCENE_31, 16, ignoreCase);
    return WordlistLoader.getWordSet(reader, comment, target);
  } finally {
    IOUtils.close(reader);
  }
}
/**
 * Creates a CharArraySet from a file, decoded as UTF-8.
 *
 * @param stopwords
 *          the stopwords file to load
 *
 * @param matchVersion
 *          the Lucene version for cross version compatibility
 * @return a CharArraySet containing the distinct stopwords from the given
 *         file
 * @throws IOException
 *           if loading the stopwords throws an {@link IOException}
 */
protected static CharArraySet loadStopwordSet(File stopwords,
    Version matchVersion) throws IOException {
  Reader in = null;
  try {
    in = IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8);
    return WordlistLoader.getWordSet(in, matchVersion);
  } finally {
    IOUtils.close(in);
  }
}
/**
 * Creates a CharArraySet from the contents of a Reader (one stopword per line).
 * <p>
 * Note: the given reader is always closed before this method returns, even if
 * reading fails.
 *
 * @param stopwords
 *          the stopwords reader to load
 *
 * @param matchVersion
 *          the Lucene version for cross version compatibility
 * @return a CharArraySet containing the distinct stopwords from the given
 *         reader
 * @throws IOException
 *           if loading the stopwords throws an {@link IOException}
 */
protected static CharArraySet loadStopwordSet(Reader stopwords,
    Version matchVersion) throws IOException {
  try {
    return WordlistLoader.getWordSet(stopwords, matchVersion);
  } finally {
    IOUtils.close(stopwords);
  }
}
}

View File

@ -18,165 +18,91 @@ package org.apache.lucene.analysis.util;
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
* Loader for text files that represent a list of stopwords.
*
* @see IOUtils to obtain {@link Reader} instances
* @lucene.internal
*/
public class WordlistLoader {
/**
* Loads a text file associated with a given class (See
* {@link Class#getResourceAsStream(String)}) and adds every line as an entry
* to a {@link Set} (omitting leading and trailing whitespace). Every line of
* the file should contain only one word. The words need to be in lower-case if
* you make use of an Analyzer which uses LowerCaseFilter (like
* StandardAnalyzer).
*
* @param aClass
* a class that is associated with the given stopwordResource
* @param stopwordResource
* name of the resource file associated with the given class
* @return a {@link Set} with the file's words
*/
public static Set<String> getWordSet(Class<?> aClass, String stopwordResource)
throws IOException {
final Reader reader = new BufferedReader(new InputStreamReader(aClass
.getResourceAsStream(stopwordResource), "UTF-8"));
try {
return getWordSet(reader);
} finally {
reader.close();
}
}
private static final int INITITAL_CAPACITY = 16;
/**
* Loads a text file associated with a given class (See
* {@link Class#getResourceAsStream(String)}) and adds every line as an entry
* to a {@link Set} (omitting leading and trailing whitespace). Every line of
* the file should contain only one word. The words need to be in lower-case if
* you make use of an Analyzer which uses LowerCaseFilter (like
* StandardAnalyzer).
*
* @param aClass
* a class that is associated with the given stopwordResource
* @param stopwordResource
* name of the resource file associated with the given class
* @param comment
* the comment string to ignore
* @return a {@link Set} with the file's words
*/
public static Set<String> getWordSet(Class<?> aClass,
String stopwordResource, String comment) throws IOException {
final Reader reader = new BufferedReader(new InputStreamReader(aClass
.getResourceAsStream(stopwordResource), "UTF-8"));
try {
return getWordSet(reader, comment);
} finally {
reader.close();
}
}
/**
* Loads a text file and adds every line as an entry to a HashSet (omitting
* leading and trailing whitespace). Every line of the file should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
* @param wordfile File containing the wordlist
* @return A HashSet with the file's words
*/
public static HashSet<String> getWordSet(File wordfile) throws IOException {
FileReader reader = null;
try {
reader = new FileReader(wordfile);
return getWordSet(reader);
}
finally {
if (reader != null)
reader.close();
}
}
/**
* Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
* leading and trailing whitespace). Every line of the file should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
* @param wordfile File containing the wordlist
* @param comment The comment string to ignore
* @return A HashSet with the file's words
*/
public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
FileReader reader = null;
try {
reader = new FileReader(wordfile);
return getWordSet(reader, comment);
}
finally {
if (reader != null)
reader.close();
}
}
/**
* Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
* Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
* leading and trailing whitespace). Every line of the Reader should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
* @param reader Reader containing the wordlist
* @return A HashSet with the reader's words
* @param result the {@link CharArraySet} to fill with the readers words
* @return the given {@link CharArraySet} with the reader's words
*/
public static HashSet<String> getWordSet(Reader reader) throws IOException {
final HashSet<String> result = new HashSet<String>();
public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
br = (BufferedReader) reader;
} else {
br = new BufferedReader(reader);
}
br = getBufferedReader(reader);
String word = null;
while ((word = br.readLine()) != null) {
result.add(word.trim());
}
}
finally {
if (br != null)
br.close();
IOUtils.close(br);
}
return result;
}
/**
 * Reads lines from a Reader and adds every line as an entry to a CharArraySet
 * (omitting leading and trailing whitespace). Every line of the Reader should
 * contain only one word. The words need to be in lowercase if you make use of
 * an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 *
 * @param reader Reader containing the wordlist
 * @param matchVersion the Lucene {@link Version}
 * @return A {@link CharArraySet} with the reader's words
 */
public static CharArraySet getWordSet(Reader reader, Version matchVersion) throws IOException {
  final CharArraySet target = new CharArraySet(matchVersion, INITITAL_CAPACITY, false);
  return getWordSet(reader, target);
}
/**
* Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
* Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
* leading and trailing whitespace). Every line of the Reader should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
* @param reader Reader containing the wordlist
* @param comment The string representing a comment.
* @return A HashSet with the reader's words
* @param matchVersion the Lucene {@link Version}
* @return A CharArraySet with the reader's words
*/
public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
final HashSet<String> result = new HashSet<String>();
public static CharArraySet getWordSet(Reader reader, String comment, Version matchVersion) throws IOException {
return getWordSet(reader, comment, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
}
/**
* Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
* leading and trailing whitespace). Every line of the Reader should contain only
* one word. The words need to be in lowercase if you make use of an
* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
*
* @param reader Reader containing the wordlist
* @param comment The string representing a comment.
* @param result the {@link CharArraySet} to fill with the readers words
* @return the given {@link CharArraySet} with the reader's words
*/
public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
br = (BufferedReader) reader;
} else {
br = new BufferedReader(reader);
}
br = getBufferedReader(reader);
String word = null;
while ((word = br.readLine()) != null) {
if (word.startsWith(comment) == false){
@ -185,33 +111,44 @@ public class WordlistLoader {
}
}
finally {
if (br != null)
br.close();
IOUtils.close(br);
}
return result;
}
/**
 * Reads stopwords from a stopword list in Snowball format.
 * <p>
 * The snowball format is the following:
 * <ul>
 * <li>Lines may contain multiple words separated by whitespace.
 * <li>The comment character is the vertical line (&#124;).
 * <li>Lines may contain trailing comments.
 * </ul>
 * </p>
 *
 * @param reader Reader containing a Snowball stopword list
 * @param result the {@link CharArraySet} to fill with the readers words
 * @return the given {@link CharArraySet} with the reader's words
 */
public static Set<String> getSnowballWordSet(Class<?> aClass,
String stopwordResource) throws IOException {
final Reader reader = new BufferedReader(new InputStreamReader(aClass
.getResourceAsStream(stopwordResource), "UTF-8"));
public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
throws IOException {
BufferedReader br = null;
try {
return getSnowballWordSet(reader);
br = getBufferedReader(reader);
String line = null;
while ((line = br.readLine()) != null) {
int comment = line.indexOf('|');
if (comment >= 0) line = line.substring(0, comment);
String words[] = line.split("\\s+");
for (int i = 0; i < words.length; i++)
if (words[i].length() > 0) result.add(words[i]);
}
} finally {
reader.close();
IOUtils.close(br);
}
return result;
}
/**
 * Reads stopwords from a stopword list in Snowball format.
 * <p>
 * The snowball format is the following:
 * <ul>
 * <li>Lines may contain multiple words separated by whitespace.
 * <li>The comment character is the vertical line (&#124;).
 * <li>Lines may contain trailing comments.
 * </ul>
 * </p>
 *
 * @param reader Reader containing a Snowball stopword list
 * @param matchVersion the Lucene {@link Version}
 * @return A {@link CharArraySet} with the reader's words
 */
public static Set<String> getSnowballWordSet(Reader reader)
throws IOException {
final Set<String> result = new HashSet<String>();
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
br = (BufferedReader) reader;
} else {
br = new BufferedReader(reader);
}
String line = null;
while ((line = br.readLine()) != null) {
int comment = line.indexOf('|');
if (comment >= 0) line = line.substring(0, comment);
String words[] = line.split("\\s+");
for (int i = 0; i < words.length; i++)
if (words[i].length() > 0) result.add(words[i]);
}
} finally {
if (br != null) br.close();
}
return result;
public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion) throws IOException {
return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
}
@ -261,24 +180,24 @@ public class WordlistLoader {
* @return stem dictionary that overrules the stemming algorithm
* @throws IOException
*/
public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
if (wordstemfile == null)
throw new NullPointerException("wordstemfile may not be null");
final HashMap<String, String> result = new HashMap<String,String>();
public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException {
BufferedReader br = null;
try {
br = new BufferedReader(new FileReader(wordstemfile));
br = getBufferedReader(reader);
String line;
while ((line = br.readLine()) != null) {
String[] wordstem = line.split("\t", 2);
result.put(wordstem[0], wordstem[1]);
}
} finally {
if(br != null)
br.close();
IOUtils.close(br);
}
return result;
}
/** Returns the reader itself if it is already buffered, otherwise wraps it in a {@link BufferedReader}. */
private static BufferedReader getBufferedReader(Reader reader) {
  if (reader instanceof BufferedReader) {
    return (BufferedReader) reader;
  }
  return new BufferedReader(reader);
}
}

View File

@ -46,7 +46,7 @@ public class TestCharArraySet extends LuceneTestCase {
public void testNonZeroOffset() {
String[] words={"Hello","World","this","is","a","test"};
char[] findme="xthisy".toCharArray();
CharArraySet set=new CharArraySet(TEST_VERSION_CURRENT, 10,true);
CharArraySet set= new CharArraySet(TEST_VERSION_CURRENT, 10, true);
set.addAll(Arrays.asList(words));
assertTrue(set.contains(findme, 1, 4));
assertTrue(set.contains(new String(findme,1,4)));

View File

@ -20,8 +20,6 @@ package org.apache.lucene.analysis.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.util.LuceneTestCase;
@ -31,22 +29,22 @@ public class TestWordlistLoader extends LuceneTestCase {
// Loads a small word list both from a plain Reader and from an already-buffered
// Reader; both paths must yield the same set. (Removed the stale HashSet-returning
// calls left over from the diff rendering — they no longer compile.)
public void testWordlistLoading() throws IOException {
  String s = "ONE\n two \nthree";
  CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), TEST_VERSION_CURRENT);
  checkSet(wordSet1);
  CharArraySet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)), TEST_VERSION_CURRENT);
  checkSet(wordSet2);
}
// Lines starting with the comment prefix ("#") must be skipped entirely —
// neither the raw line nor the stripped text may end up in the set.
public void testComments() throws Exception {
  String s = "ONE\n two \nthree\n#comment";
  CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#", TEST_VERSION_CURRENT);
  checkSet(wordSet1);
  assertFalse(wordSet1.contains("#comment"));
  assertFalse(wordSet1.contains("comment"));
}
private void checkSet(HashSet<String> wordset) {
private void checkSet(CharArraySet wordset) {
assertEquals(3, wordset.size());
assertTrue(wordset.contains("ONE")); // case is not modified
assertTrue(wordset.contains("two")); // surrounding whitespace is removed
@ -68,7 +66,7 @@ public class TestWordlistLoader extends LuceneTestCase {
" two \n" + // stopword with leading/trailing space
" three four five \n" + // multiple stopwords
"six seven | comment\n"; //multiple stopwords + comment
Set<String> wordset = WordlistLoader.getSnowballWordSet(new StringReader(s));
CharArraySet wordset = WordlistLoader.getSnowballWordSet(new StringReader(s), TEST_VERSION_CURRENT);
assertEquals(7, wordset.size());
assertTrue(wordset.contains("ONE"));
assertTrue(wordset.contains("two"));

View File

@ -26,6 +26,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@ -66,7 +67,7 @@ public final class SmartChineseAnalyzer extends Analyzer {
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
public static Set<String> getDefaultStopSet(){
public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
@ -75,7 +76,7 @@ public final class SmartChineseAnalyzer extends Analyzer {
* accesses the static final set the first time.;
*/
private static class DefaultSetHolder {
static final Set<String> DEFAULT_STOP_SET;
static final CharArraySet DEFAULT_STOP_SET;
static {
try {
@ -87,13 +88,14 @@ public final class SmartChineseAnalyzer extends Analyzer {
}
}
static Set<String> loadDefaultStopWordSet() throws IOException {
static CharArraySet loadDefaultStopWordSet() throws IOException {
InputStream stream = SmartChineseAnalyzer.class
.getResourceAsStream(DEFAULT_STOPWORD_FILE);
try {
InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
// make sure it is unmodifiable as we expose it in the outer class
return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, STOPWORD_FILE_COMMENT));
return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(reader,
STOPWORD_FILE_COMMENT, Version.LUCENE_CURRENT));
} finally {
stream.close();
}

View File

@ -34,6 +34,7 @@ import org.apache.lucene.analysis.stempel.StempelFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import org.egothor.stemmer.Trie;
@ -68,8 +69,8 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(PolishAnalyzer.class,
DEFAULT_STOPWORD_FILE);
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(PolishAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)