mirror of https://github.com/apache/lucene.git
LUCENE-2564: Cut over WordListLoader to CharArrayMap/Set and use CharSetDecoder to detect encoding problems early
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1200080 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c0c36d00d6
commit
dc6b4b6533
@@ -100,6 +100,11 @@ Changes in backwards compatibility policy
 * LUCENE-3558: Moved NRTManager & NRTManagerReopenThread into lucene core
   o.a.l.search. (Simon Willnauer)
 
+* LUCENE-2564: WordlistLoader is now flagged as @lucene.internal. All methods in
+  WordlistLoader now return CharArraySet/Map and expect Reader instances for
+  efficiency. Utilities to open Readers from Files, InputStreams or Java
+  resources were added to IOUtils. (Simon Willnauer, Robert Muir)
+
 New Features
 
 * LUCENE-1824: Add BoundaryScanner interface and its implementation classes,

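For context, a minimal sketch of the new call shape described in the CHANGES entry above; the class and resource name here are illustrative assumptions, not part of the commit:

import java.io.IOException;

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

public class StopwordLoadingExample {
  // Loads a '#'-commented stopword list from the classpath as a CharArraySet,
  // failing fast if the resource is not valid UTF-8.
  public static CharArraySet load() throws IOException {
    return WordlistLoader.getWordSet(
        IOUtils.getDecodingReader(StopwordLoadingExample.class, "stopwords.txt",
            IOUtils.CHARSET_UTF_8),
        "#",                     // comment prefix to skip
        Version.LUCENE_CURRENT); // version-dependent CharArraySet behavior
  }
}
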
@@ -17,15 +17,35 @@ package org.apache.lucene.util;
  * limitations under the License.
  */
 
+import java.io.BufferedReader;
 import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
 import java.lang.reflect.Method;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 
 /** This class emulates the new Java 7 "Try-With-Resources" statement.
  * Remove once Lucene is on Java 7.
  * @lucene.internal */
 public final class IOUtils {
 
+  /**
+   * UTF-8 charset string
+   * @see Charset#forName(String)
+   */
+  public static final String UTF_8 = "UTF-8";
+
+  /**
+   * UTF-8 {@link Charset} instance to prevent repeated
+   * {@link Charset#forName(String)} lookups
+   */
+  public static final Charset CHARSET_UTF_8 = Charset.forName("UTF-8");
+
   private IOUtils() {} // no instance
 
   /**
@@ -221,4 +241,83 @@ public final class IOUtils {
     }
   }
 
+  /**
+   * Wraps the given {@link InputStream} in a reader using a {@link CharsetDecoder}.
+   * Unlike Java's defaults, this reader throws an exception if it detects that the
+   * bytes read don't match the expected {@link Charset}.
+   * <p>
+   * Decoding readers are useful for loading configuration files, stopword lists or
+   * synonym files, where character set problems should surface early. They are not
+   * recommended as general purpose readers.
+   *
+   * @param stream the stream to wrap in a reader
+   * @param charSet the expected charset
+   * @return a wrapping reader
+   */
+  public static Reader getDecodingReader(InputStream stream, Charset charSet) {
+    final CharsetDecoder charSetDecoder = charSet.newDecoder()
+        .onMalformedInput(CodingErrorAction.REPORT)
+        .onUnmappableCharacter(CodingErrorAction.REPORT);
+    return new BufferedReader(new InputStreamReader(stream, charSetDecoder));
+  }
+
+  /**
+   * Opens a Reader for the given {@link File} using a {@link CharsetDecoder}.
+   * Unlike Java's defaults, this reader throws an exception if it detects that the
+   * bytes read don't match the expected {@link Charset}.
+   * <p>
+   * Decoding readers are useful for loading configuration files, stopword lists or
+   * synonym files, where character set problems should surface early. They are not
+   * recommended as general purpose readers.
+   *
+   * @param file the file to open a reader on
+   * @param charSet the expected charset
+   * @return a reader to read the given file
+   */
+  public static Reader getDecodingReader(File file, Charset charSet) throws IOException {
+    FileInputStream stream = null;
+    boolean success = false;
+    try {
+      stream = new FileInputStream(file);
+      final Reader reader = getDecodingReader(stream, charSet);
+      success = true;
+      return reader;
+    } finally {
+      if (!success) {
+        IOUtils.close(stream);
+      }
+    }
+  }
+
+  /**
+   * Opens a Reader for the given resource using a {@link CharsetDecoder}.
+   * Unlike Java's defaults, this reader throws an exception if it detects that the
+   * bytes read don't match the expected {@link Charset}.
+   * <p>
+   * Decoding readers are useful for loading configuration files, stopword lists or
+   * synonym files, where character set problems should surface early. They are not
+   * recommended as general purpose readers.
+   *
+   * @param clazz the class used to locate the resource
+   * @param resource the resource name to load
+   * @param charSet the expected charset
+   * @return a reader to read the given resource
+   */
+  public static Reader getDecodingReader(Class<?> clazz, String resource, Charset charSet) throws IOException {
+    InputStream stream = null;
+    boolean success = false;
+    try {
+      stream = clazz.getResourceAsStream(resource);
+      final Reader reader = getDecodingReader(stream, charSet);
+      success = true;
+      return reader;
+    } finally {
+      if (!success) {
+        IOUtils.close(stream);
+      }
+    }
+  }
+
 }

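A hedged demo of the fail-fast behavior these methods buy; this class is not part of the commit and only exercises the API added above:

import java.io.ByteArrayInputStream;
import java.io.Reader;
import java.nio.charset.MalformedInputException;

import org.apache.lucene.util.IOUtils;

public class DecodingReaderDemo {
  public static void main(String[] args) throws Exception {
    // 0xE9 is 'e-acute' in ISO-8859-1 but an invalid byte sequence in UTF-8.
    byte[] latin1Bytes = {'c', 'a', 'f', (byte) 0xE9};
    Reader reader = IOUtils.getDecodingReader(
        new ByteArrayInputStream(latin1Bytes), IOUtils.CHARSET_UTF_8);
    try {
      while (reader.read() != -1) { /* drain the reader */ }
    } catch (MalformedInputException e) {
      // A plain InputStreamReader("UTF-8") would silently decode this to "caf\uFFFD";
      // the REPORT error actions make the problem visible immediately.
      System.out.println("encoding problem detected early: " + e);
    } finally {
      reader.close();
    }
  }
}
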
@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 /**
@@ -64,9 +65,8 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
 
     static {
       try {
-        DEFAULT_STOP_SET = CharArraySet.unmodifiableSet(new CharArraySet(
-            Version.LUCENE_CURRENT, WordlistLoader.getWordSet(BrazilianAnalyzer.class,
-                DEFAULT_STOPWORD_FILE, "#"), false));
+        DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)

@@ -57,8 +57,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
       "they", "this", "to", "was", "will", "with"
     );
     final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT,
-        stopWords.size(), false);
-    stopSet.addAll(stopWords);
+        stopWords, false);
     ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
   }
 
@@ -82,7 +81,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
    * @param matchVersion See <a href="#version">above</a>
    * @param stopwordsFile File to load stop words from */
   public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
-    this(matchVersion, WordlistLoader.getWordSet(stopwordsFile));
+    this(matchVersion, loadStopwordSet(stopwordsFile, matchVersion));
   }
 
   /** Builds an analyzer with the stop words from the given reader.
@@ -90,7 +89,7 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
    * @param matchVersion See <a href="#version">above</a>
    * @param stopwords Reader to load stop words from */
   public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
-    this(matchVersion, WordlistLoader.getWordSet(stopwords));
+    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
   }
 
   /**

@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 import java.io.*;
@@ -70,9 +71,8 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
 
     static {
       try {
-        DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
-            Version.LUCENE_CURRENT, WordlistLoader.getWordSet(CzechAnalyzer.class,
-                DEFAULT_STOPWORD_FILE, "#"), false));
+        DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.DanishStemmer;
 
@@ -62,8 +63,8 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
 
     static {
       try {
-        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)

@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.German2Stemmer;
 
@@ -100,8 +101,8 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
   private static final Set<?> DEFAULT_SET;
   static {
     try {
-      DEFAULT_SET =
-          WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+      DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+          DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
     } catch (IOException ex) {
       // default set should always be present as it is part of the
       // distribution (JAR)

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.SpanishStemmer;
 
@@ -62,8 +63,8 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
 
     static {
       try {
-        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.FinnishStemmer;
 
@@ -62,8 +63,8 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
 
     static {
       try {
-        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)

@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 import java.io.IOException;
@@ -118,8 +119,8 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
   static final Set<?> DEFAULT_STOP_SET;
   static {
     try {
-      DEFAULT_STOP_SET =
-          WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+      DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+          DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
     } catch (IOException ex) {
       // default set should always be present as it is part of the
      // distribution (JAR)

@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 /**
@@ -60,12 +61,12 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
 
     static {
      try {
-        DEFAULT_STOP_SET = WordlistLoader.getWordSet(GalicianAnalyzer.class,
-            DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(GalicianAnalyzer.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
-        throw new RuntimeException("Unable to load default stopword set");
+        throw new RuntimeException("Unable to load default stopword set", ex);
       }
     }
   }

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.HungarianStemmer;
 
@@ -62,8 +63,8 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
 
     static {
       try {
-        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)

@@ -35,6 +35,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.ItalianStemmer;
 
@@ -79,8 +80,8 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
 
     static {
       try {
-        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)

@@ -27,11 +27,13 @@ import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 /**
@@ -60,8 +62,8 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
 
     static {
       try {
-        DEFAULT_STOP_SET = WordlistLoader.getWordSet(LatvianAnalyzer.class,
-            DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(LatvianAnalyzer.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)

@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 import java.io.File;
@@ -83,8 +84,8 @@ public final class DutchAnalyzer extends Analyzer {
 
     static {
      try {
-        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.NorwegianStemmer;
 
@@ -62,8 +63,8 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
 
     static {
       try {
-        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.PortugueseStemmer;
 
@@ -62,8 +63,8 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
 
     static {
       try {
-        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)

@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 /**
@@ -84,12 +85,12 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
 
     static {
       try {
-        DEFAULT_STOP_SET =
-            WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
-        throw new RuntimeException("Unable to load default stopword set");
+        throw new RuntimeException("Unable to load default stopword set", ex);
       }
     }
   }

@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 import java.io.File;
@@ -85,7 +86,7 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
    *          <a href="#version">above</a>}
    * @param stopwords File to read stop words from */
   public ClassicAnalyzer(Version matchVersion, File stopwords) throws IOException {
-    this(matchVersion, WordlistLoader.getWordSet(stopwords));
+    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
   }
 
   /** Builds an analyzer with the stop words from the given reader.
@@ -94,7 +95,7 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
    *          <a href="#version">above</a>}
    * @param stopwords Reader to read stop words from */
   public ClassicAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
-    this(matchVersion, WordlistLoader.getWordSet(stopwords));
+    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
   }
 
   /**

@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 import java.io.File;
@@ -86,7 +87,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
    *          <a href="#version">above</a>}
    * @param stopwords File to read stop words from */
   public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException {
-    this(matchVersion, WordlistLoader.getWordSet(stopwords));
+    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
   }
 
   /** Builds an analyzer with the stop words from the given reader.
@@ -95,7 +96,7 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
    *          <a href="#version">above</a>}
    * @param stopwords Reader to read stop words from */
   public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
-    this(matchVersion, WordlistLoader.getWordSet(stopwords));
+    this(matchVersion, loadStopwordSet(stopwords, matchVersion));
   }
 
   /**

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 import org.tartarus.snowball.ext.SwedishStemmer;
 
@@ -62,8 +63,8 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
 
     static {
       try {
-        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
-            DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)

@@ -17,10 +17,13 @@
 
 package org.apache.lucene.analysis.util;
 
+import java.io.File;
 import java.io.IOException;
+import java.io.Reader;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 
 /**
@@ -93,11 +96,59 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
   protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
       final Class<? extends Analyzer> aClass, final String resource,
       final String comment) throws IOException {
-    final Set<String> wordSet = WordlistLoader.getWordSet(aClass, resource,
-        comment);
-    final CharArraySet set = new CharArraySet(Version.LUCENE_31, wordSet.size(), ignoreCase);
-    set.addAll(wordSet);
-    return set;
+    Reader reader = null;
+    try {
+      reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), IOUtils.CHARSET_UTF_8);
+      return WordlistLoader.getWordSet(reader, comment, new CharArraySet(Version.LUCENE_31, 16, ignoreCase));
+    } finally {
+      IOUtils.close(reader);
+    }
   }
 
+  /**
+   * Creates a CharArraySet from a file.
+   *
+   * @param stopwords
+   *          the stopwords file to load
+   * @param matchVersion
+   *          the Lucene version for cross version compatibility
+   * @return a CharArraySet containing the distinct stopwords from the given
+   *         file
+   * @throws IOException
+   *           if loading the stopwords throws an {@link IOException}
+   */
+  protected static CharArraySet loadStopwordSet(File stopwords,
+      Version matchVersion) throws IOException {
+    Reader reader = null;
+    try {
+      reader = IOUtils.getDecodingReader(stopwords, IOUtils.CHARSET_UTF_8);
+      return WordlistLoader.getWordSet(reader, matchVersion);
+    } finally {
+      IOUtils.close(reader);
+    }
+  }
+
+  /**
+   * Creates a CharArraySet from a reader.
+   *
+   * @param stopwords
+   *          the stopwords reader to load
+   * @param matchVersion
+   *          the Lucene version for cross version compatibility
+   * @return a CharArraySet containing the distinct stopwords from the given
+   *         reader
+   * @throws IOException
+   *           if loading the stopwords throws an {@link IOException}
+   */
+  protected static CharArraySet loadStopwordSet(Reader stopwords,
+      Version matchVersion) throws IOException {
+    try {
+      return WordlistLoader.getWordSet(stopwords, matchVersion);
+    } finally {
+      IOUtils.close(stopwords);
+    }
+  }
 }

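As a usage sketch, a hypothetical subclass wiring these helpers up; the tokenizer choice and resource name are assumptions for illustration, not part of the commit:

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;

// Hypothetical analyzer (not in this commit): loads its stopwords through the
// new decoding-reader path and fails fast if the bundled file is not valid UTF-8.
public final class SimpleStopAnalyzer extends StopwordAnalyzerBase {
  private static final String DEFAULT_STOPWORD_FILE = "simple_stop.txt"; // assumed resource

  public SimpleStopAnalyzer(Version matchVersion) throws IOException {
    // loadStopwordSet(ignoreCase, class, resource, comment) from the hunk above
    super(matchVersion, loadStopwordSet(false, SimpleStopAnalyzer.class,
        DEFAULT_STOPWORD_FILE, "#"));
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new LowerCaseTokenizer(matchVersion, reader);
    return new TokenStreamComponents(source,
        new StopFilter(matchVersion, source, stopwords));
  }
}
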
@@ -18,165 +18,91 @@ package org.apache.lucene.analysis.util;
  */
 
 import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
 import java.io.IOException;
-import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Set;
+
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
 
 /**
  * Loader for text files that represent a list of stopwords.
+ *
+ * @see IOUtils to obtain {@link Reader} instances
+ * @lucene.internal
  */
 public class WordlistLoader {
 
-  /**
-   * Loads a text file associated with a given class (See
-   * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
-   * to a {@link Set} (omitting leading and trailing whitespace). Every line of
-   * the file should contain only one word. The words need to be in lower-case if
-   * you make use of an Analyzer which uses LowerCaseFilter (like
-   * StandardAnalyzer).
-   *
-   * @param aClass
-   *          a class that is associated with the given stopwordResource
-   * @param stopwordResource
-   *          name of the resource file associated with the given class
-   * @return a {@link Set} with the file's words
-   */
-  public static Set<String> getWordSet(Class<?> aClass, String stopwordResource)
-      throws IOException {
-    final Reader reader = new BufferedReader(new InputStreamReader(aClass
-        .getResourceAsStream(stopwordResource), "UTF-8"));
-    try {
-      return getWordSet(reader);
-    } finally {
-      reader.close();
-    }
-  }
-
-  /**
-   * Loads a text file associated with a given class (See
-   * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
-   * to a {@link Set} (omitting leading and trailing whitespace). Every line of
-   * the file should contain only one word. The words need to be in lower-case if
-   * you make use of an Analyzer which uses LowerCaseFilter (like
-   * StandardAnalyzer).
-   *
-   * @param aClass
-   *          a class that is associated with the given stopwordResource
-   * @param stopwordResource
-   *          name of the resource file associated with the given class
-   * @param comment
-   *          the comment string to ignore
-   * @return a {@link Set} with the file's words
-   */
-  public static Set<String> getWordSet(Class<?> aClass,
-      String stopwordResource, String comment) throws IOException {
-    final Reader reader = new BufferedReader(new InputStreamReader(aClass
-        .getResourceAsStream(stopwordResource), "UTF-8"));
-    try {
-      return getWordSet(reader, comment);
-    } finally {
-      reader.close();
-    }
-  }
-
-  /**
-   * Loads a text file and adds every line as an entry to a HashSet (omitting
-   * leading and trailing whitespace). Every line of the file should contain only
-   * one word. The words need to be in lowercase if you make use of an
-   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
-   *
-   * @param wordfile File containing the wordlist
-   * @return A HashSet with the file's words
-   */
-  public static HashSet<String> getWordSet(File wordfile) throws IOException {
-    FileReader reader = null;
-    try {
-      reader = new FileReader(wordfile);
-      return getWordSet(reader);
-    } finally {
-      if (reader != null)
-        reader.close();
-    }
-  }
-
-  /**
-   * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
-   * leading and trailing whitespace). Every line of the file should contain only
-   * one word. The words need to be in lowercase if you make use of an
-   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
-   *
-   * @param wordfile File containing the wordlist
-   * @param comment The comment string to ignore
-   * @return A HashSet with the file's words
-   */
-  public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
-    FileReader reader = null;
-    try {
-      reader = new FileReader(wordfile);
-      return getWordSet(reader, comment);
-    } finally {
-      if (reader != null)
-        reader.close();
-    }
-  }
+  private static final int INITITAL_CAPACITY = 16;
 
   /**
-   * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
+   * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
    * leading and trailing whitespace). Every line of the Reader should contain only
    * one word. The words need to be in lowercase if you make use of an
    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
    *
    * @param reader Reader containing the wordlist
-   * @return A HashSet with the reader's words
+   * @param result the {@link CharArraySet} to fill with the reader's words
+   * @return the given {@link CharArraySet} with the reader's words
    */
-  public static HashSet<String> getWordSet(Reader reader) throws IOException {
-    final HashSet<String> result = new HashSet<String>();
+  public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
     BufferedReader br = null;
     try {
-      if (reader instanceof BufferedReader) {
-        br = (BufferedReader) reader;
-      } else {
-        br = new BufferedReader(reader);
-      }
+      br = getBufferedReader(reader);
       String word = null;
       while ((word = br.readLine()) != null) {
         result.add(word.trim());
       }
     } finally {
-      if (br != null)
-        br.close();
+      IOUtils.close(br);
     }
     return result;
   }
 
+  /**
+   * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
+   * leading and trailing whitespace). Every line of the Reader should contain only
+   * one word. The words need to be in lowercase if you make use of an
+   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+   *
+   * @param reader Reader containing the wordlist
+   * @param matchVersion the Lucene {@link Version}
+   * @return A {@link CharArraySet} with the reader's words
+   */
+  public static CharArraySet getWordSet(Reader reader, Version matchVersion) throws IOException {
+    return getWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
+  }
+
+  /**
+   * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
+   * leading and trailing whitespace). Every line of the Reader should contain only
+   * one word. The words need to be in lowercase if you make use of an
+   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+   *
+   * @param reader Reader containing the wordlist
+   * @param comment The string representing a comment.
+   * @param matchVersion the Lucene {@link Version}
+   * @return A CharArraySet with the reader's words
+   */
+  public static CharArraySet getWordSet(Reader reader, String comment, Version matchVersion) throws IOException {
+    return getWordSet(reader, comment, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
+  }
+
   /**
-   * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
+   * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
    * leading and trailing whitespace). Every line of the Reader should contain only
    * one word. The words need to be in lowercase if you make use of an
    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
    *
    * @param reader Reader containing the wordlist
    * @param comment The string representing a comment.
-   * @return A HashSet with the reader's words
+   * @param result the {@link CharArraySet} to fill with the reader's words
+   * @return the given {@link CharArraySet} with the reader's words
    */
-  public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
-    final HashSet<String> result = new HashSet<String>();
+  public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
     BufferedReader br = null;
     try {
-      if (reader instanceof BufferedReader) {
-        br = (BufferedReader) reader;
-      } else {
-        br = new BufferedReader(reader);
-      }
+      br = getBufferedReader(reader);
       String word = null;
       while ((word = br.readLine()) != null) {
         if (word.startsWith(comment) == false){
@@ -185,33 +111,44 @@ public class WordlistLoader {
         }
       }
     } finally {
-      if (br != null)
-        br.close();
+      IOUtils.close(br);
     }
     return result;
   }
 
   /**
-   * Loads a text file in Snowball format associated with a given class (See
-   * {@link Class#getResourceAsStream(String)}) and adds all words as entries to
-   * a {@link Set}. The words need to be in lower-case if you make use of an
-   * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+   * Reads stopwords from a stopword list in Snowball format.
+   * <p>
+   * The snowball format is the following:
+   * <ul>
+   * <li>Lines may contain multiple words separated by whitespace.
+   * <li>The comment character is the vertical line (|).
+   * <li>Lines may contain trailing comments.
+   * </ul>
+   * </p>
    *
-   * @param aClass a class that is associated with the given stopwordResource
-   * @param stopwordResource name of the resource file associated with the given
-   *        class
-   * @return a {@link Set} with the file's words
-   * @see #getSnowballWordSet(Reader)
+   * @param reader Reader containing a Snowball stopword list
+   * @param result the {@link CharArraySet} to fill with the reader's words
+   * @return the given {@link CharArraySet} with the reader's words
    */
-  public static Set<String> getSnowballWordSet(Class<?> aClass,
-      String stopwordResource) throws IOException {
-    final Reader reader = new BufferedReader(new InputStreamReader(aClass
-        .getResourceAsStream(stopwordResource), "UTF-8"));
+  public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
+      throws IOException {
+    BufferedReader br = null;
     try {
-      return getSnowballWordSet(reader);
-    } finally {
-      reader.close();
+      br = getBufferedReader(reader);
+      String line = null;
+      while ((line = br.readLine()) != null) {
+        int comment = line.indexOf('|');
+        if (comment >= 0) line = line.substring(0, comment);
+        String words[] = line.split("\\s+");
+        for (int i = 0; i < words.length; i++)
+          if (words[i].length() > 0) result.add(words[i]);
+      }
+    } finally {
+      IOUtils.close(br);
     }
+    return result;
   }
 
   /**
@@ -226,30 +163,12 @@ public class WordlistLoader {
    * </p>
    *
    * @param reader Reader containing a Snowball stopword list
-   * @return A Set with the reader's words
+   * @param matchVersion the Lucene {@link Version}
+   * @return A {@link CharArraySet} with the reader's words
    */
-  public static Set<String> getSnowballWordSet(Reader reader)
-      throws IOException {
-    final Set<String> result = new HashSet<String>();
-    BufferedReader br = null;
-    try {
-      if (reader instanceof BufferedReader) {
-        br = (BufferedReader) reader;
-      } else {
-        br = new BufferedReader(reader);
-      }
-      String line = null;
-      while ((line = br.readLine()) != null) {
-        int comment = line.indexOf('|');
-        if (comment >= 0) line = line.substring(0, comment);
-        String words[] = line.split("\\s+");
-        for (int i = 0; i < words.length; i++)
-          if (words[i].length() > 0) result.add(words[i]);
-      }
-    } finally {
-      if (br != null) br.close();
-    }
-    return result;
+  public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion) throws IOException {
+    return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITITAL_CAPACITY, false));
   }
 
@@ -261,24 +180,24 @@ public class WordlistLoader {
    * @return stem dictionary that overrules the stemming algorithm
    * @throws IOException
    */
-  public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
-    if (wordstemfile == null)
-      throw new NullPointerException("wordstemfile may not be null");
-    final HashMap<String, String> result = new HashMap<String,String>();
+  public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException {
    BufferedReader br = null;
    try {
-      br = new BufferedReader(new FileReader(wordstemfile));
+      br = getBufferedReader(reader);
      String line;
      while ((line = br.readLine()) != null) {
        String[] wordstem = line.split("\t", 2);
        result.put(wordstem[0], wordstem[1]);
      }
    } finally {
-      if(br != null)
-        br.close();
+      IOUtils.close(br);
    }
    return result;
  }
 
+  private static BufferedReader getBufferedReader(Reader reader) {
+    return (reader instanceof BufferedReader) ? (BufferedReader) reader
+        : new BufferedReader(reader);
+  }
+
 }

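To make the Snowball list format concrete, a hedged sketch against the API above; the list contents are invented for illustration:

import java.io.StringReader;

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version;

public class SnowballListDemo {
  public static void main(String[] args) throws Exception {
    // Multiple words per line; '|' starts a comment (whole-line or trailing).
    String snowball =
        "i me my | first person pronouns\n" +
        "| this whole line is a comment\n" +
        "you your\n";
    CharArraySet words = WordlistLoader.getSnowballWordSet(
        new StringReader(snowball), Version.LUCENE_CURRENT);
    System.out.println(words.contains("me"));   // true
    System.out.println(words.contains("this")); // false: only in a comment
  }
}
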
@@ -46,7 +46,7 @@ public class TestCharArraySet extends LuceneTestCase {
   public void testNonZeroOffset() {
     String[] words={"Hello","World","this","is","a","test"};
     char[] findme="xthisy".toCharArray();
-    CharArraySet set=new CharArraySet(TEST_VERSION_CURRENT, 10,true);
+    CharArraySet set= new CharArraySet(TEST_VERSION_CURRENT, 10, true);
     set.addAll(Arrays.asList(words));
     assertTrue(set.contains(findme, 1, 4));
     assertTrue(set.contains(new String(findme,1,4)));

@@ -20,8 +20,6 @@ package org.apache.lucene.analysis.util;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.StringReader;
-import java.util.HashSet;
-import java.util.Set;
 
 import org.apache.lucene.util.LuceneTestCase;
 
@@ -31,22 +29,22 @@ public class TestWordlistLoader extends LuceneTestCase {
 
   public void testWordlistLoading() throws IOException {
     String s = "ONE\n two \nthree";
-    HashSet<String> wordSet1 = WordlistLoader.getWordSet(new StringReader(s));
+    CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), TEST_VERSION_CURRENT);
     checkSet(wordSet1);
-    HashSet<String> wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)));
+    CharArraySet wordSet2 = WordlistLoader.getWordSet(new BufferedReader(new StringReader(s)), TEST_VERSION_CURRENT);
     checkSet(wordSet2);
   }
 
   public void testComments() throws Exception {
     String s = "ONE\n two \nthree\n#comment";
-    HashSet<String> wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#");
+    CharArraySet wordSet1 = WordlistLoader.getWordSet(new StringReader(s), "#", TEST_VERSION_CURRENT);
     checkSet(wordSet1);
     assertFalse(wordSet1.contains("#comment"));
     assertFalse(wordSet1.contains("comment"));
   }
 
-  private void checkSet(HashSet<String> wordset) {
+  private void checkSet(CharArraySet wordset) {
     assertEquals(3, wordset.size());
     assertTrue(wordset.contains("ONE")); // case is not modified
     assertTrue(wordset.contains("two")); // surrounding whitespace is removed
@@ -68,7 +66,7 @@ public class TestWordlistLoader extends LuceneTestCase {
       " two \n" + // stopword with leading/trailing space
       " three four five \n" + // multiple stopwords
       "six seven | comment\n"; //multiple stopwords + comment
-    Set<String> wordset = WordlistLoader.getSnowballWordSet(new StringReader(s));
+    CharArraySet wordset = WordlistLoader.getSnowballWordSet(new StringReader(s), TEST_VERSION_CURRENT);
     assertEquals(7, wordset.size());
     assertTrue(wordset.contains("ONE"));
     assertTrue(wordset.contains("two"));

@@ -26,6 +26,7 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -66,7 +67,7 @@ public final class SmartChineseAnalyzer extends Analyzer {
    * Returns an unmodifiable instance of the default stop-words set.
    * @return an unmodifiable instance of the default stop-words set.
    */
-  public static Set<String> getDefaultStopSet(){
+  public static CharArraySet getDefaultStopSet(){
     return DefaultSetHolder.DEFAULT_STOP_SET;
   }
 
@@ -75,7 +76,7 @@ public final class SmartChineseAnalyzer extends Analyzer {
    * accesses the static final set the first time.
    */
  private static class DefaultSetHolder {
-    static final Set<String> DEFAULT_STOP_SET;
+    static final CharArraySet DEFAULT_STOP_SET;
 
     static {
       try {
@@ -87,13 +88,14 @@ public final class SmartChineseAnalyzer extends Analyzer {
       }
     }
 
-    static Set<String> loadDefaultStopWordSet() throws IOException {
+    static CharArraySet loadDefaultStopWordSet() throws IOException {
       InputStream stream = SmartChineseAnalyzer.class
           .getResourceAsStream(DEFAULT_STOPWORD_FILE);
       try {
         InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
         // make sure it is unmodifiable as we expose it in the outer class
-        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, STOPWORD_FILE_COMMENT));
+        return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(reader,
+            STOPWORD_FILE_COMMENT, Version.LUCENE_CURRENT));
       } finally {
         stream.close();
       }

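The DefaultSetHolder pattern above is the standard lazy initialization-on-demand holder idiom; a hedged, stripped-down sketch of its general shape (not code from this commit):

// The JVM loads HolderDemo without touching Holder; Holder (and the expensive
// resource) is initialized, thread-safely, only on the first getResource() call.
public class HolderDemo {
  private HolderDemo() {}

  private static class Holder {
    static final String RESOURCE = expensiveLoad();
  }

  private static String expensiveLoad() {
    return "loaded"; // stands in for loading a stopword file from the JAR
  }

  public static String getResource() {
    return Holder.RESOURCE;
  }
}
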
@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.stempel.StempelFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
 import org.egothor.stemmer.Trie;
 
@@ -68,8 +69,8 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
 
     static {
       try {
-        DEFAULT_STOP_SET = WordlistLoader.getWordSet(PolishAnalyzer.class,
-            DEFAULT_STOPWORD_FILE);
+        DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(PolishAnalyzer.class,
+            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
        // distribution (JAR)