From a94983686982747059e2a8a668747e5e80e849f6 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Sun, 3 Jan 2010 08:48:17 +0000
Subject: [PATCH] LUCENE-2034: Refactor analyzer reuse and stopword handling
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@895339 13f79535-47bb-0310-9956-ffa450edef68
---
CHANGES.txt | 6 +
.../lucene/analysis/ar/ArabicAnalyzer.java | 88 ++--------
.../lucene/analysis/bg/BulgarianAnalyzer.java | 93 +++-------
.../lucene/analysis/br/BrazilianAnalyzer.java | 84 +++------
.../lucene/analysis/cjk/CJKAnalyzer.java | 60 +------
.../lucene/analysis/cjk/CJKTokenizer.java | 2 -
.../lucene/analysis/cn/ChineseAnalyzer.java | 59 ++-----
.../lucene/analysis/cz/CzechAnalyzer.java | 67 ++-----
.../lucene/analysis/de/GermanAnalyzer.java | 69 ++------
.../lucene/analysis/el/GreekAnalyzer.java | 70 ++------
.../lucene/analysis/fa/PersianAnalyzer.java | 84 ++-------
.../lucene/analysis/fr/FrenchAnalyzer.java | 62 ++-----
.../lucene/analysis/ru/RussianAnalyzer.java | 64 ++-----
.../lucene/analysis/th/ThaiAnalyzer.java | 53 +++---
.../analysis/ar/TestArabicAnalyzer.java | 10 +-
.../analysis/br/TestBrazilianStemmer.java | 6 +-
.../analysis/fa/TestPersianAnalyzer.java | 3 -
.../lucene/analysis/ReusableAnalyzerBase.java | 163 ++++++++++++++++++
.../lucene/analysis/SimpleAnalyzer.java | 18 +-
.../apache/lucene/analysis/StopAnalyzer.java | 51 ++----
.../lucene/analysis/StopwordAnalyzerBase.java | 110 ++++++++++++
.../lucene/analysis/WhitespaceAnalyzer.java | 18 +-
.../lucene/analysis/WordlistLoader.java | 79 +++++++--
.../apache/lucene/index/wordliststopwords.txt | 5 +
.../index/wordliststopwords_nocomment.txt | 3 +
25 files changed, 584 insertions(+), 743 deletions(-)
create mode 100644 src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java
create mode 100644 src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
create mode 100644 src/test/org/apache/lucene/index/wordliststopwords.txt
create mode 100644 src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt
diff --git a/CHANGES.txt b/CHANGES.txt
index 02c2b97e7ec..a93a8c4ba0e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -105,6 +105,12 @@ New features
backwards compatibility. If Version < 3.1 is passed to the constructor,
LowerCaseFilter yields the old behavior. (Simon Willnauer, Robert Muir)
+* LUCENE-2034: Added ReusableAnalyzerBase, an abstract subclass of Analyzer
+ that makes it easier to reuse TokenStreams correctly. This issue also added
+ StopwordAnalyzerBase, which improves consistency of all Analyzers that use
+ stopwords, and converted many analyzers in contrib to use it.
+ (Simon Willnauer via Robert Muir)
+
Optimizations
* LUCENE-2086: When resolving deleted terms, do so in term sort order
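As a quick illustration of the contract the LUCENE-2034 entry above describes: a subclass now implements createComponents(String, Reader) exactly once, and the base class takes care of caching and reusing the chain. The sketch below is hypothetical (MyAnalyzer is not part of this patch) but uses only the signatures this patch introduces plus the existing core WhitespaceTokenizer and LowerCaseFilter:

    import java.io.Reader;

    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.ReusableAnalyzerBase;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;

    public final class MyAnalyzer extends ReusableAnalyzerBase {
      private final Version matchVersion;

      public MyAnalyzer(Version matchVersion) {
        this.matchVersion = matchVersion;
      }

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // Build the chain once; ReusableAnalyzerBase caches these components
        // per thread and resets them on each reusableTokenStream() call.
        final Tokenizer source = new WhitespaceTokenizer(reader);
        return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, source));
      }
    }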
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
index e4036f8873e..aad6aa3a004 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
@@ -19,17 +19,15 @@ package org.apache.lucene.analysis.ar;
import java.io.File;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.Collections;
import java.util.Hashtable;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@@ -52,7 +50,7 @@ import org.apache.lucene.util.Version;
*
*
*/
-public final class ArabicAnalyzer extends Analyzer {
+public final class ArabicAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Arabic stopwords.
@@ -62,21 +60,18 @@ public final class ArabicAnalyzer extends Analyzer {
*/
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
- /**
- * Contains the stopwords used with the StopFilter.
- */
- private final Set<?> stoptable;
/**
* The comment character in the stopwords file. All lines prefixed with this will be ignored
* @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
*/
+ // TODO make this private
public static final String STOPWORDS_COMMENT = "#";
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
- public static Set<String> getDefaultStopSet(){
+ public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
@@ -85,34 +80,19 @@ public final class ArabicAnalyzer extends Analyzer {
* accesses the static final set the first time.
*/
private static class DefaultSetHolder {
- static final Set<String> DEFAULT_STOP_SET;
+ static final Set<?> DEFAULT_STOP_SET;
static {
try {
- DEFAULT_STOP_SET = loadDefaultStopWordSet();
+ DEFAULT_STOP_SET = loadStopwordSet(false, ArabicAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
}
}
-
- static Set<String> loadDefaultStopWordSet() throws IOException {
- InputStream stream = ArabicAnalyzer.class
- .getResourceAsStream(DEFAULT_STOPWORD_FILE);
- try {
- InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
- // make sure it is unmodifiable as we expose it in the outer class
- return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
- STOPWORDS_COMMENT));
- } finally {
- stream.close();
- }
- }
}
- private final Version matchVersion;
-
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
@@ -129,8 +109,7 @@ public final class ArabicAnalyzer extends Analyzer {
* a stopword set
*/
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
- stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
@@ -159,54 +138,21 @@ public final class ArabicAnalyzer extends Analyzer {
/**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
*
- * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
+ * @return {@link TokenStreamComponents} built from an {@link ArabicLetterTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
* and {@link ArabicStemFilter}.
*/
@Override
- public final TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new ArabicLetterTokenizer( reader );
- result = new LowerCaseFilter(matchVersion, result);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new ArabicLetterTokenizer(reader);
+ TokenStream result = new LowerCaseFilter(matchVersion, source);
// the order here is important: the stopword list is not normalized!
- result = new StopFilter( matchVersion, result, stoptable );
- result = new ArabicNormalizationFilter( result );
- result = new ArabicStemFilter( result );
-
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
- * and {@link ArabicStemFilter}.
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new ArabicLetterTokenizer(reader);
- streams.result = new LowerCaseFilter(matchVersion, streams.source);
- // the order here is important: the stopword list is not normalized!
- streams.result = new StopFilter( matchVersion, streams.result, stoptable);
- streams.result = new ArabicNormalizationFilter(streams.result);
- streams.result = new ArabicStemFilter(streams.result);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ result = new StopFilter( matchVersion, result, stopwords);
+ result = new ArabicNormalizationFilter(result);
+ return new TokenStreamComponents(source, new ArabicStemFilter(result));
}
}
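The net effect for callers is an unchanged API with guaranteed-correct reuse: reusableTokenStream() always hands back the same per-thread component chain, re-pointed at the new Reader, rather than relying on each analyzer getting the SavedStreams dance right. A small illustrative check (ReuseDemo is hypothetical; the expected output follows from the per-thread caching that ReusableAnalyzerBase builds on via getPreviousTokenStream):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.ar.ArabicAnalyzer;
    import org.apache.lucene.util.Version;

    public class ReuseDemo {
      public static void main(String[] args) throws IOException {
        ArabicAnalyzer analyzer = new ArabicAnalyzer(Version.LUCENE_CURRENT);
        TokenStream first = analyzer.reusableTokenStream("body", new StringReader("first doc"));
        TokenStream second = analyzer.reusableTokenStream("body", new StringReader("second doc"));
        // Same filter chain instance, reset and re-pointed at the new Reader,
        // instead of a freshly allocated chain per call.
        System.out.println(first == second); // expected: true (within one thread)
      }
    }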
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
index 3e93e832dae..7abacf169e4 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
@@ -17,17 +17,16 @@ package org.apache.lucene.analysis.bg;
* limitations under the License.
*/
+import java.io.File;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.Collections;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@@ -43,7 +42,7 @@ import org.apache.lucene.util.Version;
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
*
*/
-public final class BulgarianAnalyzer extends Analyzer {
+public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Bulgarian stopwords.
@@ -54,14 +53,12 @@ public final class BulgarianAnalyzer extends Analyzer {
*/
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
- /**
- * Contains the stopwords used with the StopFilter.
- */
- private final Set<?> stoptable;
/**
* The comment character in the stopwords file. All lines prefixed with this
* will be ignored
+ * @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
*/
+ //TODO make this private
public static final String STOPWORDS_COMMENT = "#";
/**
@@ -69,7 +66,7 @@ public final class BulgarianAnalyzer extends Analyzer {
*
* @return an unmodifiable instance of the default stop-words set.
*/
- public static Set<String> getDefaultStopSet() {
+ public static Set<?> getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
@@ -78,35 +75,19 @@ public final class BulgarianAnalyzer extends Analyzer {
* class accesses the static final set the first time.
*/
private static class DefaultSetHolder {
- static final Set<String> DEFAULT_STOP_SET;
+ static final Set<?> DEFAULT_STOP_SET;
static {
try {
- DEFAULT_STOP_SET = loadDefaultStopWordSet();
- } catch (Exception ex) {
+ DEFAULT_STOP_SET = loadStopwordSet(false, BulgarianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+ } catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
- throw new RuntimeException("Unable to load default stopword set", ex);
- }
- }
-
- static Set<String> loadDefaultStopWordSet() throws IOException {
- final InputStream stream = BulgarianAnalyzer.class
- .getResourceAsStream(DEFAULT_STOPWORD_FILE);
- try {
- InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
- // make sure it is unmodifiable as we expose it in the outer class
- return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
- STOPWORDS_COMMENT));
- } finally {
- if(stream != null)
- stream.close();
+ throw new RuntimeException("Unable to load default stopword set");
}
}
}
-
- private final Version matchVersion;
-
+
/**
* Builds an analyzer with the default stop words:
* {@link #DEFAULT_STOPWORD_FILE}.
@@ -119,58 +100,24 @@ public final class BulgarianAnalyzer extends Analyzer {
* Builds an analyzer with the given stop words.
*/
public BulgarianAnalyzer(Version matchVersion, Set<?> stopwords) {
- super();
- stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
- stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
* {@link Reader}.
*
- * @return A {@link TokenStream} built from an {@link StandardTokenizer}
+ * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, and {@link BulgarianStemFilter}.
*/
@Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new StandardTokenizer(matchVersion, reader);
- result = new StandardFilter(result);
+ public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(matchVersion, result);
- result = new StopFilter(matchVersion, result, stoptable);
+ result = new StopFilter(matchVersion, result, stopwords);
result = new BulgarianStemFilter(result);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
- * text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from an {@link StandardTokenizer}
- * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
- * {@link StopFilter}, and {@link BulgarianStemFilter}.
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new StandardFilter(streams.source);
- streams.result = new LowerCaseFilter(matchVersion, streams.result);
- streams.result = new StopFilter(matchVersion, streams.result, stoptable);
- streams.result = new BulgarianStemFilter(streams.result);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ return new TokenStreamComponents(source, result);
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
index 44299f639f9..3230ec293e0 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
@@ -21,19 +21,21 @@ import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
+import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
-import java.util.Collections;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
@@ -49,7 +51,7 @@ import org.apache.lucene.util.Version;
*
* NOTE: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.
*/
-public final class BrazilianAnalyzer extends Analyzer {
+public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
/**
* List of typical Brazilian Portuguese stopwords.
@@ -91,19 +93,13 @@ public final class BrazilianAnalyzer extends Analyzer {
Arrays.asList(BRAZILIAN_STOP_WORDS), false));
}
- /**
- * Contains the stopwords used with the {@link StopFilter}.
- */
- private final Set<?> stoptable;
-
+
/**
* Contains words that should be indexed but not stemmed.
*/
// TODO make this private in 3.1
private Set<?> excltable = Collections.emptySet();
- private final Version matchVersion;
-
/**
* Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
*/
@@ -120,8 +116,7 @@ public final class BrazilianAnalyzer extends Analyzer {
* a stopword set
*/
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
- stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
@@ -188,53 +183,22 @@ public final class BrazilianAnalyzer extends Analyzer {
excltable = WordlistLoader.getWordSet( exclusionlist );
setPreviousTokenStream(null); // force a new stemmer to be created
}
-
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
- * {@link BrazilianStemFilter}.
- */
- @Override
- public final TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new StandardTokenizer( matchVersion, reader );
- result = new LowerCaseFilter( matchVersion, result );
- result = new StandardFilter( result );
- result = new StopFilter( matchVersion, result, stoptable );
- result = new BrazilianStemFilter( result, excltable );
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
- * {@link BrazilianStemFilter}.
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new LowerCaseFilter(matchVersion, streams.source);
- streams.result = new StandardFilter(streams.result);
- streams.result = new StopFilter(matchVersion, streams.result, stoptable);
- streams.result = new BrazilianStemFilter(streams.result, excltable);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
- }
+ /**
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
+ *
+ * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer} filtered with
+ * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
+ * {@link BrazilianStemFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new LowerCaseFilter(matchVersion, source);
+ result = new StandardFilter(result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ return new TokenStreamComponents(source, new BrazilianStemFilter(result,
+ excltable));
+ }
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
index 1ab7c228fa1..dc2df4c2d38 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
@@ -19,12 +19,12 @@ package org.apache.lucene.analysis.cjk;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;
-import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Set;
@@ -35,7 +35,7 @@ import java.util.Set;
* filters with {@link StopFilter}
*
*/
-public final class CJKAnalyzer extends Analyzer {
+public final class CJKAnalyzer extends StopwordAnalyzerBase {
//~ Static fields/initializers ---------------------------------------------
/**
@@ -71,11 +71,6 @@ public final class CJKAnalyzer extends Analyzer {
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS),
false));
}
- /**
- * stop word list
- */
- private final Set<?> stopTable;
- private final Version matchVersion;
//~ Constructors -----------------------------------------------------------
@@ -95,8 +90,7 @@ public final class CJKAnalyzer extends Analyzer {
* a stopword set
*/
public CJKAnalyzer(Version matchVersion, Set<?> stopwords){
- stopTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
@@ -106,51 +100,15 @@ public final class CJKAnalyzer extends Analyzer {
* @deprecated use {@link #CJKAnalyzer(Version, Set)} instead
*/
public CJKAnalyzer(Version matchVersion, String... stopWords) {
- stopTable = StopFilter.makeStopSet(matchVersion, stopWords);
- this.matchVersion = matchVersion;
+ super(matchVersion, StopFilter.makeStopSet(matchVersion, stopWords));
}
//~ Methods ----------------------------------------------------------------
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- *
- * @param fieldName lucene field name
- * @param reader input {@link Reader}
- * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
- * {@link StopFilter}
- */
@Override
- public final TokenStream tokenStream(String fieldName, Reader reader) {
- return new StopFilter(matchVersion, new CJKTokenizer(reader), stopTable);
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @param fieldName lucene field name
- * @param reader Input {@link Reader}
- * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
- * {@link StopFilter}
- */
- @Override
- public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- /* tokenStream() is final, no back compat issue */
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new CJKTokenizer(reader);
- streams.result = new StopFilter(matchVersion, streams.source, stopTable);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new CJKTokenizer(reader);
+ return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
index a3d03534ea0..4edfc3b826c 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
@@ -25,8 +25,6 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.AttributeSource.AttributeFactory;
-
/**
* CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
index 48ae4afed5a..2d5c6a7d54f 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
@@ -17,10 +17,11 @@ package org.apache.lucene.analysis.cn;
* limitations under the License.
*/
-import java.io.IOException;
import java.io.Reader;
+
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
@@ -29,49 +30,19 @@ import org.apache.lucene.analysis.Tokenizer;
*
*/
-public final class ChineseAnalyzer extends Analyzer {
+public final class ChineseAnalyzer extends ReusableAnalyzerBase {
- public ChineseAnalyzer() {
- }
-
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link ChineseTokenizer}
- * filtered with {@link ChineseFilter}.
- */
+ /**
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+ * provided {@link Reader}.
+ *
+ * @return {@link TokenStreamComponents} built from a
+ * {@link ChineseTokenizer} filtered with {@link ChineseFilter}
+ */
@Override
- public final TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new ChineseTokenizer(reader);
- result = new ChineseFilter(result);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the
- * provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link ChineseTokenizer}
- * filtered with {@link ChineseFilter}.
- */
- @Override
- public final TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- /* tokenStream() is final, no back compat issue */
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new ChineseTokenizer(reader);
- streams.result = new ChineseFilter(streams.source);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new ChineseTokenizer(reader);
+ return new TokenStreamComponents(source, new ChineseFilter(source));
}
}
\ No newline at end of file
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
index 8c66a11bdc5..804791607c6 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.cz;
* limitations under the License.
*/
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
@@ -30,9 +32,9 @@ import org.apache.lucene.util.Version;
import java.io.*;
import java.util.Arrays;
+import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
-import java.util.Collections;
/**
* {@link Analyzer} for Czech language.
@@ -53,7 +55,7 @@ import java.util.Collections;
* LUCENE-1068)
*
*/
-public final class CzechAnalyzer extends Analyzer {
+public final class CzechAnalyzer extends ReusableAnalyzerBase {
/**
* List of typical stopwords.
@@ -95,10 +97,11 @@ public final class CzechAnalyzer extends Analyzer {
Version.LUCENE_CURRENT, Arrays.asList(CZECH_STOP_WORDS), false));
}
+
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
- // TODO make this final in 3.1
+ // TODO once loadStopWords is gone, those members should be removed too, in favor of StopwordAnalyzerBase
private Set<?> stoptable;
private final Version matchVersion;
@@ -168,6 +171,7 @@ public final class CzechAnalyzer extends Analyzer {
* @deprecated use {@link WordlistLoader#getWordSet(Reader, String) }
* and {@link #CzechAnalyzer(Version, Set)} instead
*/
+ // TODO extend StopwordAnalyzerBase once this method is gone!
public void loadStopWords( InputStream wordfile, String encoding ) {
setPreviousTokenStream(null); // force a new stopfilter to be created
if ( wordfile == null ) {
@@ -191,58 +195,25 @@ public final class CzechAnalyzer extends Analyzer {
stoptable = Collections.emptySet();
}
}
-
/**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
* {@link Reader}.
*
- * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+ * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, and {@link CzechStemFilter} (only if version is
* >= LUCENE_31)
*/
@Override
- public final TokenStream tokenStream( String fieldName, Reader reader ) {
- TokenStream result = new StandardTokenizer( matchVersion, reader );
- result = new StandardFilter( result );
- result = new LowerCaseFilter( matchVersion, result );
- result = new StopFilter( matchVersion, result, stoptable );
- if (matchVersion.onOrAfter(Version.LUCENE_31))
- result = new CzechStemFilter(result);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
- * text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer}
- * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
- * {@link StopFilter}, and {@link CzechStemFilter} (only if version is
- * >= LUCENE_31)
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new StandardFilter(streams.source);
- streams.result = new LowerCaseFilter(matchVersion, streams.result);
- streams.result = new StopFilter( matchVersion, streams.result, stoptable);
- if (matchVersion.onOrAfter(Version.LUCENE_31))
- streams.result = new CzechStemFilter(streams.result);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
- }
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter( matchVersion, result, stoptable);
+ if (matchVersion.onOrAfter(Version.LUCENE_31))
+ result = new CzechStemFilter(result);
+ return new TokenStreamComponents(source, result);
+ }
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
index 5497416a88c..603a41347cb 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@@ -29,13 +29,15 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
/**
@@ -51,7 +53,7 @@ import org.apache.lucene.util.Version;
*
* NOTE: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.
*/
-public final class GermanAnalyzer extends Analyzer {
+public final class GermanAnalyzer extends StopwordAnalyzerBase {
/**
* List of typical german stopwords.
@@ -89,17 +91,13 @@ public final class GermanAnalyzer extends Analyzer {
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
- //TODO make this final in 3.1
- private Set<?> stopSet;
-
+
/**
* Contains words that should be indexed but not stemmed.
*/
// TODO make this final in 3.1
private Set<?> exclusionSet;
- private final Version matchVersion;
-
/**
* Builds an analyzer with the default stop words:
* {@link #getDefaultStopSet()}.
@@ -131,9 +129,8 @@ public final class GermanAnalyzer extends Analyzer {
* a stemming exclusion set
*/
public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
- stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
+ super(matchVersion, stopwords);
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
- this.matchVersion = matchVersion;
}
/**
@@ -187,51 +184,23 @@ public final class GermanAnalyzer extends Analyzer {
exclusionSet = WordlistLoader.getWordSet(exclusionlist);
setPreviousTokenStream(null); // force a new stemmer to be created
}
-
+
/**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+ * provided {@link Reader}.
+ *
+ * @return {@link TokenStreamComponents} built from a
+ * {@link StandardTokenizer} filtered with {@link StandardFilter},
+ * {@link LowerCaseFilter}, {@link StopFilter}, and
* {@link GermanStemFilter}
*/
@Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new StandardTokenizer(matchVersion, reader);
- result = new StandardFilter(result);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
result = new LowerCaseFilter(matchVersion, result);
- result = new StopFilter( matchVersion, result, stopSet);
- result = new GermanStemFilter(result, exclusionSet);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
- * {@link GermanStemFilter}
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new StandardFilter(streams.source);
- streams.result = new LowerCaseFilter(matchVersion, streams.result);
- streams.result = new StopFilter( matchVersion, streams.result, stopSet);
- streams.result = new GermanStemFilter(streams.result, exclusionSet);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ result = new StopFilter( matchVersion, result, stopwords);
+ return new TokenStreamComponents(source, new GermanStemFilter(result, exclusionSet));
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
index 2e69c39d905..808cc207f85 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
@@ -19,14 +19,15 @@ package org.apache.lucene.analysis.el;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
-import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Map;
@@ -43,7 +44,7 @@ import java.util.Set;
*
NOTE: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.
*/
-public final class GreekAnalyzer extends Analyzer
+public final class GreekAnalyzer extends StopwordAnalyzerBase
{
/**
* List of typical Greek stopwords.
@@ -73,13 +74,6 @@ public final class GreekAnalyzer extends Analyzer
Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false));
}
- /**
- * Contains the stopwords used with the {@link StopFilter}.
- */
- private final Set<?> stopSet;
-
- private final Version matchVersion;
-
public GreekAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
}
@@ -93,8 +87,7 @@ public final class GreekAnalyzer extends Analyzer
* a stopword set
*/
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
- stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
@@ -115,47 +108,20 @@ public final class GreekAnalyzer extends Analyzer
{
this(matchVersion, stopwords.keySet());
}
-
- /**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link GreekLowerCaseFilter} and {@link StopFilter}
- */
+
+ /**
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+ * provided {@link Reader}.
+ *
+ * @return {@link TokenStreamComponents} built from a
+ * {@link StandardTokenizer} filtered with
+ * {@link GreekLowerCaseFilter} and {@link StopFilter}
+ */
@Override
- public TokenStream tokenStream(String fieldName, Reader reader)
- {
- TokenStream result = new StandardTokenizer(matchVersion, reader);
- result = new GreekLowerCaseFilter(result);
- result = new StopFilter(matchVersion, result, stopSet);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
- * {@link GreekLowerCaseFilter} and {@link StopFilter}
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new GreekLowerCaseFilter(streams.source);
- streams.result = new StopFilter(matchVersion, streams.result, stopSet);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ final TokenStream result = new GreekLowerCaseFilter(source);
+ return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
index ecef81ace0f..6df9c0b5765 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
@@ -19,17 +19,15 @@ package org.apache.lucene.analysis.fa;
import java.io.File;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
import java.io.Reader;
-import java.util.Collections;
import java.util.Hashtable;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@@ -45,7 +43,7 @@ import org.apache.lucene.util.Version;
* yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
*
*/
-public final class PersianAnalyzer extends Analyzer {
+public final class PersianAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Persian stopwords.
@@ -57,11 +55,6 @@ public final class PersianAnalyzer extends Analyzer {
*/
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
- /**
- * Contains the stopwords used with the StopFilter.
- */
- private final Set<?> stoptable;
-
/**
* The comment character in the stopwords file. All lines prefixed with this
* will be ignored
@@ -85,30 +78,15 @@ public final class PersianAnalyzer extends Analyzer {
static {
try {
- DEFAULT_STOP_SET = loadDefaultStopWordSet();
+ DEFAULT_STOP_SET = loadStopwordSet(false, PersianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
}
}
-
- static Set<String> loadDefaultStopWordSet() throws IOException {
- InputStream stream = PersianAnalyzer.class
- .getResourceAsStream(DEFAULT_STOPWORD_FILE);
- try {
- InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
- // make sure it is unmodifiable as we expose it in the outer class
- return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
- STOPWORDS_COMMENT));
- } finally {
- stream.close();
- }
- }
}
- private final Version matchVersion;
-
/**
* Builds an analyzer with the default stop words:
* {@link #DEFAULT_STOPWORD_FILE}.
@@ -126,8 +104,7 @@ public final class PersianAnalyzer extends Analyzer {
* a stopword set
*/
public PersianAnalyzer(Version matchVersion, Set<?> stopwords){
- stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
@@ -156,18 +133,19 @@ public final class PersianAnalyzer extends Analyzer {
}
/**
- * Creates a {@link TokenStream} which tokenizes all the text in the provided
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
* {@link Reader}.
*
- * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
+ * @return {@link TokenStreamComponents} built from a {@link ArabicLetterTokenizer}
* filtered with {@link LowerCaseFilter},
* {@link ArabicNormalizationFilter},
* {@link PersianNormalizationFilter} and Persian Stop words
*/
@Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new ArabicLetterTokenizer(reader);
- result = new LowerCaseFilter(matchVersion, result);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new ArabicLetterTokenizer(reader);
+ TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
result = new PersianNormalizationFilter(result);
@@ -175,44 +153,6 @@ public final class PersianAnalyzer extends Analyzer {
* the order here is important: the stopword list is normalized with the
* above!
*/
- result = new StopFilter(matchVersion, result, stoptable);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- }
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
- * filtered with {@link LowerCaseFilter},
- * {@link ArabicNormalizationFilter},
- * {@link PersianNormalizationFilter} and Persian Stop words
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new ArabicLetterTokenizer(reader);
- streams.result = new LowerCaseFilter(matchVersion, streams.source);
- streams.result = new ArabicNormalizationFilter(streams.result);
- /* additional persian-specific normalization */
- streams.result = new PersianNormalizationFilter(streams.result);
- /*
- * the order here is important: the stopword list is normalized with the
- * above!
- */
- streams.result = new StopFilter(matchVersion, streams.result, stoptable);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
+ return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
index 885568ab284..cf029412335 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
@@ -20,7 +20,9 @@ package org.apache.lucene.analysis.fr;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
@@ -59,7 +61,7 @@ import java.util.Set;
*
* NOTE: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.
*/
-public final class FrenchAnalyzer extends Analyzer {
+public final class FrenchAnalyzer extends StopwordAnalyzerBase {
/**
* Extended list of typical French stopwords.
@@ -91,18 +93,12 @@ public final class FrenchAnalyzer extends Analyzer {
"été", "être", "ô"
};
- /**
- * Contains the stopwords used with the {@link StopFilter}.
- */
- private final Set<?> stoptable;
/**
* Contains words that should be indexed but not stemmed.
*/
//TODO make this final in 3.0
private Set<?> excltable = Collections.emptySet();
*/
-public final class RussianAnalyzer extends Analyzer
+public final class RussianAnalyzer extends StopwordAnalyzerBase
{
/**
* List of typical Russian stopwords.
@@ -63,13 +64,6 @@ public final class RussianAnalyzer extends Analyzer
Arrays.asList(RUSSIAN_STOP_WORDS), false));
}
- /**
- * Contains the stopwords used with the StopFilter.
- */
- private final Set<?> stopSet;
-
- private final Version matchVersion;
-
public RussianAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
@@ -91,8 +85,7 @@ public final class RussianAnalyzer extends Analyzer
* a stopword set
*/
public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
- stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
- this.matchVersion = matchVersion;
+ super(matchVersion, stopwords);
}
/**
@@ -106,52 +99,21 @@ public final class RussianAnalyzer extends Analyzer
}
/**
- * Creates a {@link TokenStream} which tokenizes all the text in the
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the
* provided {@link Reader}.
*
- * @return A {@link TokenStream} built from a
+ * @return {@link TokenStreamComponents} built from a
* {@link RussianLetterTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter},
* and {@link RussianStemFilter}
*/
@Override
- public TokenStream tokenStream(String fieldName, Reader reader)
- {
- TokenStream result = new RussianLetterTokenizer(reader);
- result = new LowerCaseFilter(matchVersion, result);
- result = new StopFilter(matchVersion, result, stopSet);
- result = new RussianStemFilter(result);
- return result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new RussianLetterTokenizer(reader);
+ TokenStream result = new LowerCaseFilter(matchVersion, source);
+ result = new StopFilter(matchVersion, result, stopwords);
+ return new TokenStreamComponents(source, new RussianStemFilter(result));
+
}
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- /**
- * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
- * in the provided {@link Reader}.
- *
- * @return A {@link TokenStream} built from a
- * {@link RussianLetterTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StopFilter},
- * and {@link RussianStemFilter}
- */
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new RussianLetterTokenizer(reader);
- streams.result = new LowerCaseFilter(matchVersion, streams.source);
- streams.result = new StopFilter(matchVersion, streams.result, stopSet);
- streams.result = new RussianStemFilter(streams.result);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- }
- return streams.result;
- }
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
index 5ea5fd1d351..bace03ee7d6 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
@@ -16,16 +16,18 @@ package org.apache.lucene.analysis.th;
* limitations under the License.
*/
-import java.io.IOException;
import java.io.Reader;
+
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
/**
@@ -35,41 +37,28 @@ import org.apache.lucene.util.Version;
*
* NOTE: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.
*/
-public final class ThaiAnalyzer extends Analyzer {
+public final class ThaiAnalyzer extends ReusableAnalyzerBase {
private final Version matchVersion;
public ThaiAnalyzer(Version matchVersion) {
this.matchVersion = matchVersion;
}
-
+
+ /**
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+ * provided {@link Reader}.
+ *
+ * @return {@link TokenStreamComponents} built from a
+ * {@link StandardTokenizer} filtered with {@link StandardFilter},
+ * {@link ThaiWordFilter}, and {@link StopFilter}
+ */
@Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream ts = new StandardTokenizer(matchVersion, reader);
- ts = new StandardFilter(ts);
- ts = new ThaiWordFilter(ts);
- ts = new StopFilter(matchVersion, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
- return ts;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
-
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new StandardFilter(streams.source);
- streams.result = new ThaiWordFilter(streams.result);
- streams.result = new StopFilter(matchVersion, streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
- setPreviousTokenStream(streams);
- } else {
- streams.source.reset(reader);
- streams.result.reset(); // reset the ThaiWordFilter's state
- }
- return streams.result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new ThaiWordFilter(result);
+ return new TokenStreamComponents(source, new StopFilter(matchVersion,
+ result, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
}
}
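All of the deleted SavedStreams inner classes above were doing the same two jobs: remember the head and the tail of the filter chain, and re-point the head at the next Reader. ThaiAnalyzer additionally had to reset filter state by hand. That bookkeeping now lives in one place, ReusableAnalyzerBase.TokenStreamComponents (defined in the new file below). A rough stand-in for its shape, assumed from the SavedStreams code it replaces rather than copied from the patch:

    import java.io.IOException;
    import java.io.Reader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;

    // Illustrative stand-in for ReusableAnalyzerBase.TokenStreamComponents.
    class TokenStreamComponentsSketch {
      private final Tokenizer source; // head of the chain, re-pointed at each new Reader
      private final TokenStream sink; // tail of the chain, what consumers pull tokens from

      TokenStreamComponentsSketch(Tokenizer source, TokenStream sink) {
        this.source = source;
        this.sink = sink;
      }

      // Called by the base class on reuse instead of rebuilding the chain;
      // mirrors the old "streams.source.reset(reader)" branch. Returning
      // true signals that the cached components could be reused.
      boolean reset(Reader reader) throws IOException {
        source.reset(reader);
        return true;
      }

      TokenStream getTokenStream() {
        return sink;
      }
    }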
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
index 4f7783c4025..73d7d767512 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
@@ -17,10 +17,10 @@ package org.apache.lucene.analysis.ar;
* limitations under the License.
*/
-import java.io.StringReader;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
@@ -78,7 +78,9 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
* Test that custom stopwords work, and are not case-sensitive.
*/
public void testCustomStopwords() throws Exception {
- ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, new String[] { "the", "and", "a" });
+    Set<String> set = new HashSet<String>();
+ Collections.addAll(set, "the", "and", "a");
+ ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, set);
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
"brown", "fox" });
}
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
index 36c57869a3f..51cc740aa53 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
@@ -17,10 +17,12 @@ package org.apache.lucene.analysis.br;
* limitations under the License.
*/
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
/**
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
index 26a7fb2b8ec..34096b58c3c 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
@@ -17,11 +17,8 @@ package org.apache.lucene.analysis.fa;
* limitations under the License.
*/
-import java.io.StringReader;
-
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
diff --git a/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java b/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java
new file mode 100644
index 00000000000..8dc5120c6a6
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * A convenience subclass of Analyzer that makes it easy to implement
+ * {@link TokenStream} reuse.
+ *
+ * ReusableAnalyzerBase is a simplification of Analyzer that supports easy reuse
+ * for the most common use-cases. Analyzers such as
+ * {@link PerFieldAnalyzerWrapper} that behave differently depending upon the
+ * field name need to subclass Analyzer directly instead.
+ *
+ *
+ * To prevent consistency problems, this class does not allow subclasses to
+ * override {@link #reusableTokenStream(String, Reader)} or
+ * {@link #tokenStream(String, Reader)} directly. Instead, subclasses must
+ * implement {@link #createComponents(String, Reader)}.
+ *
+ */
+public abstract class ReusableAnalyzerBase extends Analyzer {
+
+ /**
+ * Creates a new {@link TokenStreamComponents} instance for this analyzer.
+ *
+ * @param fieldName
+ * the name of the field whose content is passed to the
+ * {@link TokenStreamComponents} sink as a reader
+ * @param aReader
+ * the reader passed to the {@link Tokenizer} constructor
+ * @return the {@link TokenStreamComponents} for this analyzer.
+ */
+ protected abstract TokenStreamComponents createComponents(String fieldName,
+ Reader aReader);
+
+ /**
+ * This method uses {@link #createComponents(String, Reader)} to obtain an
+ * instance of {@link TokenStreamComponents}. It returns the sink of the
+ * components and stores the components internally. Subsequent calls to this
+ * method will reuse the previously stored components if and only if the
+ * {@link TokenStreamComponents#reset(Reader)} method returned
+ * true. Otherwise a new instance of
+ * {@link TokenStreamComponents} is created.
+ *
+ * @param fieldName the name of the field the created TokenStream is used for
+ * @param reader the reader the stream's source reads from
+ */
+ @Override
+ public final TokenStream reusableTokenStream(final String fieldName,
+ final Reader reader) throws IOException {
+ TokenStreamComponents streamChain = (TokenStreamComponents)
+ getPreviousTokenStream();
+ if (streamChain == null || !streamChain.reset(reader)) {
+ streamChain = createComponents(fieldName, reader);
+ setPreviousTokenStream(streamChain);
+ }
+ return streamChain.getTokenStream();
+ }
+
+ /**
+ * This method uses {@link #createComponents(String, Reader)} to obtain an
+ * instance of {@link TokenStreamComponents} and returns the sink of the
+ * components. Each call to this method will create a new instance of
+ * {@link TokenStreamComponents}. Created {@link TokenStream} instances are
+ * never reused.
+ *
+ * @param fieldName the name of the field the created TokenStream is used for
+ * @param reader the reader the stream's source reads from
+ */
+ @Override
+ public final TokenStream tokenStream(final String fieldName,
+ final Reader reader) {
+ return createComponents(fieldName, reader).getTokenStream();
+ }
+
+ /**
+ * This class encapsulates the outer components of a token stream. It provides
+ * access to the source ({@link Tokenizer}) and the outer end (sink), the
+ * {@link TokenStream} returned by
+ * {@link Analyzer#tokenStream(String, Reader)} and
+ * {@link Analyzer#reusableTokenStream(String, Reader)}.
+ */
+ public static class TokenStreamComponents {
+ final Tokenizer source;
+ final TokenStream sink;
+
+ /**
+ * Creates a new {@link TokenStreamComponents} instance.
+ *
+ * @param source
+ * the analyzer's tokenizer
+ * @param result
+ * the analyzer's resulting token stream
+ */
+ public TokenStreamComponents(final Tokenizer source,
+ final TokenStream result) {
+ this.source = source;
+ this.sink = result;
+ }
+
+ /**
+ * Creates a new {@link TokenStreamComponents} instance.
+ *
+ * @param source
+ * the analyzer's tokenizer
+ */
+ public TokenStreamComponents(final Tokenizer source) {
+ this.source = source;
+ this.sink = source;
+ }
+
+ /**
+ * Resets the encapsulated components with the given reader. By default this
+ * method returns true, indicating that the components have
+ * been reset successfully. Subclasses of {@link ReusableAnalyzerBase} might use
+ * their own {@link TokenStreamComponents} and return false if
+ * the components cannot be reset.
+ *
+ * @param reader
+ * a reader to reset the source component
+ * @return true if the components were reset, otherwise
+ * false
+ * @throws IOException
+ * if the component's reset method throws an {@link IOException}
+ */
+ protected boolean reset(final Reader reader) throws IOException {
+ source.reset(reader);
+ if(sink != source)
+ sink.reset(); // only reset if the sink reference is different from source
+ return true;
+ }
+
+ /**
+ * Returns the sink {@link TokenStream}
+ *
+ * @return the sink {@link TokenStream}
+ */
+ protected TokenStream getTokenStream() {
+ return sink;
+ }
+
+ }
+
+}
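To make the createComponents contract concrete, here is a minimal hypothetical subclass (not part of the patch); it mirrors the pattern the converted analyzers above follow, assuming the stock WhitespaceTokenizer and StopFilter:

import java.io.Reader;

import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical analyzer: the reuse plumbing is inherited, only the
// chain construction is supplied.
public final class MyAnalyzer extends ReusableAnalyzerBase {
  private final Version matchVersion;

  public MyAnalyzer(Version matchVersion) {
    this.matchVersion = matchVersion;
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new WhitespaceTokenizer(reader);
    TokenStream result = new StopFilter(matchVersion, source,
        StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return new TokenStreamComponents(source, result);
  }
}

All reuse bookkeeping (getPreviousTokenStream/setPreviousTokenStream, resetting the cached chain) is inherited; the subclass describes the chain exactly once.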
diff --git a/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java b/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
index dcf577f0559..fc3b8a3f243 100644
--- a/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java
@@ -18,25 +18,15 @@ package org.apache.lucene.analysis;
*/
import java.io.Reader;
-import java.io.IOException;
/** An {@link Analyzer} that filters {@link LetterTokenizer}
* with {@link LowerCaseFilter} */
-public final class SimpleAnalyzer extends Analyzer {
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new LowerCaseTokenizer(reader);
- }
+public final class SimpleAnalyzer extends ReusableAnalyzerBase {
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
- if (tokenizer == null) {
- tokenizer = new LowerCaseTokenizer(reader);
- setPreviousTokenStream(tokenizer);
- } else
- tokenizer.reset(reader);
- return tokenizer;
+ protected TokenStreamComponents createComponents(final String fieldName,
+ final Reader reader) {
+ return new TokenStreamComponents(new LowerCaseTokenizer(reader));
}
}
diff --git a/src/java/org/apache/lucene/analysis/StopAnalyzer.java b/src/java/org/apache/lucene/analysis/StopAnalyzer.java
index ec45e332e87..9e76b89d773 100644
--- a/src/java/org/apache/lucene/analysis/StopAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/StopAnalyzer.java
@@ -24,6 +24,7 @@ import java.util.Arrays;
import java.util.Set;
import java.util.List;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.util.Version;
/** Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
@@ -38,9 +39,7 @@ import org.apache.lucene.util.Version;
*
*/
-public final class StopAnalyzer extends Analyzer {
- private final Set<?> stopWords;
- private final Version matchVersion;
+public final class StopAnalyzer extends StopwordAnalyzerBase {
/** An unmodifiable set containing some common English words that are not usually useful
for searching.*/
@@ -65,16 +64,14 @@ public final class StopAnalyzer extends Analyzer {
* @param matchVersion See above
*/
public StopAnalyzer(Version matchVersion) {
- stopWords = ENGLISH_STOP_WORDS_SET;
- this.matchVersion = matchVersion;
+ this(matchVersion, ENGLISH_STOP_WORDS_SET);
}
/** Builds an analyzer with the stop words from the given set.
* @param matchVersion See above
* @param stopWords Set of stop words */
public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
- this.stopWords = stopWords;
- this.matchVersion = matchVersion;
+ super(matchVersion, stopWords);
}
/** Builds an analyzer with the stop words from the given file.
@@ -82,8 +79,7 @@ public final class StopAnalyzer extends Analyzer {
* @param matchVersion See above
* @param stopwordsFile File to load stop words from */
public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
- stopWords = WordlistLoader.getWordSet(stopwordsFile);
- this.matchVersion = matchVersion;
+ this(matchVersion, WordlistLoader.getWordSet(stopwordsFile));
}
/** Builds an analyzer with the stop words from the given reader.
@@ -91,34 +87,21 @@ public final class StopAnalyzer extends Analyzer {
* @param matchVersion See above
* @param stopwords Reader to load stop words from */
public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
- stopWords = WordlistLoader.getWordSet(stopwords);
- this.matchVersion = matchVersion;
+ this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
- /** Filters LowerCaseTokenizer with StopFilter. */
+ /**
+ * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
+ *
+ * @return {@link TokenStreamComponents} built from a {@link LowerCaseTokenizer} filtered with
+ * {@link StopFilter}
+ */
@Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new StopFilter(matchVersion,
- new LowerCaseTokenizer(reader), stopWords);
- }
-
- /** Filters LowerCaseTokenizer with StopFilter. */
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
- };
- @Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- SavedStreams streams = (SavedStreams) getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new LowerCaseTokenizer(reader);
- streams.result = new StopFilter(matchVersion,
- streams.source, stopWords);
- setPreviousTokenStream(streams);
- } else
- streams.source.reset(reader);
- return streams.result;
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new LowerCaseTokenizer(reader);
+ return new TokenStreamComponents(source, new StopFilter(matchVersion,
+ source, stopwords));
}
}
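A side effect of routing StopAnalyzer through StopwordAnalyzerBase is that the stop set is defensively copied into an unmodifiable CharArraySet at construction time and exposed read-only. A small usage sketch (assumed, not from the patch):

import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.util.Version;

public class StopAnalyzerSketch {
  public static void main(String[] args) {
    Set<String> custom = new HashSet<String>();
    custom.add("the");
    custom.add("and");
    StopAnalyzer analyzer = new StopAnalyzer(Version.LUCENE_CURRENT, custom);
    // The analyzer holds an immutable copy; mutating 'custom' afterwards
    // does not affect it.
    System.out.println(analyzer.getStopwordSet().size()); // expected: 2
  }
}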
diff --git a/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java b/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
new file mode 100644
index 00000000000..cdb9145db5c
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.util.Version;
+
+/**
+ * Base class for Analyzers that need to make use of stopword sets.
+ *
+ */
+public abstract class StopwordAnalyzerBase extends ReusableAnalyzerBase {
+
+ /**
+ * An immutable stopword set
+ */
+ protected final CharArraySet stopwords;
+
+ protected final Version matchVersion;
+
+ /**
+ * Returns the analyzer's stopword set or an empty set if the analyzer has no
+ * stopwords
+ *
+ * @return the analyzer's stopword set or an empty set if the analyzer has no
+ * stopwords
+ */
+ public Set<?> getStopwordSet() {
+ return stopwords;
+ }
+
+ /**
+ * Creates a new instance initialized with the given stopword set
+ *
+ * @param version
+ * the Lucene version for cross version compatibility
+ * @param stopwords
+ * the analyzer's stopword set
+ */
+ protected StopwordAnalyzerBase(final Version version, final Set<?> stopwords) {
+ /*
+ * no need to call
+ * setOverridesTokenStreamMethod(StopwordAnalyzerBase.class); here, both
+ * tokenStream methods are final in this class.
+ */
+ matchVersion = version;
+ // analyzers should use char array set for stopwords!
+ this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet
+ .unmodifiableSet(CharArraySet.copy(version, stopwords));
+ }
+
+ /**
+ * Creates a new Analyzer with an empty stopword set
+ *
+ * @param version
+ * the Lucene version for cross version compatibility
+ */
+ protected StopwordAnalyzerBase(final Version version) {
+ this(version, null);
+ }
+
+ /**
+ * Creates a CharArraySet from a file resource associated with a class. (See
+ * {@link Class#getResourceAsStream(String)}).
+ *
+ * @param ignoreCase
+ * true if the set should ignore the case of the
+ * stopwords, otherwise false
+ * @param aClass
+ * a class that is associated with the given stopwordResource
+ * @param resource
+ * name of the resource file associated with the given class
+ * @param comment
+ * comment string to ignore in the stopword file
+ * @return a CharArraySet containing the distinct stopwords from the given
+ * file
+ * @throws IOException
+ * if loading the stopwords throws an {@link IOException}
+ */
+ protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
+ final Class<? extends ReusableAnalyzerBase> aClass, final String resource,
+ final String comment) throws IOException {
+ final Set<String> wordSet = WordlistLoader.getWordSet(aClass, resource,
+ comment);
+ final CharArraySet set = new CharArraySet(Version.LUCENE_31, wordSet.size(), ignoreCase);
+ set.addAll(wordSet);
+ return set;
+ }
+
+}
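As a sketch of the intended subclassing pattern, a hypothetical analyzer (class and resource names invented for illustration) can load its default stopwords once via loadStopwordSet and reference the inherited stopwords and matchVersion fields in createComponents:

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

public final class FooAnalyzer extends StopwordAnalyzerBase {
  // Hypothetical resource name, resolved relative to FooAnalyzer.class.
  private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  public FooAnalyzer(Version version) throws IOException {
    super(version, loadStopwordSet(false, FooAnalyzer.class,
        DEFAULT_STOPWORD_FILE, "#"));
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new WhitespaceTokenizer(reader);
    return new TokenStreamComponents(source, new StopFilter(matchVersion,
        source, stopwords));
  }
}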
diff --git a/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java b/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
index 2c2e4c0278b..edb6de1210c 100644
--- a/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
@@ -18,24 +18,14 @@ package org.apache.lucene.analysis;
*/
import java.io.Reader;
-import java.io.IOException;
/** An Analyzer that uses {@link WhitespaceTokenizer}. */
-public final class WhitespaceAnalyzer extends Analyzer {
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new WhitespaceTokenizer(reader);
- }
+public final class WhitespaceAnalyzer extends ReusableAnalyzerBase {
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
- Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
- if (tokenizer == null) {
- tokenizer = new WhitespaceTokenizer(reader);
- setPreviousTokenStream(tokenizer);
- } else
- tokenizer.reset(reader);
- return tokenizer;
+ protected TokenStreamComponents createComponents(final String fieldName,
+ final Reader reader) {
+ return new TokenStreamComponents(new WhitespaceTokenizer(reader));
}
}
diff --git a/src/java/org/apache/lucene/analysis/WordlistLoader.java b/src/java/org/apache/lucene/analysis/WordlistLoader.java
index f071bb606aa..051a578288d 100644
--- a/src/java/org/apache/lucene/analysis/WordlistLoader.java
+++ b/src/java/org/apache/lucene/analysis/WordlistLoader.java
@@ -21,15 +21,69 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Set;
/**
* Loader for text files that represent a list of stopwords.
*/
public class WordlistLoader {
-
+
+ /**
+ * Loads a text file associated with a given class (See
+ * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
+ * to a {@link Set} (omitting leading and trailing whitespace). Every line of
+ * the file should contain only one word. The words need to be in lower-case if
+ * you make use of an Analyzer which uses LowerCaseFilter (like
+ * StandardAnalyzer).
+ *
+ * @param aClass
+ * a class that is associated with the given stopwordResource
+ * @param stopwordResource
+ * name of the resource file associated with the given class
+ * @return a {@link Set} with the file's words
+ */
+ public static Set<String> getWordSet(Class<?> aClass, String stopwordResource)
+ throws IOException {
+ final Reader reader = new BufferedReader(new InputStreamReader(aClass
+ .getResourceAsStream(stopwordResource), "UTF-8"));
+ try {
+ return getWordSet(reader);
+ } finally {
+ reader.close();
+ }
+ }
+
+ /**
+ * Loads a text file associated with a given class (See
+ * {@link Class#getResourceAsStream(String)}) and adds every line as an entry
+ * to a {@link Set} (omitting leading and trailing whitespace). Every line of
+ * the file should contain only one word. The words need to be in lower-case if
+ * you make use of an Analyzer which uses LowerCaseFilter (like
+ * StandardAnalyzer).
+ *
+ * @param aClass
+ * a class that is associated with the given stopwordResource
+ * @param stopwordResource
+ * name of the resource file associated with the given class
+ * @param comment
+ * the comment string to ignore
+ * @return a {@link Set} with the file's words
+ */
+ public static Set<String> getWordSet(Class<?> aClass,
+ String stopwordResource, String comment) throws IOException {
+ final Reader reader = new BufferedReader(new InputStreamReader(aClass
+ .getResourceAsStream(stopwordResource), "UTF-8"));
+ try {
+ return getWordSet(reader, comment);
+ } finally {
+ reader.close();
+ }
+ }
+
/**
* Loads a text file and adds every line as an entry to a HashSet (omitting
* leading and trailing whitespace). Every line of the file should contain only
@@ -40,17 +94,15 @@ public class WordlistLoader {
* @return A HashSet with the file's words
*/
public static HashSet<String> getWordSet(File wordfile) throws IOException {
- HashSet<String> result = new HashSet<String>();
FileReader reader = null;
try {
reader = new FileReader(wordfile);
- result = getWordSet(reader);
+ return getWordSet(reader);
}
finally {
if (reader != null)
reader.close();
}
- return result;
}
/**
@@ -64,17 +116,15 @@ public class WordlistLoader {
* @return A HashSet with the file's words
*/
public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
- HashSet<String> result = new HashSet<String>();
FileReader reader = null;
try {
reader = new FileReader(wordfile);
- result = getWordSet(reader, comment);
+ return getWordSet(reader, comment);
}
finally {
if (reader != null)
reader.close();
}
- return result;
}
@@ -88,7 +138,7 @@ public class WordlistLoader {
* @return A HashSet with the reader's words
*/
public static HashSet<String> getWordSet(Reader reader) throws IOException {
- HashSet<String> result = new HashSet<String>();
+ final HashSet<String> result = new HashSet<String>();
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
@@ -119,7 +169,7 @@ public class WordlistLoader {
* @return A HashSet with the reader's words
*/
public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
- HashSet<String> result = new HashSet<String>();
+ final HashSet<String> result = new HashSet<String>();
BufferedReader br = null;
try {
if (reader instanceof BufferedReader) {
@@ -154,21 +204,18 @@ public class WordlistLoader {
public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
if (wordstemfile == null)
throw new NullPointerException("wordstemfile may not be null");
- HashMap<String, String> result = new HashMap<String, String>();
+ final HashMap<String, String> result = new HashMap<String, String>();
BufferedReader br = null;
- FileReader fr = null;
+
try {
- fr = new FileReader(wordstemfile);
- br = new BufferedReader(fr);
+ br = new BufferedReader(new FileReader(wordstemfile));
String line;
while ((line = br.readLine()) != null) {
String[] wordstem = line.split("\t", 2);
result.put(wordstem[0], wordstem[1]);
}
} finally {
- if (fr != null)
- fr.close();
- if (br != null)
+ if(br != null)
br.close();
}
return result;
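The new class-relative overloads above are what StopwordAnalyzerBase.loadStopwordSet builds on. A minimal sketch (class and resource names invented) of loading a UTF-8 word list whose comment lines start with '#':

import java.util.Set;

import org.apache.lucene.analysis.WordlistLoader;

public class WordlistSketch {
  public static void main(String[] args) throws Exception {
    // Resolves "words.txt" relative to WordlistSketch.class (hypothetical
    // resource); lines beginning with "#" are ignored.
    Set<String> words = WordlistLoader.getWordSet(WordlistSketch.class,
        "words.txt", "#");
    System.out.println(words);
  }
}

The two fixture files added below exercise exactly this pair of overloads, with and without comment handling.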
diff --git a/src/test/org/apache/lucene/index/wordliststopwords.txt b/src/test/org/apache/lucene/index/wordliststopwords.txt
new file mode 100644
index 00000000000..7d3550734e7
--- /dev/null
+++ b/src/test/org/apache/lucene/index/wordliststopwords.txt
@@ -0,0 +1,5 @@
+#comment
+ONE
+two
+#comment
+three
diff --git a/src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt b/src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt
new file mode 100644
index 00000000000..59cb04ec465
--- /dev/null
+++ b/src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt
@@ -0,0 +1,3 @@
+ONE
+two
+three