From a94983686982747059e2a8a668747e5e80e849f6 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Sun, 3 Jan 2010 08:48:17 +0000
Subject: [PATCH] LUCENE-2034: Refactor analyzer reuse and stopword handling

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@895339 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                        |   6 +
 .../lucene/analysis/ar/ArabicAnalyzer.java         |  88 ++--------
 .../lucene/analysis/bg/BulgarianAnalyzer.java      |  93 +++-------
 .../lucene/analysis/br/BrazilianAnalyzer.java      |  84 +++------
 .../lucene/analysis/cjk/CJKAnalyzer.java           |  60 +------
 .../lucene/analysis/cjk/CJKTokenizer.java          |   2 -
 .../lucene/analysis/cn/ChineseAnalyzer.java        |  59 ++-----
 .../lucene/analysis/cz/CzechAnalyzer.java          |  67 ++-----
 .../lucene/analysis/de/GermanAnalyzer.java         |  69 ++------
 .../lucene/analysis/el/GreekAnalyzer.java          |  70 ++------
 .../lucene/analysis/fa/PersianAnalyzer.java        |  84 ++-------
 .../lucene/analysis/fr/FrenchAnalyzer.java         |  62 ++-----
 .../lucene/analysis/ru/RussianAnalyzer.java        |  64 ++-----
 .../lucene/analysis/th/ThaiAnalyzer.java           |  53 +++---
 .../analysis/ar/TestArabicAnalyzer.java            |  10 +-
 .../analysis/br/TestBrazilianStemmer.java          |   6 +-
 .../analysis/fa/TestPersianAnalyzer.java           |   3 -
 .../lucene/analysis/ReusableAnalyzerBase.java      | 163 ++++++++++++++++++
 .../lucene/analysis/SimpleAnalyzer.java            |  18 +-
 .../apache/lucene/analysis/StopAnalyzer.java       |  51 ++----
 .../lucene/analysis/StopwordAnalyzerBase.java      | 110 ++++++++++++
 .../lucene/analysis/WhitespaceAnalyzer.java        |  18 +-
 .../lucene/analysis/WordlistLoader.java            |  79 +++++++--
 .../apache/lucene/index/wordliststopwords.txt      |   5 +
 .../index/wordliststopwords_nocomment.txt          |   3 +
 25 files changed, 584 insertions(+), 743 deletions(-)
 create mode 100644 src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java
 create mode 100644 src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java
 create mode 100644 src/test/org/apache/lucene/index/wordliststopwords.txt
 create mode 100644 src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt

diff --git a/CHANGES.txt b/CHANGES.txt
index 02c2b97e7ec..a93a8c4ba0e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -105,6 +105,12 @@ New features
   backwards compatibility. If Version < 3.1 is passed to the constructor,
   LowerCaseFilter yields the old behavior. (Simon Willnauer, Robert Muir)
 
+* LUCENE-2034: Added ReusableAnalyzerBase, an abstract subclass of Analyzer
+  that makes it easier to reuse TokenStreams correctly. This issue also added
+  StopwordAnalyzerBase, which improves consistency of all Analyzers that use
+  stopwords, and many contrib analyzers were reimplemented on top of it.
+  (Simon Willnauer via Robert Muir)
+
 Optimizations
 
 * LUCENE-2086: When resolving deleted terms, do so in term sort order

diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
index e4036f8873e..aad6aa3a004 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
@@ -19,17 +19,15 @@ package org.apache.lucene.analysis.ar;
 
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.Collections;
 import java.util.Hashtable;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
@@ -52,7 +50,7 @@ import org.apache.lucene.util.Version;
  *
  *
  */
-public final class ArabicAnalyzer extends Analyzer {
+public final class ArabicAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * File containing default Arabic stopwords.
@@ -62,21 +60,18 @@ public final class ArabicAnalyzer extends Analyzer {
    */
   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
 
-  /**
-   * Contains the stopwords used with the StopFilter.
-   */
-  private final Set stoptable;
   /**
    * The comment character in the stopwords file.  All lines prefixed with this will be ignored
    * @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
    */
+  // TODO make this private
   public static final String STOPWORDS_COMMENT = "#";
 
   /**
    * Returns an unmodifiable instance of the default stop-words set.
    * @return an unmodifiable instance of the default stop-words set.
    */
-  public static Set<String> getDefaultStopSet(){
+  public static Set<?> getDefaultStopSet(){
     return DefaultSetHolder.DEFAULT_STOP_SET;
   }
 
@@ -85,34 +80,19 @@ public final class ArabicAnalyzer extends Analyzer {
    * accesses the static final set the first time.;
    */
   private static class DefaultSetHolder {
-    static final Set<String> DEFAULT_STOP_SET;
+    static final Set<?> DEFAULT_STOP_SET;
 
     static {
       try {
-        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+        DEFAULT_STOP_SET = loadStopwordSet(false, ArabicAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
       } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
         throw new RuntimeException("Unable to load default stopword set");
       }
     }
-
-    static Set loadDefaultStopWordSet() throws IOException {
-      InputStream stream = ArabicAnalyzer.class
-          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      try {
-        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        // make sure it is unmodifiable as we expose it in the outer class
-        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
-            STOPWORDS_COMMENT));
-      } finally {
-        stream.close();
-      }
-    }
   }
 
-  private final Version matchVersion;
-
   /**
    * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
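   * <p>
   * A usage sketch (illustrative only; the field name and input text are
   * arbitrary):
   * <pre>
   *   ArabicAnalyzer analyzer = new ArabicAnalyzer(Version.LUCENE_31);
   *   TokenStream stream = analyzer.reusableTokenStream("content",
   *       new StringReader("..."));
   * </pre>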
   */
@@ -129,8 +109,7 @@ public final class ArabicAnalyzer extends Analyzer {
    *          a stopword set
    */
   public ArabicAnalyzer(Version matchVersion, Set stopwords){
-    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopwords);
   }
 
   /**
@@ -159,54 +138,21 @@ public final class ArabicAnalyzer extends Analyzer {
 
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
+   * @return {@link TokenStreamComponents} built from an {@link ArabicLetterTokenizer} filtered with
    *         {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
    *         and {@link ArabicStemFilter}.
    */
   @Override
-  public final TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new ArabicLetterTokenizer( reader );
-    result = new LowerCaseFilter(matchVersion, result);
-    // the order here is important: the stopword list is not normalized!
-    result = new StopFilter( matchVersion, result, stoptable );
-    result = new ArabicNormalizationFilter( result );
-    result = new ArabicStemFilter( result );
-
-    return result;
-  }
-
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
-   * in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
-   *         {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
-   *         and {@link ArabicStemFilter}.
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new ArabicLetterTokenizer(reader);
-      streams.result = new LowerCaseFilter(matchVersion, streams.source);
-      // the order here is important: the stopword list is not normalized!
-      streams.result = new StopFilter( matchVersion, streams.result, stoptable);
-      streams.result = new ArabicNormalizationFilter(streams.result);
-      streams.result = new ArabicStemFilter(streams.result);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new ArabicLetterTokenizer(reader);
+    TokenStream result = new LowerCaseFilter(matchVersion, source);
+    // the order here is important: the stopword list is not normalized!
+    result = new StopFilter( matchVersion, result, stopwords);
+    result = new ArabicNormalizationFilter(result);
+    return new TokenStreamComponents(source, new ArabicStemFilter(result));
   }
 }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
index 3e93e832dae..7abacf169e4 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
@@ -17,17 +17,16 @@ package org.apache.lucene.analysis.bg;
  * limitations under the License.
  */
 
+import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.Collections;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
@@ -43,7 +42,7 @@ import org.apache.lucene.util.Version;
  * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
 * <p>
 */
-public final class BulgarianAnalyzer extends Analyzer {
+public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * File containing default Bulgarian stopwords.
@@ -54,14 +53,12 @@ public final class BulgarianAnalyzer extends Analyzer {
    */
   public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
 
-  /**
-   * Contains the stopwords used with the StopFilter.
-   */
-  private final Set stoptable;
   /**
    * The comment character in the stopwords file. All lines prefixed with this
    * will be ignored
+   * @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
    */
+  //TODO make this private
   public static final String STOPWORDS_COMMENT = "#";
 
   /**
@@ -69,7 +66,7 @@ public final class BulgarianAnalyzer extends Analyzer {
    *
    * @return an unmodifiable instance of the default stop-words set.
    */
-  public static Set<String> getDefaultStopSet() {
+  public static Set<?> getDefaultStopSet() {
     return DefaultSetHolder.DEFAULT_STOP_SET;
   }
 
@@ -78,35 +75,19 @@ public final class BulgarianAnalyzer extends Analyzer {
    * class accesses the static final set the first time.;
    */
   private static class DefaultSetHolder {
-    static final Set<String> DEFAULT_STOP_SET;
+    static final Set<?> DEFAULT_STOP_SET;
 
     static {
       try {
-        DEFAULT_STOP_SET = loadDefaultStopWordSet();
-      } catch (Exception ex) {
+        DEFAULT_STOP_SET = loadStopwordSet(false, BulgarianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+      } catch (IOException ex) {
         // default set should always be present as it is part of the
         // distribution (JAR)
-        throw new RuntimeException("Unable to load default stopword set", ex);
-      }
-    }
-
-    static Set loadDefaultStopWordSet() throws IOException {
-      final InputStream stream = BulgarianAnalyzer.class
-          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
-      try {
-        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
-        // make sure it is unmodifiable as we expose it in the outer class
-        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
-            STOPWORDS_COMMENT));
-      } finally {
-        if(stream != null)
-          stream.close();
+        throw new RuntimeException("Unable to load default stopword set");
       }
     }
   }
-
-  private final Version matchVersion;
-
+
   /**
    * Builds an analyzer with the default stop words:
    * {@link #DEFAULT_STOPWORD_FILE}.
@@ -119,58 +100,24 @@ public final class BulgarianAnalyzer extends Analyzer {
    * Builds an analyzer with the given stop words.
    */
   public BulgarianAnalyzer(Version matchVersion, Set stopwords) {
-    super();
-    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
-        stopwords));
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopwords);
   }
 
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
    * {@link Reader}.
    *
-   * @return A {@link TokenStream} built from an {@link StandardTokenizer}
+   * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
    *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
    *         {@link StopFilter}, and {@link BulgarianStemFilter}.
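   * <p>
   * Note that the components are reused for each new {@link Reader} on the
   * same thread, so a sketch like this (illustrative only) builds the filter
   * chain just once:
   * <pre>
   *   BulgarianAnalyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_31);
   *   TokenStream first = analyzer.reusableTokenStream("f", new StringReader("..."));
   *   TokenStream second = analyzer.reusableTokenStream("f", new StringReader("..."));
   *   // first == second: the cached components were reset to the new reader
   * </pre>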
   */
   @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(matchVersion, reader);
-    result = new StandardFilter(result);
+  public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
     result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stoptable);
+    result = new StopFilter(matchVersion, result, stopwords);
     result = new BulgarianStemFilter(result);
-    return result;
-  }
-
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
-   * text in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from an {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, and {@link BulgarianStemFilter}.
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new StandardFilter(streams.source);
-      streams.result = new LowerCaseFilter(matchVersion, streams.result);
-      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
-      streams.result = new BulgarianStemFilter(streams.result);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+    return new TokenStreamComponents(source, result);
   }
 }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
index 44299f639f9..3230ec293e0 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
@@ -21,19 +21,21 @@ import java.io.File;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
-import java.util.Collections;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;  // for javadoc
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.util.Version;
@@ -49,7 +51,7 @@ import org.apache.lucene.util.Version;
 *
 * <p><b>NOTE</b>: This class uses the same {@link Version}
 * dependent settings as {@link StandardAnalyzer}.</p>
 */
-public final class BrazilianAnalyzer extends Analyzer {
+public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * List of typical Brazilian Portuguese stopwords.
@@ -91,19 +93,13 @@ public final class BrazilianAnalyzer extends Analyzer {
         Arrays.asList(BRAZILIAN_STOP_WORDS), false));
   }
 
-  /**
-   * Contains the stopwords used with the {@link StopFilter}.
-   */
-  private final Set stoptable;
-
+
   /**
    * Contains words that should be indexed but not stemmed.
    */
   // TODO make this private in 3.1
   private Set excltable = Collections.emptySet();
 
-  private final Version matchVersion;
-
   /**
    * Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
   */
@@ -120,8 +116,7 @@ public final class BrazilianAnalyzer extends Analyzer {
    *          a stopword set
   */
   public BrazilianAnalyzer(Version matchVersion, Set stopwords) {
-    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopwords);
   }
 
   /**
@@ -188,53 +183,22 @@ public final class BrazilianAnalyzer extends Analyzer {
     excltable = WordlistLoader.getWordSet( exclusionlist );
     setPreviousTokenStream(null); // force a new stemmer to be created
   }
-
-  /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-   *         {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
-   *         {@link BrazilianStemFilter}.
-   */
-  @Override
-  public final TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer( matchVersion, reader );
-    result = new LowerCaseFilter( matchVersion, result );
-    result = new StandardFilter( result );
-    result = new StopFilter( matchVersion, result, stoptable );
-    result = new BrazilianStemFilter( result, excltable );
-    return result;
-  }
-
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
-   * in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-   *         {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
-   *         {@link BrazilianStemFilter}.
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new LowerCaseFilter(matchVersion, streams.source);
-      streams.result = new StandardFilter(streams.result);
-      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
-      streams.result = new BrazilianStemFilter(streams.result, excltable);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
-  }
+  /**
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
+   *
+   * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer} filtered with
+   *         {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
+   *         {@link BrazilianStemFilter}.
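+   * <p>
+   * For example (an illustrative sketch; the stopword file name is
+   * hypothetical):
+   * <pre>
+   *   Set stopwords = WordlistLoader.getWordSet(new File("stopwords_br.txt"));
+   *   BrazilianAnalyzer analyzer = new BrazilianAnalyzer(Version.LUCENE_31, stopwords);
+   * </pre>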
+   */
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new LowerCaseFilter(matchVersion, source);
+    result = new StandardFilter(result);
+    result = new StopFilter(matchVersion, result, stopwords);
+    return new TokenStreamComponents(source, new BrazilianStemFilter(result,
+        excltable));
+  }
 }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
index 1ab7c228fa1..dc2df4c2d38 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
@@ -19,12 +19,12 @@ package org.apache.lucene.analysis.cjk;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util.Version;
 
-import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.Set;
@@ -35,7 +35,7 @@ import java.util.Set;
 * filters with {@link StopFilter}
 *
 */
-public final class CJKAnalyzer extends Analyzer {
+public final class CJKAnalyzer extends StopwordAnalyzerBase {
   //~ Static fields/initializers ---------------------------------------------
 
   /**
@@ -71,11 +71,6 @@ public final class CJKAnalyzer extends Analyzer {
         .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS),
             false));
   }
-  /**
-   * stop word list
-   */
-  private final Set stopTable;
-  private final Version matchVersion;
 
   //~ Constructors -----------------------------------------------------------
 
@@ -95,8 +90,7 @@ public final class CJKAnalyzer extends Analyzer {
    *          a stopword set
   */
   public CJKAnalyzer(Version matchVersion, Set stopwords){
-    stopTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopwords);
   }
 
   /**
@@ -106,51 +100,15 @@ public final class CJKAnalyzer extends Analyzer {
    * @deprecated use {@link #CJKAnalyzer(Version, Set)} instead
   */
   public CJKAnalyzer(Version matchVersion, String... stopWords) {
-    stopTable = StopFilter.makeStopSet(matchVersion, stopWords);
-    this.matchVersion = matchVersion;
+    super(matchVersion, StopFilter.makeStopSet(matchVersion, stopWords));
   }
 
   //~ Methods ----------------------------------------------------------------
 
-  /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-   *
-   * @param fieldName lucene field name
-   * @param reader    input {@link Reader}
-   * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
-   *         {@link StopFilter}
-   */
   @Override
-  public final TokenStream tokenStream(String fieldName, Reader reader) {
-    return new StopFilter(matchVersion, new CJKTokenizer(reader), stopTable);
-  }
-
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
-   * in the provided {@link Reader}.
-   *
-   * @param fieldName lucene field name
-   * @param reader    Input {@link Reader}
-   * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
-   *         {@link StopFilter}
-   */
-  @Override
-  public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-    /* tokenStream() is final, no back compat issue */
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new CJKTokenizer(reader);
-      streams.result = new StopFilter(matchVersion, streams.source, stopTable);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new CJKTokenizer(reader);
+    return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
   }
 }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
index a3d03534ea0..4edfc3b826c 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
@@ -25,8 +25,6 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.AttributeSource.AttributeFactory;
-
 
 /**
  * CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
index 48ae4afed5a..2d5c6a7d54f 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
@@ -17,10 +17,11 @@ package org.apache.lucene.analysis.cn;
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.io.Reader;
+
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 
 /**
@@ -29,49 +30,19 @@ import org.apache.lucene.analysis.Tokenizer;
 *
 */
 
-public final class ChineseAnalyzer extends Analyzer {
+public final class ChineseAnalyzer extends ReusableAnalyzerBase {
 
-    public ChineseAnalyzer() {
-    }
-
-    /**
-     * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-     *
-     * @return A {@link TokenStream} built from a {@link ChineseTokenizer}
-     *         filtered with {@link ChineseFilter}.
-     */
+  /**
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+   * provided {@link Reader}.
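+   * <p>
+   * This analyzer takes no stopwords, so a usage sketch (illustrative only)
+   * is simply:
+   * <pre>
+   *   Analyzer analyzer = new ChineseAnalyzer();
+   *   TokenStream stream = analyzer.reusableTokenStream("text", new StringReader("..."));
+   * </pre>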
+   *
+   * @return {@link TokenStreamComponents} built from a
+   *         {@link ChineseTokenizer} filtered with {@link ChineseFilter}
+   */
   @Override
-  public final TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new ChineseTokenizer(reader);
-    result = new ChineseFilter(result);
-    return result;
-  }
-
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in the
-   * provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link ChineseTokenizer}
-   *         filtered with {@link ChineseFilter}.
-   */
-  @Override
-  public final TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    /* tokenStream() is final, no back compat issue */
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new ChineseTokenizer(reader);
-      streams.result = new ChineseFilter(streams.source);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new ChineseTokenizer(reader);
+    return new TokenStreamComponents(source, new ChineseFilter(source));
   }
 }
\ No newline at end of file
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
index 8c66a11bdc5..804791607c6 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.cz;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
@@ -30,9 +32,9 @@ import org.apache.lucene.util.Version;
 
 import java.io.*;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
-import java.util.Collections;
 
 /**
 * {@link Analyzer} for Czech language.
@@ -53,7 +55,7 @@ import java.util.Collections;
 *   LUCENE-1068)
 *
 */
-public final class CzechAnalyzer extends Analyzer {
+public final class CzechAnalyzer extends ReusableAnalyzerBase {
 
   /**
    * List of typical stopwords.
@@ -95,10 +97,11 @@ public final class CzechAnalyzer extends Analyzer {
             Version.LUCENE_CURRENT, Arrays.asList(CZECH_STOP_WORDS), false));
   }
 
+
   /**
    * Contains the stopwords used with the {@link StopFilter}.
    */
-  // TODO make this final in 3.1
+  // TODO once loadStopWords is gone these members should be removed too in favor of StopwordAnalyzerBase
   private Set stoptable;
 
   private final Version matchVersion;
@@ -168,6 +171,7 @@ public final class CzechAnalyzer extends Analyzer {
    * @deprecated use {@link WordlistLoader#getWordSet(Reader, String) }
    *             and {@link #CzechAnalyzer(Version, Set)} instead
   */
+  // TODO extend StopwordAnalyzerBase once this method is gone!
  public void loadStopWords( InputStream wordfile, String encoding ) {
    setPreviousTokenStream(null); // force a new stopfilter to be created
    if ( wordfile == null ) {
@@ -191,58 +195,25 @@ public final class CzechAnalyzer extends Analyzer {
       stoptable = Collections.emptySet();
     }
   }
-
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
    * {@link Reader}.
    *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+   * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
    *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
    *         {@link StopFilter}, and {@link CzechStemFilter} (only if version is
    *         >= LUCENE_31)
   */
   @Override
-  public final TokenStream tokenStream( String fieldName, Reader reader ) {
-    TokenStream result = new StandardTokenizer( matchVersion, reader );
-    result = new StandardFilter( result );
-    result = new LowerCaseFilter( matchVersion, result );
-    result = new StopFilter( matchVersion, result, stoptable );
-    if (matchVersion.onOrAfter(Version.LUCENE_31))
-      result = new CzechStemFilter(result);
-    return result;
-  }
-
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
-   * text in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, and {@link CzechStemFilter} (only if version is
-   *         >= LUCENE_31)
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new StandardFilter(streams.source);
-      streams.result = new LowerCaseFilter(matchVersion, streams.result);
-      streams.result = new StopFilter( matchVersion, streams.result, stoptable);
-      if (matchVersion.onOrAfter(Version.LUCENE_31))
-        streams.result = new CzechStemFilter(streams.result);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
-  }
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter( matchVersion, result, stoptable);
+    if (matchVersion.onOrAfter(Version.LUCENE_31))
+      result = new CzechStemFilter(result);
+    return new TokenStreamComponents(source, result);
+  }
 }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
index 5497416a88c..603a41347cb 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@@ -29,13 +29,15 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;  // for javadoc
 import org.apache.lucene.util.Version;
 
 /**
@@ -51,7 +53,7 @@ import org.apache.lucene.util.Version;
 *
 * <p><b>NOTE</b>: This class uses the same {@link Version}
 * dependent settings as {@link StandardAnalyzer}.</p>
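 *
 * <p>An exclusion-list sketch (illustrative only; the exclusion word is
 * arbitrary):
 * <pre>
 *   Set exclusions = new HashSet(Arrays.asList("hausaufgabe"));
 *   GermanAnalyzer analyzer = new GermanAnalyzer(Version.LUCENE_31,
 *       GermanAnalyzer.getDefaultStopSet(), exclusions);
 * </pre>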
 */
-public final class GermanAnalyzer extends Analyzer {
+public final class GermanAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * List of typical german stopwords.
@@ -89,17 +91,13 @@ public final class GermanAnalyzer extends Analyzer {
   /**
    * Contains the stopwords used with the {@link StopFilter}.
    */
-  //TODO make this final in 3.1
-  private Set stopSet;
-
+
   /**
    * Contains words that should be indexed but not stemmed.
    */
   // TODO make this final in 3.1
   private Set exclusionSet;
 
-  private final Version matchVersion;
-
   /**
    * Builds an analyzer with the default stop words:
    * {@link #getDefaultStopSet()}.
@@ -131,9 +129,8 @@ public final class GermanAnalyzer extends Analyzer {
    *          a stemming exclusion set
   */
   public GermanAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionSet) {
-    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
+    super(matchVersion, stopwords);
     exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
-    this.matchVersion = matchVersion;
   }
 
   /**
@@ -187,51 +184,23 @@ public final class GermanAnalyzer extends Analyzer {
     exclusionSet = WordlistLoader.getWordSet(exclusionlist);
     setPreviousTokenStream(null); // force a new stemmer to be created
   }
-
+
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+   * provided {@link Reader}.
+   *
+   * @return {@link TokenStreamComponents} built from a
+   *         {@link StandardTokenizer} filtered with {@link StandardFilter},
+   *         {@link LowerCaseFilter}, {@link StopFilter}, and
    *         {@link GermanStemFilter}
   */
   @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(matchVersion, reader);
-    result = new StandardFilter(result);
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
     result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter( matchVersion, result, stopSet);
-    result = new GermanStemFilter(result, exclusionSet);
-    return result;
-  }
-
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
-   * in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, and
-   *         {@link GermanStemFilter}
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new StandardFilter(streams.source);
-      streams.result = new LowerCaseFilter(matchVersion, streams.result);
-      streams.result = new StopFilter( matchVersion, streams.result, stopSet);
-      streams.result = new GermanStemFilter(streams.result, exclusionSet);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+    result = new StopFilter( matchVersion, result, stopwords);
+    return new TokenStreamComponents(source, new GermanStemFilter(result, exclusionSet));
   }
 }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
index 2e69c39d905..808cc207f85 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
@@ -19,14 +19,15 @@ package org.apache.lucene.analysis.el;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
 import org.apache.lucene.util.Version;
 
-import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.Map;
@@ -43,7 +44,7 @@ import java.util.Set;
 *
 * <p><b>NOTE</b>: This class uses the same {@link Version}
 * dependent settings as {@link StandardAnalyzer}.</p>
 */
-public final class GreekAnalyzer extends Analyzer
+public final class GreekAnalyzer extends StopwordAnalyzerBase
 {
   /**
    * List of typical Greek stopwords.
@@ -73,13 +74,6 @@ public final class GreekAnalyzer extends Analyzer
             Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false));
   }
 
-  /**
-   * Contains the stopwords used with the {@link StopFilter}.
-   */
-  private final Set stopSet;
-
-  private final Version matchVersion;
-
   public GreekAnalyzer(Version matchVersion) {
     this(matchVersion, DefaultSetHolder.DEFAULT_SET);
   }
@@ -93,8 +87,7 @@ public final class GreekAnalyzer extends Analyzer
    *          a stopword set
   */
   public GreekAnalyzer(Version matchVersion, Set stopwords) {
-    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopwords);
   }
 
   /**
@@ -115,47 +108,20 @@ public final class GreekAnalyzer extends Analyzer
   {
     this(matchVersion, stopwords.keySet());
   }
-
-  /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
-   *         {@link GreekLowerCaseFilter} and {@link StopFilter}
-   */
+
+  /**
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
+   * provided {@link Reader}.
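+   * <p>
+   * A usage sketch (illustrative only); stop words are matched after
+   * {@link GreekLowerCaseFilter} has lower-cased the tokens:
+   * <pre>
+   *   GreekAnalyzer analyzer = new GreekAnalyzer(Version.LUCENE_31);
+   *   TokenStream stream = analyzer.reusableTokenStream("body", new StringReader("..."));
+   * </pre>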
- * - * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with - * {@link GreekLowerCaseFilter} and {@link StopFilter} - */ - @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) - throws IOException { - SavedStreams streams = (SavedStreams) getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - streams.source = new StandardTokenizer(matchVersion, reader); - streams.result = new GreekLowerCaseFilter(streams.source); - streams.result = new StopFilter(matchVersion, streams.result, stopSet); - setPreviousTokenStream(streams); - } else { - streams.source.reset(reader); - } - return streams.result; + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + final Tokenizer source = new StandardTokenizer(matchVersion, reader); + final TokenStream result = new GreekLowerCaseFilter(source); + return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)); } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java index ecef81ace0f..6df9c0b5765 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java @@ -19,17 +19,15 @@ package org.apache.lucene.analysis.fa; import java.io.File; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.io.Reader; -import java.util.Collections; import java.util.Hashtable; import java.util.Set; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.StopwordAnalyzerBase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WordlistLoader; @@ -45,7 +43,7 @@ import org.apache.lucene.util.Version; * yeh and keheh) are standardized. "Stemming" is accomplished via stopwords. *

*/ -public final class PersianAnalyzer extends Analyzer { +public final class PersianAnalyzer extends StopwordAnalyzerBase { /** * File containing default Persian stopwords. @@ -57,11 +55,6 @@ public final class PersianAnalyzer extends Analyzer { */ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; - /** - * Contains the stopwords used with the StopFilter. - */ - private final Set stoptable; - /** * The comment character in the stopwords file. All lines prefixed with this * will be ignored @@ -85,30 +78,15 @@ public final class PersianAnalyzer extends Analyzer { static { try { - DEFAULT_STOP_SET = loadDefaultStopWordSet(); + DEFAULT_STOP_SET = loadStopwordSet(false, PersianAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) throw new RuntimeException("Unable to load default stopword set"); } } - - static Set loadDefaultStopWordSet() throws IOException { - InputStream stream = PersianAnalyzer.class - .getResourceAsStream(DEFAULT_STOPWORD_FILE); - try { - InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); - // make sure it is unmodifiable as we expose it in the outer class - return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader, - STOPWORDS_COMMENT)); - } finally { - stream.close(); - } - } } - private final Version matchVersion; - /** * Builds an analyzer with the default stop words: * {@link #DEFAULT_STOPWORD_FILE}. @@ -126,8 +104,7 @@ public final class PersianAnalyzer extends Analyzer { * a stopword set */ public PersianAnalyzer(Version matchVersion, Set stopwords){ - stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords)); - this.matchVersion = matchVersion; + super(matchVersion, stopwords); } /** @@ -156,18 +133,19 @@ public final class PersianAnalyzer extends Analyzer { } /** - * Creates a {@link TokenStream} which tokenizes all the text in the provided + * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided * {@link Reader}. * - * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer} + * @return {@link TokenStreamComponents} built from a {@link ArabicLetterTokenizer} * filtered with {@link LowerCaseFilter}, * {@link ArabicNormalizationFilter}, * {@link PersianNormalizationFilter} and Persian Stop words */ @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = new ArabicLetterTokenizer(reader); - result = new LowerCaseFilter(matchVersion, result); + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + final Tokenizer source = new ArabicLetterTokenizer(reader); + TokenStream result = new LowerCaseFilter(matchVersion, source); result = new ArabicNormalizationFilter(result); /* additional persian-specific normalization */ result = new PersianNormalizationFilter(result); @@ -175,44 +153,6 @@ public final class PersianAnalyzer extends Analyzer { * the order here is important: the stopword list is normalized with the * above! */ - result = new StopFilter(matchVersion, result, stoptable); - return result; - } - - private class SavedStreams { - Tokenizer source; - TokenStream result; - } - - /** - * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text - * in the provided {@link Reader}. 
-   * in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
-   *         filtered with {@link LowerCaseFilter},
-   *         {@link ArabicNormalizationFilter},
-   *         {@link PersianNormalizationFilter} and Persian Stop words
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new ArabicLetterTokenizer(reader);
-      streams.result = new LowerCaseFilter(matchVersion, streams.source);
-      streams.result = new ArabicNormalizationFilter(streams.result);
-      /* additional persian-specific normalization */
-      streams.result = new PersianNormalizationFilter(streams.result);
-      /*
-       * the order here is important: the stopword list is normalized with the
-       * above!
-       */
-      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+    return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
   }
 }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
index 885568ab284..cf029412335 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
@@ -20,7 +20,9 @@ package org.apache.lucene.analysis.fr;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
@@ -59,7 +61,7 @@ import java.util.Set;
 *
 * <p><b>NOTE</b>: This class uses the same {@link Version}
 * dependent settings as {@link StandardAnalyzer}.</p>
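 *
 * <p>A construction sketch (illustrative only; the exclusion word is
 * arbitrary):
 * <pre>
 *   FrenchAnalyzer analyzer = new FrenchAnalyzer(Version.LUCENE_31,
 *       FrenchAnalyzer.getDefaultStopSet(),
 *       new HashSet(Arrays.asList("justifications")));
 * </pre>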
 */
-public final class FrenchAnalyzer extends Analyzer {
+public final class FrenchAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Extended list of typical French stopwords.
@@ -91,18 +93,12 @@ public final class FrenchAnalyzer extends Analyzer {
     "été", "être", "ô"
   };
 
-  /**
-   * Contains the stopwords used with the {@link StopFilter}.
-   */
-  private final Set stoptable;
   /**
    * Contains words that should be indexed but not stemmed.
    */
   //TODO make this final in 3.0
   private Set excltable = Collections.emptySet();
 
-  private final Version matchVersion;
-
   /**
    * Returns an unmodifiable instance of the default stop-words set.
    * @return an unmodifiable instance of the default stop-words set.
@@ -148,9 +144,7 @@ public final class FrenchAnalyzer extends Analyzer {
   */
   public FrenchAnalyzer(Version matchVersion, Set stopwords,
      Set stemExclutionSet) {
-    this.matchVersion = matchVersion;
-    this.stoptable = CharArraySet.unmodifiableSet(CharArraySet
-        .copy(matchVersion, stopwords));
+    super(matchVersion, stopwords);
     this.excltable = CharArraySet.unmodifiableSet(CharArraySet
         .copy(matchVersion, stemExclutionSet));
   }
@@ -202,54 +196,22 @@ public final class FrenchAnalyzer extends Analyzer {
   }
 
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the provided
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
    * {@link Reader}.
    *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
+   * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
    *         filtered with {@link StandardFilter}, {@link StopFilter},
    *         {@link FrenchStemFilter} and {@link LowerCaseFilter}
   */
   @Override
-  public final TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(matchVersion, reader);
-    result = new StandardFilter(result);
-    result = new StopFilter(matchVersion, result, stoptable);
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(source);
+    result = new StopFilter(matchVersion, result, stopwords);
     result = new FrenchStemFilter(result, excltable);
     // Convert to lowercase after stemming!
-    result = new LowerCaseFilter(matchVersion, result);
-    return result;
-  }
-
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
-   * text in the provided {@link Reader}.
-   *
-   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link StopFilter},
-   *         {@link FrenchStemFilter} and {@link LowerCaseFilter}
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new StandardTokenizer(matchVersion, reader);
-      streams.result = new StandardFilter(streams.source);
-      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
-      streams.result = new FrenchStemFilter(streams.result, excltable);
-      // Convert to lowercase after stemming!
-      streams.result = new LowerCaseFilter(matchVersion, streams.result);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
+    return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
   }
 }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
index b3c190e7755..ff414b8b1b1 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.ru;
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.Map;
@@ -26,7 +25,9 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util.Version;
@@ -39,7 +40,7 @@ import org.apache.lucene.util.Version;
 * A default set of stopwords is used unless an alternative list is specified.
 * </p>
 */
-public final class RussianAnalyzer extends Analyzer
+public final class RussianAnalyzer extends StopwordAnalyzerBase
 {
   /**
    * List of typical Russian stopwords.
@@ -63,13 +64,6 @@ public final class RussianAnalyzer extends Analyzer
             Arrays.asList(RUSSIAN_STOP_WORDS), false));
   }
 
-  /**
-   * Contains the stopwords used with the StopFilter.
-   */
-  private final Set stopSet;
-
-  private final Version matchVersion;
-
   public RussianAnalyzer(Version matchVersion) {
     this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
@@ -91,8 +85,7 @@ public final class RussianAnalyzer extends Analyzer
    *          a stopword set
   */
   public RussianAnalyzer(Version matchVersion, Set stopwords){
-    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
-    this.matchVersion = matchVersion;
+    super(matchVersion, stopwords);
   }
 
   /**
@@ -106,52 +99,21 @@ public final class RussianAnalyzer extends Analyzer
   }
 
   /**
-   * Creates a {@link TokenStream} which tokenizes all the text in the
+   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
    * provided {@link Reader}.
    *
-   * @return  A {@link TokenStream} built from a
+   * @return {@link TokenStreamComponents} built from a
    *   {@link RussianLetterTokenizer} filtered with
    *   {@link LowerCaseFilter}, {@link StopFilter},
    *   and {@link RussianStemFilter}
   */
   @Override
-  public TokenStream tokenStream(String fieldName, Reader reader)
-  {
-    TokenStream result = new RussianLetterTokenizer(reader);
-    result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(matchVersion, result, stopSet);
-    result = new RussianStemFilter(result);
-    return result;
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    final Tokenizer source = new RussianLetterTokenizer(reader);
+    TokenStream result = new LowerCaseFilter(matchVersion, source);
+    result = new StopFilter(matchVersion, result, stopwords);
+    return new TokenStreamComponents(source, new RussianStemFilter(result));
+  }
-
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
-  };
-
-  /**
-   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
-   * in the provided {@link Reader}.
-   *
-   * @return  A {@link TokenStream} built from a
-   *   {@link RussianLetterTokenizer} filtered with
-   *   {@link LowerCaseFilter}, {@link StopFilter},
-   *   and {@link RussianStemFilter}
-   */
-  @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader)
-      throws IOException {
-    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new RussianLetterTokenizer(reader);
-      streams.result = new LowerCaseFilter(matchVersion, streams.source);
-      streams.result = new StopFilter(matchVersion, streams.result, stopSet);
-      streams.result = new RussianStemFilter(streams.result);
-      setPreviousTokenStream(streams);
-    } else {
-      streams.source.reset(reader);
-    }
-    return streams.result;
-  }
 }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
index 5ea5fd1d351..bace03ee7d6 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
@@ -16,16 +16,18 @@ package org.apache.lucene.analysis.th;
  * limitations under the License.
*/ -import java.io.IOException; import java.io.Reader; + +import org.apache.lucene.analysis.ReusableAnalyzerBase; +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc import org.apache.lucene.util.Version; /** @@ -35,41 +37,28 @@ import org.apache.lucene.util.Version; *

NOTE: This class uses the same {@link Version} * dependent settings as {@link StandardAnalyzer}.

*/ -public final class ThaiAnalyzer extends Analyzer { +public final class ThaiAnalyzer extends ReusableAnalyzerBase { private final Version matchVersion; public ThaiAnalyzer(Version matchVersion) { this.matchVersion = matchVersion; } - + + /** + * Creates {@link TokenStreamComponents} used to tokenize all the text in the + * provided {@link Reader}. + * + * @return {@link TokenStreamComponents} built from a + * {@link StandardTokenizer} filtered with {@link StandardFilter}, + * {@link ThaiWordFilter}, and {@link StopFilter} + */ @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream ts = new StandardTokenizer(matchVersion, reader); - ts = new StandardFilter(ts); - ts = new ThaiWordFilter(ts); - ts = new StopFilter(matchVersion, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET); - return ts; - } - - private class SavedStreams { - Tokenizer source; - TokenStream result; - }; - - @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { - SavedStreams streams = (SavedStreams) getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - streams.source = new StandardTokenizer(matchVersion, reader); - streams.result = new StandardFilter(streams.source); - streams.result = new ThaiWordFilter(streams.result); - streams.result = new StopFilter(matchVersion, streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET); - setPreviousTokenStream(streams); - } else { - streams.source.reset(reader); - streams.result.reset(); // reset the ThaiWordFilter's state - } - return streams.result; + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + final Tokenizer source = new StandardTokenizer(matchVersion, reader); + TokenStream result = new StandardFilter(source); + result = new ThaiWordFilter(result); + return new TokenStreamComponents(source, new StopFilter(matchVersion, + result, StopAnalyzer.ENGLISH_STOP_WORDS_SET)); } } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java index 4f7783c4025..73d7d767512 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java @@ -17,10 +17,10 @@ package org.apache.lucene.analysis.ar; * limitations under the License. */ -import java.io.StringReader; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.util.Version; @@ -78,7 +78,9 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase { * Test that custom stopwords work, and are not case-sensitive. 
*/ public void testCustomStopwords() throws Exception { - ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, new String[] { "the", "and", "a" }); + Set set = new HashSet(); + Collections.addAll(set, "the", "and", "a"); + ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, set); assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick", "brown", "fox" }); } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java index 36c57869a3f..51cc740aa53 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java @@ -17,10 +17,12 @@ package org.apache.lucene.analysis.br; * limitations under the License. */ +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; + import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.Version; /** diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java index 26a7fb2b8ec..34096b58c3c 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java @@ -17,11 +17,8 @@ package org.apache.lucene.analysis.fa; * limitations under the License. */ -import java.io.StringReader; - import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.Version; /** diff --git a/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java b/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java new file mode 100644 index 00000000000..8dc5120c6a6 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/ReusableAnalyzerBase.java @@ -0,0 +1,163 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis; + +import java.io.IOException; +import java.io.Reader; + +/** + * A convenience subclass of Analyzer that makes it easy to implement + * {@link TokenStream} reuse. + *

+ * ReusableAnalyzerBase is a simplification of Analyzer that supports easy reuse + * for the most common use cases. Analyzers such as + * {@link PerFieldAnalyzerWrapper} that behave differently depending upon the + * field name need to subclass Analyzer directly instead. + *
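In practice, reuse is transparent to callers: on a given thread, reusableTokenStream() hands back the same component chain each time and simply resets it to the new Reader. A minimal caller-side sketch (the example class, field name, and input strings are illustrative only, not part of this patch):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;

    public class ReuseExample {
      public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        TokenStream first = analyzer.reusableTokenStream("body", new StringReader("hello world"));
        // ... consume first, e.g. via incrementToken() ...
        TokenStream second = analyzer.reusableTokenStream("body", new StringReader("more text"));
        // second == first: the saved chain was reset to the new reader, not rebuilt
      }
    }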

+ *

+ * To prevent consistency problems, this class does not allow subclasses to + * extend {@link #reusableTokenStream(String, Reader)} or + * {@link #tokenStream(String, Reader)} directly. Instead, subclasses must + * implement {@link #createComponents(String, Reader)}. + *
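Under the new scheme a concrete subclass supplies its tokenizer/filter chain exactly once, in createComponents(). A sketch of the shape such an implementation takes (the analyzer name is hypothetical, and Version.LUCENE_31 merely stands in for whatever matchVersion the caller would supply):

    import java.io.Reader;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.ReusableAnalyzerBase;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;

    public final class LowerCaseWhitespaceAnalyzer extends ReusableAnalyzerBase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new WhitespaceTokenizer(reader);
        // the second argument is the sink returned by tokenStream()/reusableTokenStream()
        return new TokenStreamComponents(source, new LowerCaseFilter(Version.LUCENE_31, source));
      }
    }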

+ */ +public abstract class ReusableAnalyzerBase extends Analyzer { + + /** + * Creates a new {@link TokenStreamComponents} instance for this analyzer. + * + * @param fieldName + * the name of the field whose content is passed to the + * {@link TokenStreamComponents} sink as a reader + * @param aReader + * the reader passed to the {@link Tokenizer} constructor + * @return the {@link TokenStreamComponents} for this analyzer. + */ + protected abstract TokenStreamComponents createComponents(String fieldName, + Reader aReader); + + /** + * This method uses {@link #createComponents(String, Reader)} to obtain an + * instance of {@link TokenStreamComponents}. It returns the sink of the + * components and stores the components internally. Subsequent calls to this + * method will reuse the previously stored components if and only if the + * {@link TokenStreamComponents#reset(Reader)} method returned + * true. Otherwise a new instance of + * {@link TokenStreamComponents} is created. + * + * @param fieldName the name of the field the created TokenStream is used for + * @param reader the reader the stream's source reads from + */ + @Override + public final TokenStream reusableTokenStream(final String fieldName, + final Reader reader) throws IOException { + TokenStreamComponents streamChain = (TokenStreamComponents) + getPreviousTokenStream(); + if (streamChain == null || !streamChain.reset(reader)) { + streamChain = createComponents(fieldName, reader); + setPreviousTokenStream(streamChain); + } + return streamChain.getTokenStream(); + } + + /** + * This method uses {@link #createComponents(String, Reader)} to obtain an + * instance of {@link TokenStreamComponents} and returns the sink of the + * components. Each call to this method creates a new instance of + * {@link TokenStreamComponents}. Created {@link TokenStream} instances are + * never reused. + * + * @param fieldName the name of the field the created TokenStream is used for + * @param reader the reader the stream's source reads from + */ + @Override + public final TokenStream tokenStream(final String fieldName, + final Reader reader) { + return createComponents(fieldName, reader).getTokenStream(); + } + + /** + * This class encapsulates the outer components of a token stream. It provides + * access to the source ({@link Tokenizer}) and the outer end (sink), usually an + * instance of {@link TokenFilter} which also serves as the + * {@link TokenStream} returned by + * {@link Analyzer#tokenStream(String, Reader)} and + * {@link Analyzer#reusableTokenStream(String, Reader)}. + */ + public static class TokenStreamComponents { + final Tokenizer source; + final TokenStream sink; + + /** + * Creates a new {@link TokenStreamComponents} instance. + * + * @param source + * the analyzer's tokenizer + * @param result + * the analyzer's resulting token stream + */ + public TokenStreamComponents(final Tokenizer source, + final TokenStream result) { + this.source = source; + this.sink = result; + } + + /** + * Creates a new {@link TokenStreamComponents} instance. + * + * @param source + * the analyzer's tokenizer + */ + public TokenStreamComponents(final Tokenizer source) { + this.source = source; + this.sink = source; + } + + /** + * Resets the encapsulated components with the given reader. This method by + * default returns true indicating that the components have + * been reset successfully. Subclasses of {@link ReusableAnalyzerBase} might use + * their own {@link TokenStreamComponents} returning false if + * the components cannot be reset.
+ * + * @param reader + * a reader to reset the source component + * @return true if the components were reset, otherwise + * false + * @throws IOException + * if the component's reset method throws an {@link IOException} + */ + protected boolean reset(final Reader reader) throws IOException { + source.reset(reader); + if(sink != source) + sink.reset(); // only reset if the sink reference is different from source + return true; + } + + /** + * Returns the sink {@link TokenStream} + * + * @return the sink {@link TokenStream} + */ + protected TokenStream getTokenStream() { + return sink; + } + + } + +} diff --git a/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java b/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java index dcf577f0559..fc3b8a3f243 100644 --- a/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java @@ -18,25 +18,15 @@ package org.apache.lucene.analysis; */ import java.io.Reader; -import java.io.IOException; /** An {@link Analyzer} that filters {@link LetterTokenizer} * with {@link LowerCaseFilter} */ -public final class SimpleAnalyzer extends Analyzer { - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - return new LowerCaseTokenizer(reader); - } +public final class SimpleAnalyzer extends ReusableAnalyzerBase { @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { - Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream(); - if (tokenizer == null) { - tokenizer = new LowerCaseTokenizer(reader); - setPreviousTokenStream(tokenizer); - } else - tokenizer.reset(reader); - return tokenizer; + protected TokenStreamComponents createComponents(final String fieldName, + final Reader reader) { + return new TokenStreamComponents(new LowerCaseTokenizer(reader)); } } diff --git a/src/java/org/apache/lucene/analysis/StopAnalyzer.java b/src/java/org/apache/lucene/analysis/StopAnalyzer.java index ec45e332e87..9e76b89d773 100644 --- a/src/java/org/apache/lucene/analysis/StopAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/StopAnalyzer.java @@ -24,6 +24,7 @@ import java.util.Arrays; import java.util.Set; import java.util.List; +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link import org.apache.lucene.util.Version; /** Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}. @@ -38,9 +39,7 @@ import org.apache.lucene.util.Version; * */ -public final class StopAnalyzer extends Analyzer { - private final Set stopWords; - private final Version matchVersion; +public final class StopAnalyzer extends StopwordAnalyzerBase { /** An unmodifiable set containing some common English words that are not usually useful for searching.*/ @@ -65,16 +64,14 @@ public final class StopAnalyzer extends Analyzer { * @param matchVersion See above */ public StopAnalyzer(Version matchVersion) { - stopWords = ENGLISH_STOP_WORDS_SET; - this.matchVersion = matchVersion; + this(matchVersion, ENGLISH_STOP_WORDS_SET); } /** Builds an analyzer with the stop words from the given set. * @param matchVersion See above * @param stopWords Set of stop words */ public StopAnalyzer(Version matchVersion, Set stopWords) { - this.stopWords = stopWords; - this.matchVersion = matchVersion; + super(matchVersion, stopWords); } /** Builds an analyzer with the stop words from the given file. 
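Because reusableTokenStream() consults TokenStreamComponents.reset(Reader) before deciding whether to rebuild, an analyzer whose chain cannot be safely reset can opt out of reuse altogether by returning false from that method. A hypothetical sketch of such an override (not part of this patch):

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;

    // returned from createComponents() by a subclass that must never reuse its chain
    final class NonReusableComponents extends TokenStreamComponents {
      NonReusableComponents(Tokenizer source, TokenStream result) {
        super(source, result);
      }
      @Override
      protected boolean reset(Reader reader) throws IOException {
        return false; // forces reusableTokenStream() to call createComponents() again
      }
    }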
@@ -82,8 +79,7 @@ public final class StopAnalyzer extends Analyzer { * @param matchVersion See above * @param stopwordsFile File to load stop words from */ public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException { - stopWords = WordlistLoader.getWordSet(stopwordsFile); - this.matchVersion = matchVersion; + this(matchVersion, WordlistLoader.getWordSet(stopwordsFile)); } /** Builds an analyzer with the stop words from the given reader. @@ -91,34 +87,21 @@ public final class StopAnalyzer extends Analyzer { * @param matchVersion See above * @param stopwords Reader to load stop words from */ public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException { - stopWords = WordlistLoader.getWordSet(stopwords); - this.matchVersion = matchVersion; + this(matchVersion, WordlistLoader.getWordSet(stopwords)); } - /** Filters LowerCaseTokenizer with StopFilter. */ + /** + * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}. + * + * @return {@link TokenStreamComponents} built from a {@link LowerCaseTokenizer} filtered with + * {@link StopFilter} + */ @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - return new StopFilter(matchVersion, - new LowerCaseTokenizer(reader), stopWords); - } - - /** Filters LowerCaseTokenizer with StopFilter. */ - private class SavedStreams { - Tokenizer source; - TokenStream result; - }; - @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { - SavedStreams streams = (SavedStreams) getPreviousTokenStream(); - if (streams == null) { - streams = new SavedStreams(); - streams.source = new LowerCaseTokenizer(reader); - streams.result = new StopFilter(matchVersion, - streams.source, stopWords); - setPreviousTokenStream(streams); - } else - streams.source.reset(reader); - return streams.result; + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + final Tokenizer source = new LowerCaseTokenizer(reader); + return new TokenStreamComponents(source, new StopFilter(matchVersion, + source, stopwords)); } } diff --git a/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java b/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java new file mode 100644 index 00000000000..cdb9145db5c --- /dev/null +++ b/src/java/org/apache/lucene/analysis/StopwordAnalyzerBase.java @@ -0,0 +1,110 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis; + +import java.io.IOException; +import java.util.Set; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.ReusableAnalyzerBase; +import org.apache.lucene.analysis.WordlistLoader; +import org.apache.lucene.util.Version; + +/** + * Base class for Analyzers that need to make use of stopword sets. + * + */ +public abstract class StopwordAnalyzerBase extends ReusableAnalyzerBase { + + /** + * An immutable stopword set + */ + protected final CharArraySet stopwords; + + protected final Version matchVersion; + + /** + * Returns the analyzer's stopword set or an empty set if the analyzer has no + * stopwords + * + * @return the analyzer's stopword set or an empty set if the analyzer has no + * stopwords + */ + public Set getStopwordSet() { + return stopwords; + } + + /** + * Creates a new instance initialized with the given stopword set + * + * @param version + * the Lucene version for cross version compatibility + * @param stopwords + * the analyzer's stopword set + */ + protected StopwordAnalyzerBase(final Version version, final Set stopwords) { + /* + * no need to call + * setOverridesTokenStreamMethod(StopwordAnalyzerBase.class); here, both + * tokenStream methods are final in this class. + */ + matchVersion = version; + // analyzers should use char array set for stopwords! + this.stopwords = stopwords == null ? CharArraySet.EMPTY_SET : CharArraySet + .unmodifiableSet(CharArraySet.copy(version, stopwords)); + } + + /** + * Creates a new Analyzer with an empty stopword set + * + * @param version + * the Lucene version for cross version compatibility + */ + protected StopwordAnalyzerBase(final Version version) { + this(version, null); + } + + /** + * Creates a CharArraySet from a file resource associated with a class. (See + * {@link Class#getResourceAsStream(String)}). + * + * @param ignoreCase + * true if the set should ignore the case of the + * stopwords, otherwise false + * @param aClass + * a class that is associated with the given stopwordResource + * @param resource + * name of the resource file associated with the given class + * @param comment + * comment string to ignore in the stopword file + * @return a CharArraySet containing the distinct stopwords from the given + * file + * @throws IOException + * if loading the stopwords throws an {@link IOException} + */ + protected static CharArraySet loadStopwordSet(final boolean ignoreCase, + final Class aClass, final String resource, + final String comment) throws IOException { + final Set wordSet = WordlistLoader.getWordSet(aClass, resource, + comment); + final CharArraySet set = new CharArraySet(Version.LUCENE_31, wordSet.size(), ignoreCase); + set.addAll(wordSet); + return set; + } + +} diff --git a/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java b/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java index 2c2e4c0278b..edb6de1210c 100644 --- a/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java @@ -18,24 +18,14 @@ package org.apache.lucene.analysis; */ import java.io.Reader; -import java.io.IOException; /** An Analyzer that uses {@link WhitespaceTokenizer}. 
*/ -public final class WhitespaceAnalyzer extends Analyzer { - @Override - public TokenStream tokenStream(String fieldName, Reader reader) { - return new WhitespaceTokenizer(reader); - } +public final class WhitespaceAnalyzer extends ReusableAnalyzerBase { @Override - public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { - Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream(); - if (tokenizer == null) { - tokenizer = new WhitespaceTokenizer(reader); - setPreviousTokenStream(tokenizer); - } else - tokenizer.reset(reader); - return tokenizer; + protected TokenStreamComponents createComponents(final String fieldName, + final Reader reader) { + return new TokenStreamComponents(new WhitespaceTokenizer(reader)); } } diff --git a/src/java/org/apache/lucene/analysis/WordlistLoader.java b/src/java/org/apache/lucene/analysis/WordlistLoader.java index f071bb606aa..051a578288d 100644 --- a/src/java/org/apache/lucene/analysis/WordlistLoader.java +++ b/src/java/org/apache/lucene/analysis/WordlistLoader.java @@ -21,15 +21,69 @@ import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; +import java.io.InputStreamReader; import java.io.Reader; import java.util.HashMap; import java.util.HashSet; +import java.util.Set; /** * Loader for text files that represent a list of stopwords. */ public class WordlistLoader { - + + /** + * Loads a text file associated with a given class (See + * {@link Class#getResourceAsStream(String)}) and adds every line as an entry + * to a {@link Set} (omitting leading and trailing whitespace). Every line of + * the file should contain only one word. The words need to be in lower-case if + * you make use of an Analyzer which uses LowerCaseFilter (like + * StandardAnalyzer). + * + * @param aClass + * a class that is associated with the given stopwordResource + * @param stopwordResource + * name of the resource file associated with the given class + * @return a {@link Set} with the file's words + */ + public static Set getWordSet(Class aClass, String stopwordResource) + throws IOException { + final Reader reader = new BufferedReader(new InputStreamReader(aClass + .getResourceAsStream(stopwordResource), "UTF-8")); + try { + return getWordSet(reader); + } finally { + reader.close(); + } + } + + /** + * Loads a text file associated with a given class (See + * {@link Class#getResourceAsStream(String)}) and adds every line as an entry + * to a {@link Set} (omitting leading and trailing whitespace). Every line of + * the file should contain only one word. The words need to be in lower-case if + * you make use of an Analyzer which uses LowerCaseFilter (like + * StandardAnalyzer). + * + * @param aClass + * a class that is associated with the given stopwordResource + * @param stopwordResource + * name of the resource file associated with the given class + * @param comment + * the comment string to ignore + * @return a {@link Set} with the file's words + */ + public static Set getWordSet(Class aClass, + String stopwordResource, String comment) throws IOException { + final Reader reader = new BufferedReader(new InputStreamReader(aClass + .getResourceAsStream(stopwordResource), "UTF-8")); + try { + return getWordSet(reader, comment); + } finally { + reader.close(); + } + } + /** * Loads a text file and adds every line as an entry to a HashSet (omitting * leading and trailing whitespace). 
Every line of the file should contain only @@ -40,17 +94,15 @@ public class WordlistLoader { * @return A HashSet with the file's words */ public static HashSet getWordSet(File wordfile) throws IOException { - HashSet result = new HashSet(); FileReader reader = null; try { reader = new FileReader(wordfile); - result = getWordSet(reader); + return getWordSet(reader); } finally { if (reader != null) reader.close(); } - return result; } /** @@ -64,17 +116,15 @@ public class WordlistLoader { * @return A HashSet with the file's words */ public static HashSet getWordSet(File wordfile, String comment) throws IOException { - HashSet result = new HashSet(); FileReader reader = null; try { reader = new FileReader(wordfile); - result = getWordSet(reader, comment); + return getWordSet(reader, comment); } finally { if (reader != null) reader.close(); } - return result; } @@ -88,7 +138,7 @@ public class WordlistLoader { * @return A HashSet with the reader's words */ public static HashSet getWordSet(Reader reader) throws IOException { - HashSet result = new HashSet(); + final HashSet result = new HashSet(); BufferedReader br = null; try { if (reader instanceof BufferedReader) { @@ -119,7 +169,7 @@ public class WordlistLoader { * @return A HashSet with the reader's words */ public static HashSet getWordSet(Reader reader, String comment) throws IOException { - HashSet result = new HashSet(); + final HashSet result = new HashSet(); BufferedReader br = null; try { if (reader instanceof BufferedReader) { @@ -154,21 +204,18 @@ public class WordlistLoader { public static HashMap getStemDict(File wordstemfile) throws IOException { if (wordstemfile == null) throw new NullPointerException("wordstemfile may not be null"); - HashMap result = new HashMap(); + final HashMap result = new HashMap(); BufferedReader br = null; - FileReader fr = null; + try { - fr = new FileReader(wordstemfile); - br = new BufferedReader(fr); + br = new BufferedReader(new FileReader(wordstemfile)); String line; while ((line = br.readLine()) != null) { String[] wordstem = line.split("\t", 2); result.put(wordstem[0], wordstem[1]); } } finally { - if (fr != null) - fr.close(); - if (br != null) + if(br != null) br.close(); } return result; diff --git a/src/test/org/apache/lucene/index/wordliststopwords.txt b/src/test/org/apache/lucene/index/wordliststopwords.txt new file mode 100644 index 00000000000..7d3550734e7 --- /dev/null +++ b/src/test/org/apache/lucene/index/wordliststopwords.txt @@ -0,0 +1,5 @@ +#comment +ONE +two +#comment +three diff --git a/src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt b/src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt new file mode 100644 index 00000000000..59cb04ec465 --- /dev/null +++ b/src/test/org/apache/lucene/index/wordliststopwords_nocomment.txt @@ -0,0 +1,3 @@ +ONE +two +three
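The two new test resources exercise the classpath-based loading added to WordlistLoader: the comment-aware overload skips every line that begins with the comment string, while the plain overload does no comment filtering. An illustrative use (the example class is hypothetical and assumes the resource file sits in its package on the classpath):

    import java.io.IOException;
    import java.util.Set;
    import org.apache.lucene.analysis.WordlistLoader;

    public class StopwordLoadingExample {
      public static void main(String[] args) throws IOException {
        // reads the UTF-8 resource relative to this class, ignoring "#"-prefixed lines
        Set words = WordlistLoader.getWordSet(StopwordLoadingExample.class,
            "wordliststopwords.txt", "#");
        System.out.println(words); // [ONE, two, three] in some order
      }
    }

StopwordAnalyzerBase.loadStopwordSet() wraps the same call and copies the result into a CharArraySet, so analyzers converted in this patch can keep their default stopword sets in a lazily initialized holder class.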