diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 7686ca898ed..c4e104cf79e 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -150,6 +150,9 @@ Optimizations better performance, in ICUCollationKeyFilter. (Robert Muir via Mike McCandless) + 2. LUCENE-1794: Implement TokenStream reuse for contrib Analyzers, + and implement reset() for TokenStreams to support reuse. (Robert Muir) + Documentation (None) diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java index 8929cabfa05..a7c82720b22 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WordlistLoader; /** @@ -109,7 +110,7 @@ public final class ArabicAnalyzer extends Analyzer { /** * Creates a TokenStream which tokenizes all the text in the provided Reader. * - * @return A TokenStream build from an ArabicTokenizer filtered with + * @return A TokenStream built from an ArabicLetterTokenizer filtered with * StopFilter, LowerCaseFilter, ArabicNormalizationFilter and ArabicStemFilter. */ public final TokenStream tokenStream(String fieldName, Reader reader) { @@ -121,5 +122,35 @@ public final class ArabicAnalyzer extends Analyzer { return result; } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + }; + + /** + * Returns a (possibly reused) TokenStream which tokenizes all the text + * in the provided Reader. + * + * @return A TokenStream built from an ArabicLetterTokenizer filtered with + * StopFilter, LowerCaseFilter, ArabicNormalizationFilter and + * ArabicStemFilter.
+ */ + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new ArabicLetterTokenizer(reader); + streams.result = new StopFilter(streams.source, stoptable); + streams.result = new LowerCaseFilter(streams.result); + streams.result = new ArabicNormalizationFilter(streams.result); + streams.result = new ArabicStemFilter(streams.result); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java index 46372601d5c..39feeb8d558 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java @@ -28,6 +28,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; @@ -125,8 +126,9 @@ public final class BrazilianAnalyzer extends Analyzer { /** * Creates a TokenStream which tokenizes all the text in the provided Reader. * - * @return A TokenStream build from a StandardTokenizer filtered with - * StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter. + * @return A TokenStream built from a StandardTokenizer filtered with + * LowerCaseFilter, StandardFilter, StopFilter, and + * BrazilianStemFilter. */ public final TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer( reader ); @@ -136,5 +138,35 @@ public final class BrazilianAnalyzer extends Analyzer { result = new BrazilianStemFilter( result, excltable ); return result; } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + }; + + /** + * Returns a (possibly reused) TokenStream which tokenizes all the text + * in the provided Reader. + * + * @return A TokenStream built from a StandardTokenizer filtered with + * LowerCaseFilter, StandardFilter, StopFilter, and + * BrazilianStemFilter. 
+ */ + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new StandardTokenizer(reader); + streams.result = new LowerCaseFilter(streams.source); + streams.result = new StandardFilter(streams.result); + streams.result = new StopFilter(streams.result, stoptable); + streams.result = new BrazilianStemFilter(streams.result, excltable); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java index 1aba49e8258..ee39161d157 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java @@ -20,7 +20,9 @@ package org.apache.lucene.analysis.cjk; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import java.io.IOException; import java.io.Reader; import java.util.Set; @@ -84,4 +86,30 @@ public class CJKAnalyzer extends Analyzer { public final TokenStream tokenStream(String fieldName, Reader reader) { return new StopFilter(new CJKTokenizer(reader), stopTable); } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + }; + + /** + * get (possibly reused) token stream from input + * + * @param fieldName lucene field name + * @param reader input reader + * @return TokenStream + */ + public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + /* tokenStream() is final, no back compat issue */ + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new CJKTokenizer(reader); + streams.result = new StopFilter(streams.source, stopTable); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java index 715c1d9d80c..68fe8d54490 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java @@ -278,5 +278,17 @@ public final class CJKTokenizer extends Tokenizer { // set final offset final int finalOffset = offset; this.offsetAtt.setOffset(finalOffset, finalOffset); - } + } + + public void reset() throws IOException { + super.reset(); + offset = bufferIndex = dataLen = 0; + preIsTokened = false; + tokenType = WORD_TYPE; + } + + public void reset(Reader reader) throws IOException { + super.reset(reader); + reset(); + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java index 893ca3733a5..5470a4f215a 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java @@ -17,9 +17,11 @@ package 
org.apache.lucene.analysis.cn; * limitations under the License. */ +import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; /** * Title: ChineseAnalyzer @@ -47,4 +49,31 @@ public class ChineseAnalyzer extends Analyzer { result = new ChineseFilter(result); return result; } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + }; + + /** + * Returns a (possibly reused) TokenStream which tokenizes all the text in the + * provided Reader. + * + * @return A TokenStream built from a ChineseTokenizer filtered with + * ChineseFilter. + */ + public final TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + /* tokenStream() is final, no back compat issue */ + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new ChineseTokenizer(reader); + streams.result = new ChineseFilter(streams.source); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } } \ No newline at end of file diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java index 72a9394e542..1d38378094f 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java @@ -146,5 +146,15 @@ public final class ChineseTokenizer extends Tokenizer { // set final offset final int finalOffset = offset; this.offsetAtt.setOffset(finalOffset, finalOffset); - } + } + + public void reset() throws IOException { + super.reset(); + offset = bufferIndex = dataLen = 0; + } + + public void reset(Reader input) throws IOException { + super.reset(input); + reset(); + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java index 15cead80072..5bd501f165a 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java @@ -215,4 +215,9 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { } protected abstract void decomposeInternal(final Token token); + + public void reset() throws IOException { + super.reset(); + tokens.clear(); + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java index ee49784dafb..280033e45ec 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; @@
-126,7 +127,7 @@ public final class CzechAnalyzer extends Analyzer { /** * Creates a TokenStream which tokenizes all the text in the provided Reader. * - * @return A TokenStream build from a StandardTokenizer filtered with + * @return A TokenStream built from a StandardTokenizer filtered with * StandardFilter, LowerCaseFilter, and StopFilter */ public final TokenStream tokenStream( String fieldName, Reader reader ) { @@ -136,5 +137,33 @@ public final class CzechAnalyzer extends Analyzer { result = new StopFilter( result, stoptable ); return result; } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + }; + + /** + * Returns a (possibly reused) TokenStream which tokenizes all the text in + * the provided Reader. + * + * @return A TokenStream built from a StandardTokenizer filtered with + * StandardFilter, LowerCaseFilter, and StopFilter + */ + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new StandardTokenizer(reader); + streams.result = new StandardFilter(streams.source); + streams.result = new LowerCaseFilter(streams.result); + streams.result = new StopFilter(streams.result, stoptable); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java index 0ce9160d599..b7d5a20a0f0 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java @@ -29,6 +29,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; @@ -79,6 +80,7 @@ public class GermanAnalyzer extends Analyzer { */ public GermanAnalyzer() { stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS); + setOverridesTokenStreamMethod(GermanAnalyzer.class); } /** @@ -86,6 +88,7 @@ public class GermanAnalyzer extends Analyzer { */ public GermanAnalyzer(String[] stopwords) { stopSet = StopFilter.makeStopSet(stopwords); + setOverridesTokenStreamMethod(GermanAnalyzer.class); } /** @@ -93,6 +96,7 @@ public class GermanAnalyzer extends Analyzer { */ public GermanAnalyzer(Map stopwords) { stopSet = new HashSet(stopwords.keySet()); + setOverridesTokenStreamMethod(GermanAnalyzer.class); } /** @@ -100,6 +104,7 @@ public class GermanAnalyzer extends Analyzer { */ public GermanAnalyzer(File stopwords) throws IOException { stopSet = WordlistLoader.getWordSet(stopwords); + setOverridesTokenStreamMethod(GermanAnalyzer.class); } /** @@ -126,7 +131,7 @@ public class GermanAnalyzer extends Analyzer { /** * Creates a TokenStream which tokenizes all the text in the provided Reader. 
* - * @return A TokenStream build from a StandardTokenizer filtered with + * @return A TokenStream built from a StandardTokenizer filtered with * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter */ public TokenStream tokenStream(String fieldName, Reader reader) { @@ -137,4 +142,39 @@ public class GermanAnalyzer extends Analyzer { result = new GermanStemFilter(result, exclusionSet); return result; } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + }; + + /** + * Returns a (possibly reused) TokenStream which tokenizes all the text + * in the provided Reader. + * + * @return A TokenStream built from a StandardTokenizer filtered with + * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter + */ + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + if (overridesTokenStreamMethod) { + // LUCENE-1678: force fallback to tokenStream() if we + // have been subclassed and that subclass overrides + // tokenStream but not reusableTokenStream + return tokenStream(fieldName, reader); + } + + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new StandardTokenizer(reader); + streams.result = new StandardFilter(streams.source); + streams.result = new LowerCaseFilter(streams.result); + streams.result = new StopFilter(streams.result, stopSet); + streams.result = new GermanStemFilter(streams.result, exclusionSet); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java index 65c5f327818..a77f5f80224 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java @@ -20,8 +20,10 @@ package org.apache.lucene.analysis.el; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; +import java.io.IOException; import java.io.Reader; import java.util.HashSet; import java.util.Map; @@ -209,7 +211,7 @@ public final class GreekAnalyzer extends Analyzer /** * Creates a TokenStream which tokenizes all the text in the provided Reader. * - * @return A TokenStream build from a StandardTokenizer filtered with + * @return A TokenStream built from a StandardTokenizer filtered with * GreekLowerCaseFilter and StopFilter */ public TokenStream tokenStream(String fieldName, Reader reader) @@ -219,4 +221,31 @@ public final class GreekAnalyzer extends Analyzer result = new StopFilter(result, stopSet); return result; } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + }; + + /** + * Returns a (possibly reused) TokenStream which tokenizes all the text + * in the provided Reader. 
+ * + * @return A TokenStream built from a StandardTokenizer filtered with + * GreekLowerCaseFilter and StopFilter + */ + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new StandardTokenizer(reader); + streams.result = new GreekLowerCaseFilter(streams.source, charset); + streams.result = new StopFilter(streams.result, stopSet); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java index f45de6f13e0..fbafa35a14a 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WordlistLoader; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; @@ -128,7 +129,7 @@ public final class FrenchAnalyzer extends Analyzer { /** * Creates a TokenStream which tokenizes all the text in the provided Reader. * - * @return A TokenStream build from a StandardTokenizer filtered with + * @return A TokenStream built from a StandardTokenizer filtered with * StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter */ public final TokenStream tokenStream(String fieldName, Reader reader) { @@ -144,5 +145,35 @@ public final class FrenchAnalyzer extends Analyzer { result = new LowerCaseFilter(result); return result; } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + }; + + /** + * Returns a (possibly reused) TokenStream which tokenizes all the text + * in the provided Reader. + * + * @return A TokenStream built from a StandardTokenizer filtered with + * StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter + */ + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new StandardTokenizer(reader); + streams.result = new StandardFilter(streams.source); + streams.result = new StopFilter(streams.result, stoptable); + streams.result = new FrenchStemFilter(streams.result, excltable); + // Convert to lowercase after stemming! 
+ streams.result = new LowerCaseFilter(streams.result); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java index a00e1ce633a..b0fca28d82e 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java @@ -161,4 +161,9 @@ public class EdgeNGramTokenFilter extends TokenFilter { public final Token next() throws java.io.IOException { return super.next(); } + + public void reset() throws IOException { + super.reset(); + curTermBuffer = null; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java index 79dd188ae23..91579094bb5 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java @@ -170,4 +170,14 @@ public class EdgeNGramTokenizer extends Tokenizer { public final Token next() throws java.io.IOException { return super.next(); } + + public void reset(Reader input) throws IOException { + super.reset(input); + reset(); + } + + public void reset() throws IOException { + super.reset(); + started = false; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java index ebf9fc0bdc0..46db5ce3670 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java @@ -109,4 +109,9 @@ public class NGramTokenFilter extends TokenFilter { public final Token next() throws java.io.IOException { return super.next(); } + + public void reset() throws IOException { + super.reset(); + curTermBuffer = null; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java index 4a3c7a315eb..974bea64771 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java @@ -115,4 +115,15 @@ public class NGramTokenizer extends Tokenizer { public final Token next() throws java.io.IOException { return super.next(); } + + public void reset(Reader input) throws IOException { + super.reset(input); + reset(); + } + + public void reset() throws IOException { + super.reset(); + started = false; + pos = 0; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java index c69b4bd8556..dae58534962 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java @@ -20,6 +20,7 @@ package org.apache.lucene.analysis.nl; import org.apache.lucene.analysis.Analyzer; import 
org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; @@ -78,6 +79,7 @@ public class DutchAnalyzer extends Analyzer { * */ public DutchAnalyzer() { + setOverridesTokenStreamMethod(DutchAnalyzer.class); stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS); stemdict.put("fiets", "fiets"); //otherwise fiet stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet @@ -91,6 +93,7 @@ public class DutchAnalyzer extends Analyzer { * @param stopwords */ public DutchAnalyzer(String[] stopwords) { + setOverridesTokenStreamMethod(DutchAnalyzer.class); stoptable = StopFilter.makeStopSet(stopwords); } @@ -100,6 +103,7 @@ public class DutchAnalyzer extends Analyzer { * @param stopwords */ public DutchAnalyzer(HashSet stopwords) { + setOverridesTokenStreamMethod(DutchAnalyzer.class); stoptable = stopwords; } @@ -109,6 +113,7 @@ public class DutchAnalyzer extends Analyzer { * @param stopwords */ public DutchAnalyzer(File stopwords) { + setOverridesTokenStreamMethod(DutchAnalyzer.class); try { stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords); } catch (IOException e) { @@ -162,7 +167,7 @@ public class DutchAnalyzer extends Analyzer { /** * Creates a TokenStream which tokenizes all the text in the provided TextReader. * - * @return A TokenStream build from a StandardTokenizer filtered with StandardFilter, + * @return A TokenStream built from a StandardTokenizer filtered with StandardFilter, * StopFilter, DutchStemFilter */ public TokenStream tokenStream(String fieldName, Reader reader) { @@ -172,4 +177,39 @@ public class DutchAnalyzer extends Analyzer { result = new DutchStemFilter(result, excltable, stemdict); return result; } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + }; + + /** + * Returns a (possibly reused) TokenStream which tokenizes all the text + * in the provided Reader. 
+ * + * @return A TokenStream built from a StandardTokenizer filtered with + * StandardFilter, StopFilter, DutchStemFilter + */ + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + if (overridesTokenStreamMethod) { + // LUCENE-1678: force fallback to tokenStream() if we + // have been subclassed and that subclass overrides + // tokenStream but not reusableTokenStream + return tokenStream(fieldName, reader); + } + + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new StandardTokenizer(reader); + streams.result = new StandardFilter(streams.source); + streams.result = new StopFilter(streams.result, stoptable); + streams.result = new DutchStemFilter(streams.result, excltable); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java index b629cc5c0f5..1f3aea5a0c1 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java @@ -56,6 +56,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer { */ public QueryAutoStopWordAnalyzer(Analyzer delegate) { this.delegate = delegate; + setOverridesTokenStreamMethod(QueryAutoStopWordAnalyzer.class); } /** @@ -154,17 +155,97 @@ public class QueryAutoStopWordAnalyzer extends Analyzer { term = te.term(); } stopWordsPerField.put(fieldName, stopWords); + + /* if the stopwords for a field are changed, + * then saved streams for that field are erased. + */ + Map streamMap = (Map) getPreviousTokenStream(); + if (streamMap != null) + streamMap.remove(fieldName); + return stopWords.size(); } public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream result = delegate.tokenStream(fieldName, reader); + TokenStream result; + try { + result = delegate.reusableTokenStream(fieldName, reader); + } catch (IOException e) { + result = delegate.tokenStream(fieldName, reader); + } HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName); if (stopWords != null) { result = new StopFilter(result, stopWords); } return result; } + + private class SavedStreams { + /* the underlying stream */ + TokenStream wrapped; + + /* + * when there are no stopwords for the field, this refers to wrapped; + * if there are stopwords, it is a StopFilter around wrapped.
+ */ + TokenStream withStopFilter; + }; + + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + if (overridesTokenStreamMethod) { + // LUCENE-1678: force fallback to tokenStream() if we + // have been subclassed and that subclass overrides + // tokenStream but not reusableTokenStream + return tokenStream(fieldName, reader); + } + + /* map of SavedStreams for each field */ + Map streamMap = (Map) getPreviousTokenStream(); + if (streamMap == null) { + streamMap = new HashMap(); + setPreviousTokenStream(streamMap); + } + + SavedStreams streams = (SavedStreams) streamMap.get(fieldName); + if (streams == null) { + /* an entry for this field does not exist, create one */ + streams = new SavedStreams(); + streamMap.put(fieldName, streams); + streams.wrapped = delegate.reusableTokenStream(fieldName, reader); + + /* if there are any stopwords for the field, save the stopfilter */ + HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName); + if (stopWords != null) + streams.withStopFilter = new StopFilter(streams.wrapped, stopWords); + else + streams.withStopFilter = streams.wrapped; + + } else { + /* + * an entry for this field exists, verify the wrapped stream has not + * changed. if it has not, reuse it, otherwise wrap the new stream. + */ + TokenStream result = delegate.reusableTokenStream(fieldName, reader); + if (result == streams.wrapped) { + /* the wrapped analyzer reused the stream */ + streams.withStopFilter.reset(); + } else { + /* + * the wrapped analyzer did not. if there are any stopwords for the + * field, create a new StopFilter around the new stream + */ + streams.wrapped = result; + HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName); + if (stopWords != null) + streams.withStopFilter = new StopFilter(streams.wrapped, stopWords); + else + streams.withStopFilter = streams.wrapped; + } + } + + return streams.withStopFilter; + } /** * Provides information on which stop words have been identified for a field diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java index 6f735bd0f8a..d3bf8b56162 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis.ru; * limitations under the License. */ +import java.io.IOException; import java.io.Reader; import java.util.HashSet; import java.util.Map; @@ -25,6 +26,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; /** * Analyzer for Russian language. Supports an external list of stopwords (words that @@ -246,7 +248,7 @@ public final class RussianAnalyzer extends Analyzer /** * Creates a TokenStream which tokenizes all the text in the provided Reader. 
* - * @return A TokenStream build from a RussianLetterTokenizer filtered with + * @return A TokenStream built from a RussianLetterTokenizer filtered with * RussianLowerCaseFilter, StopFilter, and RussianStemFilter */ public TokenStream tokenStream(String fieldName, Reader reader) @@ -257,4 +259,32 @@ public final class RussianAnalyzer extends Analyzer result = new RussianStemFilter(result, charset); return result; } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + }; + + /** + * Returns a (possibly reused) TokenStream which tokenizes all the text + * in the provided Reader. + * + * @return A TokenStream built from a RussianLetterTokenizer filtered with + * RussianLowerCaseFilter, StopFilter, and RussianStemFilter + */ + public TokenStream reusableTokenStream(String fieldName, Reader reader) + throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new RussianLetterTokenizer(reader, charset); + streams.result = new RussianLowerCaseFilter(streams.source, charset); + streams.result = new StopFilter(streams.result, stopSet); + streams.result = new RussianStemFilter(streams.result, charset); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + } + return streams.result; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java index 4091ccf1ce7..358ae084c59 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis.shingle; * limitations under the License. 
*/ +import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.Analyzer; @@ -36,6 +37,7 @@ public class ShingleAnalyzerWrapper extends Analyzer { public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) { super(); this.defaultAnalyzer = defaultAnalyzer; + setOverridesTokenStreamMethod(ShingleAnalyzerWrapper.class); } public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) { @@ -49,6 +51,7 @@ public class ShingleAnalyzerWrapper extends Analyzer { public ShingleAnalyzerWrapper() { super(); this.defaultAnalyzer = new StandardAnalyzer(); + setOverridesTokenStreamMethod(ShingleAnalyzerWrapper.class); } public ShingleAnalyzerWrapper(int nGramSize) { @@ -90,10 +93,50 @@ public class ShingleAnalyzerWrapper extends Analyzer { } public TokenStream tokenStream(String fieldName, Reader reader) { - ShingleFilter filter = new ShingleFilter(defaultAnalyzer.tokenStream( - fieldName, reader)); + TokenStream wrapped; + try { + wrapped = defaultAnalyzer.reusableTokenStream(fieldName, reader); + } catch (IOException e) { + wrapped = defaultAnalyzer.tokenStream(fieldName, reader); + } + ShingleFilter filter = new ShingleFilter(wrapped); filter.setMaxShingleSize(maxShingleSize); filter.setOutputUnigrams(outputUnigrams); return filter; } + + private class SavedStreams { + TokenStream wrapped; + ShingleFilter shingle; + }; + + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + if (overridesTokenStreamMethod) { + // LUCENE-1678: force fallback to tokenStream() if we + // have been subclassed and that subclass overrides + // tokenStream but not reusableTokenStream + return tokenStream(fieldName, reader); + } + + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.wrapped = defaultAnalyzer.reusableTokenStream(fieldName, reader); + streams.shingle = new ShingleFilter(streams.wrapped); + setPreviousTokenStream(streams); + } else { + TokenStream result = defaultAnalyzer.reusableTokenStream(fieldName, reader); + if (result == streams.wrapped) { + /* the wrapped analyzer reused the stream */ + streams.shingle.reset(); + } else { + /* the wrapped analyzer did not, create a new shingle around the new one */ + streams.wrapped = result; + streams.shingle = new ShingleFilter(streams.wrapped); + } + } + streams.shingle.setMaxShingleSize(maxShingleSize); + streams.shingle.setOutputUnigrams(outputUnigrams); + return streams.shingle; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java index 2f67ca8bcb3..45fd2474634 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java @@ -336,4 +336,14 @@ public class ShingleFilter extends TokenFilter { public final Token next() throws java.io.IOException { return super.next(); } + + public void reset() throws IOException { + super.reset(); + nextToken = null; + shingleBufferPosition = 0; + shingleBuf.clear(); + numFillerTokensToInsert = 0; + currentToken = null; + hasCurrentToken = false; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java index 2d38867023b..db4ee21e69d 100644 --- 
a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java @@ -16,11 +16,13 @@ package org.apache.lucene.analysis.th; * limitations under the License. */ +import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; @@ -29,6 +31,11 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; * @version 0.2 */ public class ThaiAnalyzer extends Analyzer { + + public ThaiAnalyzer() { + setOverridesTokenStreamMethod(ThaiAnalyzer.class); + } + public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream ts = new StandardTokenizer(reader); ts = new StandardFilter(ts); @@ -36,4 +43,32 @@ public class ThaiAnalyzer extends Analyzer { ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET); return ts; } + + private class SavedStreams { + Tokenizer source; + TokenStream result; + }; + + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + if (overridesTokenStreamMethod) { + // LUCENE-1678: force fallback to tokenStream() if we + // have been subclassed and that subclass overrides + // tokenStream but not reusableTokenStream + return tokenStream(fieldName, reader); + } + + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + streams.source = new StandardTokenizer(reader); + streams.result = new StandardFilter(streams.source); + streams.result = new ThaiWordFilter(streams.result); + streams.result = new StopFilter(streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET); + setPreviousTokenStream(streams); + } else { + streams.source.reset(reader); + streams.result.reset(); // reset the ThaiWordFilter's state + } + return streams.result; + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java index 055a0b1674e..95baaa491d3 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java @@ -93,4 +93,9 @@ public class ThaiWordFilter extends TokenFilter { public final Token next() throws java.io.IOException { return super.next(); } + + public void reset() throws IOException { + super.reset(); + thaiState = null; + } } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java index 96283942cb4..a1a546a2130 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java @@ -57,6 +57,15 @@ public class TestArabicAnalyzer extends TestCase { assertAnalyzesTo(a, "ما ملكت أيمانكم", new String[] { "ملكت", "ايمانكم"}); assertAnalyzesTo(a, "الذين ملكت أيمانكم", new String[] { "ملكت", "ايمانكم" }); // stopwords } + + /** + * Simple tests to show things are getting reset correctly, etc. 
+ */ + public void testReusableTokenStream() throws Exception { + ArabicAnalyzer a = new ArabicAnalyzer(); + assertAnalyzesToReuse(a, "كبير", new String[] { "كبير" }); + assertAnalyzesToReuse(a, "كبيرة", new String[] { "كبير" }); // feminine marker + } /** * Non-arabic text gets treated in a similar way as SimpleAnalyzer. @@ -80,5 +89,18 @@ assertFalse(ts.incrementToken()); ts.close(); } + + private void assertAnalyzesToReuse(Analyzer a, String input, String[] output) + throws Exception { + TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input)); + TermAttribute termAtt = (TermAttribute) ts + .getAttribute(TermAttribute.class); + for (int i = 0; i < output.length; i++) { + assertTrue(ts.incrementToken()); + assertEquals(output[i], termAtt.term()); + } + + assertFalse(ts.incrementToken()); + } } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java index e1c9062425f..15527b75c52 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java @@ -117,6 +117,14 @@ public class TestBrazilianStemmer extends TestCase { check("quinzena", "quinzen"); check("quiosque", "quiosqu"); } + + public void testReusableTokenStream() throws Exception { + Analyzer a = new BrazilianAnalyzer(); + checkReuse(a, "boa", "boa"); + checkReuse(a, "boainain", "boainain"); + checkReuse(a, "boas", "boas"); + checkReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portuguese + } private void check(final String input, final String expected) throws IOException { @@ -128,5 +136,13 @@ public class TestBrazilianStemmer extends TestCase { assertFalse(stream.incrementToken()); stream.close(); } + + private void checkReuse(Analyzer analyzer, final String input, final String expected) throws IOException { + TokenStream stream = analyzer.reusableTokenStream("dummy", new StringReader(input)); + TermAttribute text = (TermAttribute) stream.getAttribute(TermAttribute.class); + assertTrue(stream.incrementToken()); + assertEquals(expected, text.term()); + assertFalse(stream.incrementToken()); + } } \ No newline at end of file diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java index c15ea48d964..fde7c3c33df 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java @@ -22,6 +22,8 @@ import java.io.StringReader; import junit.framework.TestCase; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; @@ -60,6 +62,21 @@ public class TestCJKTokenizer extends TestCase{ assertFalse(tokenizer.incrementToken()); } + public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException { + TokenStream ts = a.reusableTokenStream("dummy", new StringReader(str)); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); +
OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class); + for (int i = 0; i < out_tokens.length; i++) { + assertTrue(ts.incrementToken()); + assertEquals(termAtt.term(), out_tokens[i].termText); + assertEquals(offsetAtt.startOffset(), out_tokens[i].start); + assertEquals(offsetAtt.endOffset(), out_tokens[i].end); + assertEquals(typeAtt.type(), out_tokens[i].type); + } + assertFalse(ts.incrementToken()); + } + public void testJa1() throws IOException { String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341"; @@ -151,4 +168,38 @@ public class TestCJKTokenizer extends TestCase{ }; checkCJKToken(str, out_tokens); } + + public void testReusableTokenStream() throws Exception { + Analyzer analyzer = new CJKAnalyzer(); + String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053"; + + TestToken[] out_tokens = { + newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE), + newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE) + }; + checkCJKTokenReusable(analyzer, str, out_tokens); + + str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053"; + TestToken[] out_tokens2 = { + newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE), + newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE), + newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE) + }; + checkCJKTokenReusable(analyzer, str, out_tokens2); + } } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java index 32417f26ce6..07a88bb8636 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java @@ -22,7 +22,10 @@ import java.io.StringReader; import junit.framework.TestCase; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; public class TestChineseTokenizer extends TestCase @@ -42,4 +45,32 @@ public class TestChineseTokenizer extends TestCase correctEndOffset++; } } + + public void testReusableTokenStream() throws Exception + { + Analyzer a = new ChineseAnalyzer(); + assertAnalyzesToReuse(a, "中华人民共和国", + new String[] { "中", "华", "人", "民", "共", "和", "国" }, 
+ new int[] { 0, 1, 2, 3, 4, 5, 6 }, + new int[] { 1, 2, 3, 4, 5, 6, 7 }); + assertAnalyzesToReuse(a, "北京市", + new String[] { "北", "京", "市" }, + new int[] { 0, 1, 2 }, + new int[] { 1, 2, 3 }); + } + + private void assertAnalyzesToReuse(Analyzer a, String input, String[] output, + int startOffsets[], int endOffsets[]) + throws Exception { + TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input)); + TermAttribute termAtt = (TermAttribute) ts + .getAttribute(TermAttribute.class); + + for (int i = 0; i < output.length; i++) { + assertTrue(ts.incrementToken()); + assertEquals(output[i], termAtt.term()); + } + + assertFalse(ts.incrementToken()); + } } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java index 506dfa13a43..a865634323e 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java @@ -34,6 +34,7 @@ import java.util.zip.ZipInputStream; import junit.framework.TestCase; import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -151,6 +152,38 @@ public class TestCompoundWordTokenFilter extends TestCase { 14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0, 0, 0 }); } + + public void testReset() throws Exception { + String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz", + "Aufgabe", "Überwachung" }; + + Reader reader = getHyphenationReader("de_DR.xml"); + if (reader == null) { + // we gracefully die if we have no reader + return; + } + + HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter + .getHyphenationTree(reader); + + Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader( + "Rindfleischüberwachungsgesetz")); + HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( + wsTokenizer, hyphenator, dict, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); + + TermAttribute termAtt = (TermAttribute) tf.getAttribute(TermAttribute.class); + assertTrue(tf.incrementToken()); + assertEquals("Rindfleischüberwachungsgesetz", termAtt.term()); + assertTrue(tf.incrementToken()); + assertEquals("Rind", termAtt.term()); + wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz")); + tf.reset(); + assertTrue(tf.incrementToken()); + assertEquals("Rindfleischüberwachungsgesetz", termAtt.term()); + } private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset, int[] endOffset, int[] posIncr) throws Exception { diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java index 5460c95d5f7..6672abb6713 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java @@ -36,6 +36,12 @@ public class TestCzechAnalyzer extends TestCase { public void testStopWord() throws 
Exception { assertAnalyzesTo(new CzechAnalyzer(), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" }); } + + public void testReusableTokenStream() throws Exception { + Analyzer analyzer = new CzechAnalyzer(); + assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" }); + assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" }); + } private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); @@ -47,4 +53,14 @@ public class TestCzechAnalyzer extends TestCase { assertFalse(ts.incrementToken()); ts.close(); } + + private void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws Exception { + TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input)); + TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class); + for (int i=0; i<output.length; i++) { + assertTrue(ts.incrementToken()); + assertEquals(output[i], text.term()); + } + assertFalse(ts.incrementToken()); + } } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/TestQueryAutoStopWordAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/TestQueryAutoStopWordAnalyzer.java [...] assertTrue(h.length() > 0); } - - + + /** + * subclass that acts just like whitespace analyzer for testing + */ + private class QueryAutoStopWordSubclassAnalyzer extends QueryAutoStopWordAnalyzer { + public QueryAutoStopWordSubclassAnalyzer() { + super(new WhitespaceAnalyzer()); + } + + public TokenStream tokenStream(String fieldName, Reader reader) { + return new WhitespaceTokenizer(reader); + } + } + + public void testLUCENE1678BWComp() throws Exception { + QueryAutoStopWordAnalyzer a = new QueryAutoStopWordSubclassAnalyzer(); + a.addStopWords(reader, "repetitiveField", 10); + Hits h = search(a, "repetitiveField:boring"); + assertFalse(h.length() == 0); + } } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java index 233bae1b9c1..96c7dc0b719 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java @@ -26,6 +26,7 @@ import java.io.StringReader; import junit.framework.TestCase; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; @@ -187,5 +188,22 @@ public class TestRussianAnalyzer extends TestCase fail("unexpected IOException"); } } + + public void testReusableTokenStream() throws Exception { + Analyzer a = new RussianAnalyzer(); + assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще", + new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" }); + assertAnalyzesToReuse(a, "Но знание это хранилось в тайне", + new String[] { "знан", "хран", "тайн" }); + } + private void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws Exception { + TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input)); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); + for (int i=0; i<output.length; i++) { + assertTrue(ts.incrementToken()); + assertEquals(output[i], termAtt.term()); + } + assertFalse(ts.incrementToken()); + } }
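For reference, the reuse contract this patch implements across the contrib analyzers: Analyzer.reusableTokenStream() caches the tokenizer/filter chain per thread through getPreviousTokenStream()/setPreviousTokenStream(), and subsequent calls rewind the cached Tokenizer with reset(Reader) instead of building a new chain; the reset() overrides added to the tokenizers and filters clear per-document state so the rewound chain starts clean. Below is a minimal consumer-side sketch against the 2.9-era API used throughout this patch (the ReuseDemo class and the "f" field name are illustrative, not part of the change):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cz.CzechAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ReuseDemo {
      public static void main(String[] args) throws IOException {
        Analyzer analyzer = new CzechAnalyzer();

        // First call: getPreviousTokenStream() returns null, so the analyzer
        // builds StandardTokenizer -> StandardFilter -> LowerCaseFilter ->
        // StopFilter and caches the pair via setPreviousTokenStream().
        TokenStream first = drain(analyzer, "Pokud mluvime o volnem");

        // Second call: the cached SavedStreams is found, source.reset(reader)
        // repoints the tokenizer at the new Reader, and the same filter chain
        // is handed back with no new allocations.
        TokenStream second = drain(analyzer, "Česká Republika");

        System.out.println(first == second); // true once this patch is applied
      }

      private static TokenStream drain(Analyzer a, String text) throws IOException {
        TokenStream ts = a.reusableTokenStream("f", new StringReader(text));
        TermAttribute term = (TermAttribute) ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term());
        }
        return ts;
      }
    }

Without the patch, Analyzer's default reusableTokenStream() simply delegates to tokenStream(), so the two calls would build two separate chains; that per-document allocation is exactly what the SavedStreams caching above avoids.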