mirror of https://github.com/apache/lucene.git
LUCENE-1794: Implement TokenStream reuse for contrib Analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@804680 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3dbcc8f1a5
commit
d2af6ef0bd
|
@ -150,6 +150,9 @@ Optimizations
|
|||
better performance, in ICUCollationKeyFilter. (Robert Muir via
|
||||
Mike McCandless)
|
||||
|
||||
2. LUCENE-1794: Implement TokenStream reuse for contrib Analyzers,
|
||||
and implement reset() for TokenStreams to support reuse. (Robert Muir)
|
||||
|
||||
Documentation
|
||||
|
||||
(None)
|
||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
|
||||
/**
|
||||
|
@ -109,7 +110,7 @@ public final class ArabicAnalyzer extends Analyzer {
|
|||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream build from an ArabicTokenizer filtered with
|
||||
* @return A TokenStream built from an ArabicTokenizer filtered with
|
||||
* StopFilter, LowerCaseFilter, ArabicNormalizationFilter and ArabicStemFilter.
|
||||
*/
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
|
@ -121,5 +122,35 @@ public final class ArabicAnalyzer extends Analyzer {
|
|||
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) TokenStream which tokenizes all the text
|
||||
* in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream built from an ArabicTokenizer filtered with
|
||||
* StopFilter, LowerCaseFilter, ArabicNormalizationFilter and
|
||||
* ArabicStemFilter.
|
||||
*/
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new ArabicLetterTokenizer(reader);
|
||||
streams.result = new StopFilter(streams.source, stoptable);
|
||||
streams.result = new LowerCaseFilter(streams.result);
|
||||
streams.result = new ArabicNormalizationFilter(streams.result);
|
||||
streams.result = new ArabicStemFilter(streams.result);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
@ -125,8 +126,9 @@ public final class BrazilianAnalyzer extends Analyzer {
|
|||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream build from a StandardTokenizer filtered with
|
||||
* StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with
|
||||
* LowerCaseFilter, StandardFilter, StopFilter, and
|
||||
* BrazilianStemFilter.
|
||||
*/
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer( reader );
|
||||
|
@ -136,5 +138,35 @@ public final class BrazilianAnalyzer extends Analyzer {
|
|||
result = new BrazilianStemFilter( result, excltable );
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) TokenStream which tokenizes all the text
|
||||
* in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with
|
||||
* LowerCaseFilter, StandardFilter, StopFilter, and
|
||||
* BrazilianStemFilter.
|
||||
*/
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(reader);
|
||||
streams.result = new LowerCaseFilter(streams.source);
|
||||
streams.result = new StandardFilter(streams.result);
|
||||
streams.result = new StopFilter(streams.result, stoptable);
|
||||
streams.result = new BrazilianStemFilter(streams.result, excltable);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -20,7 +20,9 @@ package org.apache.lucene.analysis.cjk;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -84,4 +86,30 @@ public class CJKAnalyzer extends Analyzer {
|
|||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new StopFilter(new CJKTokenizer(reader), stopTable);
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* get (possibly reused) token stream from input
|
||||
*
|
||||
* @param fieldName lucene field name
|
||||
* @param reader input reader
|
||||
* @return TokenStream
|
||||
*/
|
||||
public final TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
/* tokenStream() is final, no back compat issue */
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new CJKTokenizer(reader);
|
||||
streams.result = new StopFilter(streams.source, stopTable);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -278,5 +278,17 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
// set final offset
|
||||
final int finalOffset = offset;
|
||||
this.offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
}
|
||||
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
offset = bufferIndex = dataLen = 0;
|
||||
preIsTokened = false;
|
||||
tokenType = WORD_TYPE;
|
||||
}
|
||||
|
||||
public void reset(Reader reader) throws IOException {
|
||||
super.reset(reader);
|
||||
reset();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,9 +17,11 @@ package org.apache.lucene.analysis.cn;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/**
|
||||
* Title: ChineseAnalyzer
|
||||
|
@ -47,4 +49,31 @@ public class ChineseAnalyzer extends Analyzer {
|
|||
result = new ChineseFilter(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) TokenStream which tokenizes all the text in the
|
||||
* provided Reader.
|
||||
*
|
||||
* @return A TokenStream build from a ChineseTokenizer filtered with
|
||||
* ChineseFilter.
|
||||
*/
|
||||
public final TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
/* tokenStream() is final, no back compat issue */
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new ChineseTokenizer(reader);
|
||||
streams.result = new ChineseFilter(streams.source);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
|
@ -146,5 +146,15 @@ public final class ChineseTokenizer extends Tokenizer {
|
|||
// set final offset
|
||||
final int finalOffset = offset;
|
||||
this.offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
}
|
||||
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
offset = bufferIndex = dataLen = 0;
|
||||
}
|
||||
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
reset();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -215,4 +215,9 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
|
|||
}
|
||||
|
||||
protected abstract void decomposeInternal(final Token token);
|
||||
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
tokens.clear();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
@ -126,7 +127,7 @@ public final class CzechAnalyzer extends Analyzer {
|
|||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream build from a StandardTokenizer filtered with
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with
|
||||
* StandardFilter, LowerCaseFilter, and StopFilter
|
||||
*/
|
||||
public final TokenStream tokenStream( String fieldName, Reader reader ) {
|
||||
|
@ -136,5 +137,33 @@ public final class CzechAnalyzer extends Analyzer {
|
|||
result = new StopFilter( result, stoptable );
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) TokenStream which tokenizes all the text in
|
||||
* the provided Reader.
|
||||
*
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with
|
||||
* StandardFilter, LowerCaseFilter, and StopFilter
|
||||
*/
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new LowerCaseFilter(streams.result);
|
||||
streams.result = new StopFilter(streams.result, stoptable);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -29,6 +29,7 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
@ -79,6 +80,7 @@ public class GermanAnalyzer extends Analyzer {
|
|||
*/
|
||||
public GermanAnalyzer() {
|
||||
stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
|
||||
setOverridesTokenStreamMethod(GermanAnalyzer.class);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -86,6 +88,7 @@ public class GermanAnalyzer extends Analyzer {
|
|||
*/
|
||||
public GermanAnalyzer(String[] stopwords) {
|
||||
stopSet = StopFilter.makeStopSet(stopwords);
|
||||
setOverridesTokenStreamMethod(GermanAnalyzer.class);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -93,6 +96,7 @@ public class GermanAnalyzer extends Analyzer {
|
|||
*/
|
||||
public GermanAnalyzer(Map stopwords) {
|
||||
stopSet = new HashSet(stopwords.keySet());
|
||||
setOverridesTokenStreamMethod(GermanAnalyzer.class);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -100,6 +104,7 @@ public class GermanAnalyzer extends Analyzer {
|
|||
*/
|
||||
public GermanAnalyzer(File stopwords) throws IOException {
|
||||
stopSet = WordlistLoader.getWordSet(stopwords);
|
||||
setOverridesTokenStreamMethod(GermanAnalyzer.class);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -126,7 +131,7 @@ public class GermanAnalyzer extends Analyzer {
|
|||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream build from a StandardTokenizer filtered with
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with
|
||||
* StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
|
||||
*/
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
|
@ -137,4 +142,39 @@ public class GermanAnalyzer extends Analyzer {
|
|||
result = new GermanStemFilter(result, exclusionSet);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) TokenStream which tokenizes all the text
|
||||
* in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with
|
||||
* StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
|
||||
*/
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
if (overridesTokenStreamMethod) {
|
||||
// LUCENE-1678: force fallback to tokenStream() if we
|
||||
// have been subclassed and that subclass overrides
|
||||
// tokenStream but not reusableTokenStream
|
||||
return tokenStream(fieldName, reader);
|
||||
}
|
||||
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new LowerCaseFilter(streams.result);
|
||||
streams.result = new StopFilter(streams.result, stopSet);
|
||||
streams.result = new GermanStemFilter(streams.result, exclusionSet);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,8 +20,10 @@ package org.apache.lucene.analysis.el;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
@ -209,7 +211,7 @@ public final class GreekAnalyzer extends Analyzer
|
|||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream build from a StandardTokenizer filtered with
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with
|
||||
* GreekLowerCaseFilter and StopFilter
|
||||
*/
|
||||
public TokenStream tokenStream(String fieldName, Reader reader)
|
||||
|
@ -219,4 +221,31 @@ public final class GreekAnalyzer extends Analyzer
|
|||
result = new StopFilter(result, stopSet);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) TokenStream which tokenizes all the text
|
||||
* in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with
|
||||
* GreekLowerCaseFilter and StopFilter
|
||||
*/
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(reader);
|
||||
streams.result = new GreekLowerCaseFilter(streams.source, charset);
|
||||
streams.result = new StopFilter(streams.result, stopSet);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
@ -128,7 +129,7 @@ public final class FrenchAnalyzer extends Analyzer {
|
|||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream build from a StandardTokenizer filtered with
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with
|
||||
* StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
|
||||
*/
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
|
@ -144,5 +145,35 @@ public final class FrenchAnalyzer extends Analyzer {
|
|||
result = new LowerCaseFilter(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) TokenStream which tokenizes all the text
|
||||
* in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with
|
||||
* StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
|
||||
*/
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new StopFilter(streams.result, stoptable);
|
||||
streams.result = new FrenchStemFilter(streams.result, excltable);
|
||||
// Convert to lowercase after stemming!
|
||||
streams.result = new LowerCaseFilter(streams.result);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -161,4 +161,9 @@ public class EdgeNGramTokenFilter extends TokenFilter {
|
|||
public final Token next() throws java.io.IOException {
|
||||
return super.next();
|
||||
}
|
||||
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
curTermBuffer = null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -170,4 +170,14 @@ public class EdgeNGramTokenizer extends Tokenizer {
|
|||
public final Token next() throws java.io.IOException {
|
||||
return super.next();
|
||||
}
|
||||
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
reset();
|
||||
}
|
||||
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
started = false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -109,4 +109,9 @@ public class NGramTokenFilter extends TokenFilter {
|
|||
public final Token next() throws java.io.IOException {
|
||||
return super.next();
|
||||
}
|
||||
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
curTermBuffer = null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -115,4 +115,15 @@ public class NGramTokenizer extends Tokenizer {
|
|||
public final Token next() throws java.io.IOException {
|
||||
return super.next();
|
||||
}
|
||||
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
reset();
|
||||
}
|
||||
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
started = false;
|
||||
pos = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.nl;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
|
@ -78,6 +79,7 @@ public class DutchAnalyzer extends Analyzer {
|
|||
*
|
||||
*/
|
||||
public DutchAnalyzer() {
|
||||
setOverridesTokenStreamMethod(DutchAnalyzer.class);
|
||||
stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
|
||||
stemdict.put("fiets", "fiets"); //otherwise fiet
|
||||
stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
|
||||
|
@ -91,6 +93,7 @@ public class DutchAnalyzer extends Analyzer {
|
|||
* @param stopwords
|
||||
*/
|
||||
public DutchAnalyzer(String[] stopwords) {
|
||||
setOverridesTokenStreamMethod(DutchAnalyzer.class);
|
||||
stoptable = StopFilter.makeStopSet(stopwords);
|
||||
}
|
||||
|
||||
|
@ -100,6 +103,7 @@ public class DutchAnalyzer extends Analyzer {
|
|||
* @param stopwords
|
||||
*/
|
||||
public DutchAnalyzer(HashSet stopwords) {
|
||||
setOverridesTokenStreamMethod(DutchAnalyzer.class);
|
||||
stoptable = stopwords;
|
||||
}
|
||||
|
||||
|
@ -109,6 +113,7 @@ public class DutchAnalyzer extends Analyzer {
|
|||
* @param stopwords
|
||||
*/
|
||||
public DutchAnalyzer(File stopwords) {
|
||||
setOverridesTokenStreamMethod(DutchAnalyzer.class);
|
||||
try {
|
||||
stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
|
||||
} catch (IOException e) {
|
||||
|
@ -162,7 +167,7 @@ public class DutchAnalyzer extends Analyzer {
|
|||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided TextReader.
|
||||
*
|
||||
* @return A TokenStream build from a StandardTokenizer filtered with StandardFilter,
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with StandardFilter,
|
||||
* StopFilter, DutchStemFilter
|
||||
*/
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
|
@ -172,4 +177,39 @@ public class DutchAnalyzer extends Analyzer {
|
|||
result = new DutchStemFilter(result, excltable, stemdict);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) TokenStream which tokenizes all the text
|
||||
* in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream built from a StandardTokenizer filtered with
|
||||
* StandardFilter, StopFilter, DutchStemFilter
|
||||
*/
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
if (overridesTokenStreamMethod) {
|
||||
// LUCENE-1678: force fallback to tokenStream() if we
|
||||
// have been subclassed and that subclass overrides
|
||||
// tokenStream but not reusableTokenStream
|
||||
return tokenStream(fieldName, reader);
|
||||
}
|
||||
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new StopFilter(streams.result, stoptable);
|
||||
streams.result = new DutchStemFilter(streams.result, excltable);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -56,6 +56,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
|
|||
*/
|
||||
public QueryAutoStopWordAnalyzer(Analyzer delegate) {
|
||||
this.delegate = delegate;
|
||||
setOverridesTokenStreamMethod(QueryAutoStopWordAnalyzer.class);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -154,17 +155,97 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
|
|||
term = te.term();
|
||||
}
|
||||
stopWordsPerField.put(fieldName, stopWords);
|
||||
|
||||
/* if the stopwords for a field are changed,
|
||||
* then saved streams for that field are erased.
|
||||
*/
|
||||
Map streamMap = (Map) getPreviousTokenStream();
|
||||
if (streamMap != null)
|
||||
streamMap.remove(fieldName);
|
||||
|
||||
return stopWords.size();
|
||||
}
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = delegate.tokenStream(fieldName, reader);
|
||||
TokenStream result;
|
||||
try {
|
||||
result = delegate.reusableTokenStream(fieldName, reader);
|
||||
} catch (IOException e) {
|
||||
result = delegate.tokenStream(fieldName, reader);
|
||||
}
|
||||
HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName);
|
||||
if (stopWords != null) {
|
||||
result = new StopFilter(result, stopWords);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
/* the underlying stream */
|
||||
TokenStream wrapped;
|
||||
|
||||
/*
|
||||
* when there are no stopwords for the field, refers to wrapped.
|
||||
* if there stopwords, it is a StopFilter around wrapped.
|
||||
*/
|
||||
TokenStream withStopFilter;
|
||||
};
|
||||
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
if (overridesTokenStreamMethod) {
|
||||
// LUCENE-1678: force fallback to tokenStream() if we
|
||||
// have been subclassed and that subclass overrides
|
||||
// tokenStream but not reusableTokenStream
|
||||
return tokenStream(fieldName, reader);
|
||||
}
|
||||
|
||||
/* map of SavedStreams for each field */
|
||||
Map streamMap = (Map) getPreviousTokenStream();
|
||||
if (streamMap == null) {
|
||||
streamMap = new HashMap();
|
||||
setPreviousTokenStream(streamMap);
|
||||
}
|
||||
|
||||
SavedStreams streams = (SavedStreams) streamMap.get(fieldName);
|
||||
if (streams == null) {
|
||||
/* an entry for this field does not exist, create one */
|
||||
streams = new SavedStreams();
|
||||
streamMap.put(fieldName, streams);
|
||||
streams.wrapped = delegate.reusableTokenStream(fieldName, reader);
|
||||
|
||||
/* if there are any stopwords for the field, save the stopfilter */
|
||||
HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName);
|
||||
if (stopWords != null)
|
||||
streams.withStopFilter = new StopFilter(streams.wrapped, stopWords);
|
||||
else
|
||||
streams.withStopFilter = streams.wrapped;
|
||||
|
||||
} else {
|
||||
/*
|
||||
* an entry for this field exists, verify the wrapped stream has not
|
||||
* changed. if it has not, reuse it, otherwise wrap the new stream.
|
||||
*/
|
||||
TokenStream result = delegate.reusableTokenStream(fieldName, reader);
|
||||
if (result == streams.wrapped) {
|
||||
/* the wrapped analyzer reused the stream */
|
||||
streams.withStopFilter.reset();
|
||||
} else {
|
||||
/*
|
||||
* the wrapped analyzer did not. if there are any stopwords for the
|
||||
* field, create a new StopFilter around the new stream
|
||||
*/
|
||||
streams.wrapped = result;
|
||||
HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName);
|
||||
if (stopWords != null)
|
||||
streams.withStopFilter = new StopFilter(streams.wrapped, stopWords);
|
||||
else
|
||||
streams.withStopFilter = streams.wrapped;
|
||||
}
|
||||
}
|
||||
|
||||
return streams.withStopFilter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides information on which stop words have been identified for a field
|
||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.analysis.ru;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
@ -25,6 +26,7 @@ import java.util.Set;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/**
|
||||
* Analyzer for Russian language. Supports an external list of stopwords (words that
|
||||
|
@ -246,7 +248,7 @@ public final class RussianAnalyzer extends Analyzer
|
|||
/**
|
||||
* Creates a TokenStream which tokenizes all the text in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream build from a RussianLetterTokenizer filtered with
|
||||
* @return A TokenStream built from a RussianLetterTokenizer filtered with
|
||||
* RussianLowerCaseFilter, StopFilter, and RussianStemFilter
|
||||
*/
|
||||
public TokenStream tokenStream(String fieldName, Reader reader)
|
||||
|
@ -257,4 +259,32 @@ public final class RussianAnalyzer extends Analyzer
|
|||
result = new RussianStemFilter(result, charset);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a (possibly reused) TokenStream which tokenizes all the text
|
||||
* in the provided Reader.
|
||||
*
|
||||
* @return A TokenStream built from a RussianLetterTokenizer filtered with
|
||||
* RussianLowerCaseFilter, StopFilter, and RussianStemFilter
|
||||
*/
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new RussianLetterTokenizer(reader, charset);
|
||||
streams.result = new RussianLowerCaseFilter(streams.source, charset);
|
||||
streams.result = new StopFilter(streams.result, stopSet);
|
||||
streams.result = new RussianStemFilter(streams.result, charset);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.analysis.shingle;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
@ -36,6 +37,7 @@ public class ShingleAnalyzerWrapper extends Analyzer {
|
|||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
|
||||
super();
|
||||
this.defaultAnalyzer = defaultAnalyzer;
|
||||
setOverridesTokenStreamMethod(ShingleAnalyzerWrapper.class);
|
||||
}
|
||||
|
||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
|
||||
|
@ -49,6 +51,7 @@ public class ShingleAnalyzerWrapper extends Analyzer {
|
|||
public ShingleAnalyzerWrapper() {
|
||||
super();
|
||||
this.defaultAnalyzer = new StandardAnalyzer();
|
||||
setOverridesTokenStreamMethod(ShingleAnalyzerWrapper.class);
|
||||
}
|
||||
|
||||
public ShingleAnalyzerWrapper(int nGramSize) {
|
||||
|
@ -90,10 +93,50 @@ public class ShingleAnalyzerWrapper extends Analyzer {
|
|||
}
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
ShingleFilter filter = new ShingleFilter(defaultAnalyzer.tokenStream(
|
||||
fieldName, reader));
|
||||
TokenStream wrapped;
|
||||
try {
|
||||
wrapped = defaultAnalyzer.reusableTokenStream(fieldName, reader);
|
||||
} catch (IOException e) {
|
||||
wrapped = defaultAnalyzer.tokenStream(fieldName, reader);
|
||||
}
|
||||
ShingleFilter filter = new ShingleFilter(wrapped);
|
||||
filter.setMaxShingleSize(maxShingleSize);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
return filter;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
TokenStream wrapped;
|
||||
ShingleFilter shingle;
|
||||
};
|
||||
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
if (overridesTokenStreamMethod) {
|
||||
// LUCENE-1678: force fallback to tokenStream() if we
|
||||
// have been subclassed and that subclass overrides
|
||||
// tokenStream but not reusableTokenStream
|
||||
return tokenStream(fieldName, reader);
|
||||
}
|
||||
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.wrapped = defaultAnalyzer.reusableTokenStream(fieldName, reader);
|
||||
streams.shingle = new ShingleFilter(streams.wrapped);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
TokenStream result = defaultAnalyzer.reusableTokenStream(fieldName, reader);
|
||||
if (result == streams.wrapped) {
|
||||
/* the wrapped analyzer reused the stream */
|
||||
streams.shingle.reset();
|
||||
} else {
|
||||
/* the wrapped analyzer did not, create a new shingle around the new one */
|
||||
streams.wrapped = result;
|
||||
streams.shingle = new ShingleFilter(streams.wrapped);
|
||||
}
|
||||
}
|
||||
streams.shingle.setMaxShingleSize(maxShingleSize);
|
||||
streams.shingle.setOutputUnigrams(outputUnigrams);
|
||||
return streams.shingle;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -336,4 +336,14 @@ public class ShingleFilter extends TokenFilter {
|
|||
public final Token next() throws java.io.IOException {
|
||||
return super.next();
|
||||
}
|
||||
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
nextToken = null;
|
||||
shingleBufferPosition = 0;
|
||||
shingleBuf.clear();
|
||||
numFillerTokensToInsert = 0;
|
||||
currentToken = null;
|
||||
hasCurrentToken = false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,11 +16,13 @@ package org.apache.lucene.analysis.th;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
|
||||
|
@ -29,6 +31,11 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
|
|||
* @version 0.2
|
||||
*/
|
||||
public class ThaiAnalyzer extends Analyzer {
|
||||
|
||||
/**
 * Creates a ThaiAnalyzer and passes ThaiAnalyzer.class to
 * setOverridesTokenStreamMethod; reusableTokenStream() later consults the
 * overridesTokenStreamMethod flag (LUCENE-1678).
 */
public ThaiAnalyzer() {
  super();
  setOverridesTokenStreamMethod(ThaiAnalyzer.class);
}
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream ts = new StandardTokenizer(reader);
|
||||
ts = new StandardFilter(ts);
|
||||
|
@ -36,4 +43,32 @@ public class ThaiAnalyzer extends Analyzer {
|
|||
ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
return ts;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
if (overridesTokenStreamMethod) {
|
||||
// LUCENE-1678: force fallback to tokenStream() if we
|
||||
// have been subclassed and that subclass overrides
|
||||
// tokenStream but not reusableTokenStream
|
||||
return tokenStream(fieldName, reader);
|
||||
}
|
||||
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new ThaiWordFilter(streams.result);
|
||||
streams.result = new StopFilter(streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
streams.result.reset(); // reset the ThaiWordFilter's state
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -93,4 +93,9 @@ public class ThaiWordFilter extends TokenFilter {
|
|||
public final Token next() throws java.io.IOException {
|
||||
return super.next();
|
||||
}
|
||||
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
thaiState = null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -57,6 +57,15 @@ public class TestArabicAnalyzer extends TestCase {
|
|||
assertAnalyzesTo(a, "ما ملكت أيمانكم", new String[] { "ملكت", "ايمانكم"});
|
||||
assertAnalyzesTo(a, "الذين ملكت أيمانكم", new String[] { "ملكت", "ايمانكم" }); // stopwords
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple tests to show things are getting reset correctly, etc.
|
||||
*/
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
ArabicAnalyzer a = new ArabicAnalyzer();
|
||||
assertAnalyzesToReuse(a, "كبير", new String[] { "كبير" });
|
||||
assertAnalyzesToReuse(a, "كبيرة", new String[] { "كبير" }); // feminine marker
|
||||
}
|
||||
|
||||
/**
|
||||
* Non-arabic text gets treated in a similar way as SimpleAnalyzer.
|
||||
|
@ -80,5 +89,18 @@ public class TestArabicAnalyzer extends TestCase {
|
|||
assertFalse(ts.incrementToken());
|
||||
ts.close();
|
||||
}
|
||||
|
||||
private void assertAnalyzesToReuse(Analyzer a, String input, String[] output)
|
||||
throws Exception {
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute termAtt = (TermAttribute) ts
|
||||
.getAttribute(TermAttribute.class);
|
||||
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(output[i], termAtt.term());
|
||||
}
|
||||
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -117,6 +117,14 @@ public class TestBrazilianStemmer extends TestCase {
|
|||
check("quinzena", "quinzen");
|
||||
check("quiosque", "quiosqu");
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new BrazilianAnalyzer();
|
||||
checkReuse(a, "boa", "boa");
|
||||
checkReuse(a, "boainain", "boainain");
|
||||
checkReuse(a, "boas", "boas");
|
||||
checkReuse(a, "bôas", "boas"); // removes diacritic: different from snowball portugese
|
||||
}
|
||||
|
||||
|
||||
private void check(final String input, final String expected) throws IOException {
|
||||
|
@ -128,5 +136,13 @@ public class TestBrazilianStemmer extends TestCase {
|
|||
assertFalse(stream.incrementToken());
|
||||
stream.close();
|
||||
}
|
||||
|
||||
private void checkReuse(Analyzer analyzer, final String input, final String expected) throws IOException {
|
||||
TokenStream stream = analyzer.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute text = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals(expected, text.term());
|
||||
assertFalse(stream.incrementToken());
|
||||
}
|
||||
|
||||
}
|
|
@ -22,6 +22,8 @@ import java.io.StringReader;
|
|||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
@ -60,6 +62,21 @@ public class TestCJKTokenizer extends TestCase{
|
|||
assertFalse(tokenizer.incrementToken());
|
||||
}
|
||||
|
||||
public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(str));
|
||||
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
|
||||
TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
|
||||
for (int i = 0; i < out_tokens.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(termAtt.term(), out_tokens[i].termText);
|
||||
assertEquals(offsetAtt.startOffset(), out_tokens[i].start);
|
||||
assertEquals(offsetAtt.endOffset(), out_tokens[i].end);
|
||||
assertEquals(typeAtt.type(), out_tokens[i].type);
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
|
||||
public void testJa1() throws IOException {
|
||||
String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";
|
||||
|
||||
|
@ -151,4 +168,38 @@ public class TestCJKTokenizer extends TestCase{
|
|||
};
|
||||
checkCJKToken(str, out_tokens);
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer analyzer = new CJKAnalyzer();
|
||||
String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
|
||||
|
||||
TestToken[] out_tokens = {
|
||||
newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||
newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
|
||||
};
|
||||
checkCJKTokenReusable(analyzer, str, out_tokens);
|
||||
|
||||
str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
|
||||
TestToken[] out_tokens2 = {
|
||||
newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||
newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
|
||||
newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
|
||||
newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
|
||||
};
|
||||
checkCJKTokenReusable(analyzer, str, out_tokens2);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,7 +22,10 @@ import java.io.StringReader;
|
|||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
|
||||
public class TestChineseTokenizer extends TestCase
|
||||
|
@ -42,4 +45,32 @@ public class TestChineseTokenizer extends TestCase
|
|||
correctEndOffset++;
|
||||
}
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception
|
||||
{
|
||||
Analyzer a = new ChineseAnalyzer();
|
||||
assertAnalyzesToReuse(a, "中华人民共和国",
|
||||
new String[] { "中", "华", "人", "民", "共", "和", "国" },
|
||||
new int[] { 0, 1, 2, 3, 4, 5, 6 },
|
||||
new int[] { 1, 2, 3, 4, 5, 6, 7 });
|
||||
assertAnalyzesToReuse(a, "北京市",
|
||||
new String[] { "北", "京", "市" },
|
||||
new int[] { 0, 1, 2 },
|
||||
new int[] { 1, 2, 3 });
|
||||
}
|
||||
|
||||
private void assertAnalyzesToReuse(Analyzer a, String input, String[] output,
|
||||
int startOffsets[], int endOffsets[])
|
||||
throws Exception {
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute termAtt = (TermAttribute) ts
|
||||
.getAttribute(TermAttribute.class);
|
||||
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(output[i], termAtt.term());
|
||||
}
|
||||
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,6 +34,7 @@ import java.util.zip.ZipInputStream;
|
|||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
|
@ -151,6 +152,38 @@ public class TestCompoundWordTokenFilter extends TestCase {
|
|||
14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
|
||||
0, 0 });
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
|
||||
"Aufgabe", "Überwachung" };
|
||||
|
||||
Reader reader = getHyphenationReader("de_DR.xml");
|
||||
if (reader == null) {
|
||||
// we gracefully die if we have no reader
|
||||
return;
|
||||
}
|
||||
|
||||
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
|
||||
.getHyphenationTree(reader);
|
||||
|
||||
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader(
|
||||
"Rindfleischüberwachungsgesetz"));
|
||||
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
|
||||
wsTokenizer, hyphenator, dict,
|
||||
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
|
||||
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
|
||||
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
|
||||
|
||||
TermAttribute termAtt = (TermAttribute) tf.getAttribute(TermAttribute.class);
|
||||
assertTrue(tf.incrementToken());
|
||||
assertEquals("Rindfleischüberwachungsgesetz", termAtt.term());
|
||||
assertTrue(tf.incrementToken());
|
||||
assertEquals("Rind", termAtt.term());
|
||||
wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
|
||||
tf.reset();
|
||||
assertTrue(tf.incrementToken());
|
||||
assertEquals("Rindfleischüberwachungsgesetz", termAtt.term());
|
||||
}
|
||||
|
||||
private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset,
|
||||
int[] endOffset, int[] posIncr) throws Exception {
|
||||
|
|
|
@ -36,6 +36,12 @@ public class TestCzechAnalyzer extends TestCase {
|
|||
public void testStopWord() throws Exception {
|
||||
assertAnalyzesTo(new CzechAnalyzer(), "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer analyzer = new CzechAnalyzer();
|
||||
assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
|
||||
assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" });
|
||||
}
|
||||
|
||||
private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
|
||||
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||
|
@ -47,4 +53,14 @@ public class TestCzechAnalyzer extends TestCase {
|
|||
assertFalse(ts.incrementToken());
|
||||
ts.close();
|
||||
}
|
||||
|
||||
private void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws Exception {
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||
for (int i=0; i<output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(text.term(), output[i]);
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,10 +22,14 @@ import java.io.File;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
|
@ -64,6 +68,26 @@ public class TestGermanStemFilter extends TestCase {
|
|||
fail();
|
||||
}
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new GermanAnalyzer();
|
||||
checkReuse(a, "Tisch", "tisch");
|
||||
checkReuse(a, "Tische", "tisch");
|
||||
checkReuse(a, "Tischen", "tisch");
|
||||
}
|
||||
|
||||
/**
|
||||
* subclass that acts just like whitespace analyzer for testing
|
||||
*/
|
||||
private class GermanSubclassAnalyzer extends GermanAnalyzer {
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public void testLUCENE1678BWComp() throws Exception {
|
||||
checkReuse(new GermanSubclassAnalyzer(), "Tischen", "Tischen");
|
||||
}
|
||||
|
||||
private void check(final String input, final String expected) throws IOException {
|
||||
StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
|
||||
|
@ -73,5 +97,12 @@ public class TestGermanStemFilter extends TestCase {
|
|||
assertEquals(expected, termAtt.term());
|
||||
filter.close();
|
||||
}
|
||||
|
||||
|
||||
private void checkReuse(Analyzer a, String input, String expected) throws IOException {
|
||||
TokenStream stream = a.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute text = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals(expected, text.term());
|
||||
assertFalse(stream.incrementToken());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -49,6 +49,16 @@ public class GreekAnalyzerTest extends TestCase {
|
|||
assertFalse(ts.incrementToken());
|
||||
ts.close();
|
||||
}
|
||||
|
||||
private void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws Exception {
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||
for (int i=0; i<output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(termAtt.term(), output[i]);
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the analysis of various greek strings.
|
||||
|
@ -70,5 +80,20 @@ public class GreekAnalyzerTest extends TestCase {
|
|||
assertAnalyzesTo(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
|
||||
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
|
||||
}
|
||||
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new GreekAnalyzer();
|
||||
// Verify the correct analysis of capitals and small accented letters
|
||||
assertAnalyzesToReuse(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
|
||||
new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
|
||||
"\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03b7\u03c3", "\u03b3\u03bb\u03c9\u03c3\u03c3\u03b1\u03c3" });
|
||||
// Verify the correct analysis of small letters with diaeresis and the elimination
|
||||
// of punctuation marks
|
||||
assertAnalyzesToReuse(a, "\u03a0\u03c1\u03bf\u03ca\u03cc\u03bd\u03c4\u03b1 (\u03ba\u03b1\u03b9) [\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03ad\u03c2] - \u0391\u039d\u0391\u0393\u039a\u0395\u03a3",
|
||||
new String[] { "\u03c0\u03c1\u03bf\u03b9\u03bf\u03bd\u03c4\u03b1", "\u03c0\u03bf\u03bb\u03bb\u03b1\u03c0\u03bb\u03b5\u03c3", "\u03b1\u03bd\u03b1\u03b3\u03ba\u03b5\u03c3" });
|
||||
// Verify the correct analysis of capital accented letters and capitalletters with diaeresis,
|
||||
// as well as the elimination of stop words
|
||||
assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
|
||||
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -84,6 +84,19 @@ public class TestFrenchAnalyzer extends TestCase {
|
|||
assertFalse(ts.incrementToken());
|
||||
ts.close();
|
||||
}
|
||||
|
||||
public void assertAnalyzesToReuse(Analyzer a, String input, String[] output)
|
||||
throws Exception {
|
||||
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
|
||||
|
||||
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(termAtt.term(), output[i]);
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
|
||||
public void testAnalyzer() throws Exception {
|
||||
FrenchAnalyzer fa = new FrenchAnalyzer();
|
||||
|
@ -186,5 +199,26 @@ public class TestFrenchAnalyzer extends TestCase {
|
|||
new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
|
||||
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
FrenchAnalyzer fa = new FrenchAnalyzer();
|
||||
// stopwords
|
||||
assertAnalyzesToReuse(
|
||||
fa,
|
||||
"le la chien les aux chat du des à cheval",
|
||||
new String[] { "chien", "chat", "cheval" });
|
||||
|
||||
// some nouns and adjectives
|
||||
assertAnalyzesToReuse(
|
||||
fa,
|
||||
"lances chismes habitable chiste éléments captifs",
|
||||
new String[] {
|
||||
"lanc",
|
||||
"chism",
|
||||
"habit",
|
||||
"chist",
|
||||
"élément",
|
||||
"captif" });
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -120,4 +120,18 @@ public class EdgeNGramTokenFilterTest extends TestCase {
|
|||
assertEquals("(fgh,0,3)", termAtt.toString());
|
||||
assertFalse(tokenizer.incrementToken());
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
|
||||
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
|
||||
TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("(a,0,1)", termAtt.toString());
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("(ab,0,2)", termAtt.toString());
|
||||
tokenizer.reset(new StringReader("abcde"));
|
||||
filter.reset();
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("(a,0,1)", termAtt.toString());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -108,4 +108,16 @@ public class EdgeNGramTokenizerTest extends TestCase {
|
|||
assertEquals("(cde,2,5)", termAtt.toString());
|
||||
assertFalse(tokenizer.incrementToken());
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
|
||||
TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("(a,0,1)", termAtt.toString());
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("(ab,0,2)", termAtt.toString());
|
||||
tokenizer.reset(new StringReader("abcde"));
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("(a,0,1)", termAtt.toString());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -108,4 +108,18 @@ public class NGramTokenFilterTest extends TestCase {
|
|||
|
||||
checkStream(filter, exp);
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
|
||||
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 3);
|
||||
TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class);
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("(a,0,1)", termAtt.toString());
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("(b,1,2)", termAtt.toString());
|
||||
tokenizer.reset(new StringReader("abcde"));
|
||||
filter.reset();
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("(a,0,1)", termAtt.toString());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -99,4 +99,16 @@ public class NGramTokenizerTest extends TestCase {
|
|||
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
|
||||
assertFalse(tokenizer.incrementToken());
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
|
||||
TermAttribute termAtt = (TermAttribute) tokenizer.getAttribute(TermAttribute.class);
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("(a,0,1)", termAtt.toString());
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("(b,1,2)", termAtt.toString());
|
||||
tokenizer.reset(new StringReader("abcde"));
|
||||
assertTrue(tokenizer.incrementToken());
|
||||
assertEquals("(a,0,1)", termAtt.toString());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,12 +18,14 @@ package org.apache.lucene.analysis.nl;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
|
@ -116,6 +118,31 @@ public class TestDutchStemmer extends TestCase {
|
|||
check("ophoping", "ophop");
|
||||
check("ophouden", "ophoud");
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new DutchAnalyzer();
|
||||
checkReuse(a, "lichaamsziek", "lichaamsziek");
|
||||
checkReuse(a, "lichamelijk", "licham");
|
||||
checkReuse(a, "lichamelijke", "licham");
|
||||
checkReuse(a, "lichamelijkheden", "licham");
|
||||
}
|
||||
|
||||
/**
|
||||
* subclass that acts just like whitespace analyzer for testing
|
||||
*/
|
||||
private class DutchSubclassAnalyzer extends DutchAnalyzer {
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public void testLUCENE1678BWComp() throws Exception {
|
||||
Analyzer a = new DutchSubclassAnalyzer();
|
||||
checkReuse(a, "lichaamsziek", "lichaamsziek");
|
||||
checkReuse(a, "lichamelijk", "lichamelijk");
|
||||
checkReuse(a, "lichamelijke", "lichamelijke");
|
||||
checkReuse(a, "lichamelijkheden", "lichamelijkheden");
|
||||
}
|
||||
|
||||
|
||||
private void check(final String input, final String expected) throws IOException {
|
||||
|
@ -127,5 +154,16 @@ public class TestDutchStemmer extends TestCase {
|
|||
assertFalse(stream.incrementToken());
|
||||
stream.close();
|
||||
}
|
||||
|
||||
private void checkReuse(Analyzer a, final String input, final String expected)
|
||||
throws IOException {
|
||||
TokenStream stream = a
|
||||
.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute text = (TermAttribute) stream
|
||||
.getAttribute(TermAttribute.class);
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals(expected, text.term());
|
||||
assertFalse(stream.incrementToken());
|
||||
}
|
||||
|
||||
}
|
|
@ -18,7 +18,9 @@ package org.apache.lucene.analysis.query;
|
|||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
@ -32,6 +34,7 @@ import org.apache.lucene.search.Query;
|
|||
import org.apache.lucene.store.RAMDirectory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
public class QueryAutoStopWordAnalyzerTest extends TestCase {
|
||||
String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"};
|
||||
|
@ -139,6 +142,24 @@ public class QueryAutoStopWordAnalyzerTest extends TestCase {
|
|||
assertTrue("Filter should not prevent stopwords in one field being used in another ", h.length() > 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* subclass that acts just like whitespace analyzer for testing
|
||||
*/
|
||||
private class QueryAutoStopWordSubclassAnalyzer extends QueryAutoStopWordAnalyzer {
|
||||
public QueryAutoStopWordSubclassAnalyzer() {
|
||||
super(new WhitespaceAnalyzer());
|
||||
}
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public void testLUCENE1678BWComp() throws Exception {
|
||||
QueryAutoStopWordAnalyzer a = new QueryAutoStopWordSubclassAnalyzer();
|
||||
a.addStopWords(reader, "repetitiveField", 10);
|
||||
Hits h = search(a, "repetitiveField:boring");
|
||||
assertFalse(h.length() == 0);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ import java.io.StringReader;
|
|||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
|
@ -187,5 +188,22 @@ public class TestRussianAnalyzer extends TestCase
|
|||
fail("unexpected IOException");
|
||||
}
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new RussianAnalyzer();
|
||||
assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
|
||||
new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
|
||||
assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
|
||||
new String[] { "знан", "хран", "тайн" });
|
||||
}
|
||||
|
||||
private void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws Exception {
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||
for (int i=0; i<output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(termAtt.term(), output[i]);
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,12 +17,16 @@ package org.apache.lucene.analysis.shingle;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.LetterTokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -200,4 +204,90 @@ public class ShingleAnalyzerWrapperTest extends TestCase {
|
|||
int[] ranks = new int[] { 1, 2, 0 };
|
||||
compareRanks(hits, ranks);
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
|
||||
assertAnalyzesToReuse(a, "please divide into shingles",
|
||||
new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
|
||||
new int[] { 0, 0, 7, 7, 14, 14, 19 },
|
||||
new int[] { 6, 13, 13, 18, 18, 27, 27 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
assertAnalyzesToReuse(a, "divide me up again",
|
||||
new String[] { "divide", "divide me", "me", "me up", "up", "up again", "again" },
|
||||
new int[] { 0, 0, 7, 7, 10, 10, 13 },
|
||||
new int[] { 6, 9, 9, 12, 12, 18, 18 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
}
|
||||
|
||||
/**
|
||||
* subclass that acts just like whitespace analyzer for testing
|
||||
*/
|
||||
private class ShingleWrapperSubclassAnalyzer extends ShingleAnalyzerWrapper {
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
};
|
||||
|
||||
public void testLUCENE1678BWComp() throws Exception {
|
||||
Analyzer a = new ShingleWrapperSubclassAnalyzer();
|
||||
assertAnalyzesToReuse(a, "this is a test",
|
||||
new String[] { "this", "is", "a", "test" },
|
||||
new int[] { 0, 5, 8, 10 },
|
||||
new int[] { 4, 7, 9, 14 },
|
||||
new int[] { 1, 1, 1, 1 });
|
||||
}
|
||||
|
||||
/*
|
||||
* analyzer that does not support reuse
|
||||
* it is LetterTokenizer on odd invocations, WhitespaceTokenizer on even.
|
||||
*/
|
||||
private class NonreusableAnalyzer extends Analyzer {
|
||||
int invocationCount = 0;
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
if (++invocationCount % 2 == 0)
|
||||
return new WhitespaceTokenizer(reader);
|
||||
else
|
||||
return new LetterTokenizer(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public void testWrappedAnalyzerDoesNotReuse() throws Exception {
|
||||
Analyzer a = new ShingleAnalyzerWrapper(new NonreusableAnalyzer());
|
||||
assertAnalyzesToReuse(a, "please divide into shingles.",
|
||||
new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
|
||||
new int[] { 0, 0, 7, 7, 14, 14, 19 },
|
||||
new int[] { 6, 13, 13, 18, 18, 27, 27 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
assertAnalyzesToReuse(a, "please divide into shingles.",
|
||||
new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles.", "shingles." },
|
||||
new int[] { 0, 0, 7, 7, 14, 14, 19 },
|
||||
new int[] { 6, 13, 13, 18, 18, 28, 28 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
assertAnalyzesToReuse(a, "please divide into shingles.",
|
||||
new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
|
||||
new int[] { 0, 0, 7, 7, 14, 14, 19 },
|
||||
new int[] { 6, 13, 13, 18, 18, 27, 27 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
}
|
||||
|
||||
private void assertAnalyzesToReuse(Analyzer a, String input, String[] output,
|
||||
int[] startOffsets, int[] endOffsets, int[] posIncr) throws Exception {
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute termAtt = (TermAttribute) ts
|
||||
.getAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) ts
|
||||
.getAttribute(OffsetAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
|
||||
.getAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(output[i], termAtt.term());
|
||||
assertEquals(startOffsets[i], offsetAtt.startOffset());
|
||||
assertEquals(endOffsets[i], offsetAtt.endOffset());
|
||||
assertEquals(posIncr[i], posIncAtt.getPositionIncrement());
|
||||
}
|
||||
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,10 +18,13 @@ package org.apache.lucene.analysis.shingle;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
@ -182,6 +185,20 @@ public class ShingleFilterTest extends TestCase {
|
|||
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
|
||||
TokenStream filter = new ShingleFilter(wsTokenizer, 2);
|
||||
TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("(please,0,6)", termAtt.toString());
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("(please divide,0,13,type=shingle,posIncr=0)", termAtt.toString());
|
||||
wsTokenizer.reset(new StringReader("please divide this sentence"));
|
||||
filter.reset();
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("(please,0,6)", termAtt.toString());
|
||||
}
|
||||
|
||||
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
|
||||
int[] positionIncrements, String[] types)
|
||||
throws IOException {
|
||||
|
|
|
@ -17,12 +17,14 @@ package org.apache.lucene.analysis.th;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
@ -91,6 +93,23 @@ public class TestThaiAnalyzer extends TestCase {
|
|||
ts.close();
|
||||
}
|
||||
|
||||
public void assertAnalyzesToReuse(Analyzer a, String input, String[] output)
|
||||
throws Exception {
|
||||
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute termAtt = (TermAttribute) ts
|
||||
.getAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) ts
|
||||
.getAttribute(OffsetAttribute.class);
|
||||
TypeAttribute typeAtt = (TypeAttribute) ts
|
||||
.getAttribute(TypeAttribute.class);
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(termAtt.term(), output[i]);
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
|
||||
public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
|
||||
assertAnalyzesTo(a, input, output, null, null, null);
|
||||
}
|
||||
|
@ -124,4 +143,33 @@ public class TestThaiAnalyzer extends TestCase {
|
|||
"ประโยคว่า The quick brown fox jumped over the lazy dogs",
|
||||
new String[] { "ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over", "lazy", "dogs" });
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
ThaiAnalyzer analyzer = new ThaiAnalyzer();
|
||||
assertAnalyzesToReuse(analyzer, "", new String[] {});
|
||||
|
||||
assertAnalyzesToReuse(
|
||||
analyzer,
|
||||
"การที่ได้ต้องแสดงว่างานดี",
|
||||
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
|
||||
|
||||
assertAnalyzesToReuse(
|
||||
analyzer,
|
||||
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
|
||||
new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
|
||||
}
|
||||
|
||||
/**
|
||||
* subclass that acts just like whitespace analyzer for testing
|
||||
*/
|
||||
private class ThaiSubclassAnalyzer extends ThaiAnalyzer {
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public void testLUCENE1678BWComp() throws Exception {
|
||||
ThaiSubclassAnalyzer a = new ThaiSubclassAnalyzer();
|
||||
assertAnalyzesToReuse(a, "การที่ได้ต้องแสดงว่างานดี", new String[] { "การที่ได้ต้องแสดงว่างานดี" });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -151,6 +151,7 @@ public class SmartChineseAnalyzer extends Analyzer {
|
|||
}
|
||||
} else {
|
||||
streams.tokenStream.reset(reader);
|
||||
streams.filteredTokenStream.reset(); // reset WordTokenFilter's state
|
||||
}
|
||||
|
||||
return streams.filteredTokenStream;
|
||||
|
|
|
@ -102,4 +102,13 @@ public final class SentenceTokenizer extends Tokenizer {
|
|||
}
|
||||
}
|
||||
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
tokenStart = tokenEnd = 0;
|
||||
}
|
||||
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
reset();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -81,4 +81,9 @@ public final class WordTokenFilter extends TokenFilter {
|
|||
typeAtt.setType("word");
|
||||
return true;
|
||||
}
|
||||
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
tokenIter = null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -108,6 +108,33 @@ public class TestSmartChineseAnalyzer extends TestCase {
|
|||
new int[] { 1, 3, 4, 6, 7, 9 });
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new SmartChineseAnalyzer();
|
||||
assertAnalyzesToReuse(a, "我购买 Tests 了道具和服装",
|
||||
new String[] { "我", "购买", "test", "了", "道具", "和", "服装"},
|
||||
new int[] { 0, 1, 4, 10, 11, 13, 14 },
|
||||
new int[] { 1, 3, 9, 11, 13, 14, 16 });
|
||||
assertAnalyzesToReuse(a, "我购买了道具和服装。",
|
||||
new String[] { "我", "购买", "了", "道具", "和", "服装" },
|
||||
new int[] { 0, 1, 3, 4, 6, 7 },
|
||||
new int[] { 1, 3, 4, 6, 7, 9 });
|
||||
}
|
||||
|
||||
public void assertAnalyzesToReuse(Analyzer a, String input, String[] output,
|
||||
int startOffsets[], int endOffsets[]) throws Exception {
|
||||
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(termAtt.term(), output[i]);
|
||||
assertEquals(offsetAtt.startOffset(), startOffsets[i]);
|
||||
assertEquals(offsetAtt.endOffset(), endOffsets[i]);
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
|
||||
public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[])
|
||||
throws Exception {
|
||||
|
||||
|
|
|
@ -141,5 +141,12 @@ public class SynonymTokenFilter extends TokenFilter {
|
|||
arr[i + r] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
stack = null;
|
||||
index = 0;
|
||||
current = null;
|
||||
todo = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,160 @@
|
|||
package org.apache.lucene.index.memory;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TestSynonymTokenFilter extends TestCase {
|
||||
File dataDir = new File(System.getProperty("dataDir", "./bin"));
|
||||
File testFile = new File(dataDir, "org/apache/lucene/index/memory/testSynonyms.txt");
|
||||
|
||||
public void testSynonyms() throws Exception {
|
||||
SynonymMap map = new SynonymMap(new FileInputStream(testFile));
|
||||
/* all expansions */
|
||||
Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, Integer.MAX_VALUE);
|
||||
assertAnalyzesTo(analyzer, "Lost in the woods",
|
||||
new String[] { "lost", "in", "the", "woods", "forest", "wood" },
|
||||
new int[] { 0, 5, 8, 12, 12, 12 },
|
||||
new int[] { 4, 7, 11, 17, 17, 17 },
|
||||
new int[] { 1, 1, 1, 1, 0, 0 });
|
||||
}
|
||||
|
||||
public void testSynonymsLimitedAmount() throws Exception {
|
||||
SynonymMap map = new SynonymMap(new FileInputStream(testFile));
|
||||
/* limit to one synonym expansion */
|
||||
Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, 1);
|
||||
assertAnalyzesTo(analyzer, "Lost in the woods",
|
||||
/* wood comes before forest due to
|
||||
* the input file, not lexicographic order
|
||||
*/
|
||||
new String[] { "lost", "in", "the", "woods", "wood" },
|
||||
new int[] { 0, 5, 8, 12, 12 },
|
||||
new int[] { 4, 7, 11, 17, 17 },
|
||||
new int[] { 1, 1, 1, 1, 0 });
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
SynonymMap map = new SynonymMap(new FileInputStream(testFile));
|
||||
/* limit to one synonym expansion */
|
||||
Analyzer analyzer = new SynonymWhitespaceAnalyzer(map, 1);
|
||||
assertAnalyzesToReuse(analyzer, "Lost in the woods",
|
||||
new String[] { "lost", "in", "the", "woods", "wood" },
|
||||
new int[] { 0, 5, 8, 12, 12 },
|
||||
new int[] { 4, 7, 11, 17, 17 },
|
||||
new int[] { 1, 1, 1, 1, 0 });
|
||||
assertAnalyzesToReuse(analyzer, "My wolfish dog went to the forest",
|
||||
new String[] { "my", "wolfish", "ravenous", "dog", "went", "to",
|
||||
"the", "forest", "woods" },
|
||||
new int[] { 0, 3, 3, 11, 15, 20, 23, 27, 27 },
|
||||
new int[] { 2, 10, 10, 14, 19, 22, 26, 33, 33 },
|
||||
new int[] { 1, 1, 0, 1, 1, 1, 1, 1, 0 });
|
||||
}
|
||||
|
||||
private class SynonymWhitespaceAnalyzer extends Analyzer {
|
||||
private SynonymMap synonyms;
|
||||
private int maxSynonyms;
|
||||
|
||||
public SynonymWhitespaceAnalyzer(SynonymMap synonyms, int maxSynonyms) {
|
||||
this.synonyms = synonyms;
|
||||
this.maxSynonyms = maxSynonyms;
|
||||
}
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream ts = new WhitespaceTokenizer(reader);
|
||||
ts = new LowerCaseFilter(ts);
|
||||
ts = new SynonymTokenFilter(ts, synonyms, maxSynonyms);
|
||||
return ts;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new WhitespaceTokenizer(reader);
|
||||
streams.result = new LowerCaseFilter(streams.source);
|
||||
streams.result = new SynonymTokenFilter(streams.result, synonyms, maxSynonyms);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
streams.result.reset(); // reset the SynonymTokenFilter
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
||||
public void assertAnalyzesTo(Analyzer a, String input, String[] output,
|
||||
int startOffsets[], int endOffsets[], int posIncs[]) throws Exception {
|
||||
|
||||
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||
TermAttribute termAtt = (TermAttribute) ts
|
||||
.getAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) ts
|
||||
.getAttribute(OffsetAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
|
||||
.getAttribute(PositionIncrementAttribute.class);
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(termAtt.term(), output[i]);
|
||||
assertEquals(offsetAtt.startOffset(), startOffsets[i]);
|
||||
assertEquals(offsetAtt.endOffset(), endOffsets[i]);
|
||||
assertEquals(posIncAtt.getPositionIncrement(), posIncs[i]);
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.close();
|
||||
}
|
||||
|
||||
public void assertAnalyzesToReuse(Analyzer a, String input, String[] output,
|
||||
int startOffsets[], int endOffsets[], int posIncs[]) throws Exception {
|
||||
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute termAtt = (TermAttribute) ts
|
||||
.getAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) ts
|
||||
.getAttribute(OffsetAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
|
||||
.getAttribute(PositionIncrementAttribute.class);
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(termAtt.term(), output[i]);
|
||||
assertEquals(offsetAtt.startOffset(), startOffsets[i]);
|
||||
assertEquals(offsetAtt.endOffset(), endOffsets[i]);
|
||||
assertEquals(posIncAtt.getPositionIncrement(), posIncs[i]);
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
s(100000001,1,'woods',n,1,0).
|
||||
s(100000001,2,'wood',n,1,0).
|
||||
s(100000001,3,'forest',n,1,0).
|
||||
s(100000002,1,'wolfish',n,1,0).
|
||||
s(100000002,2,'ravenous',n,1,0).
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.snowball;
|
|||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.standard.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -37,6 +38,7 @@ public class SnowballAnalyzer extends Analyzer {
|
|||
/** Builds the named analyzer with no stop words. */
|
||||
public SnowballAnalyzer(String name) {
|
||||
this.name = name;
|
||||
setOverridesTokenStreamMethod(SnowballAnalyzer.class);
|
||||
}
|
||||
|
||||
/** Builds the named analyzer with the given stop words. */
|
||||
|
@ -46,7 +48,8 @@ public class SnowballAnalyzer extends Analyzer {
|
|||
}
|
||||
|
||||
/** Constructs a {@link StandardTokenizer} filtered by a {@link
|
||||
StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
|
||||
StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter},
|
||||
and a {@link SnowballFilter} */
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer(reader);
|
||||
result = new StandardFilter(result);
|
||||
|
@ -56,4 +59,37 @@ public class SnowballAnalyzer extends Analyzer {
|
|||
result = new SnowballFilter(result, name);
|
||||
return result;
|
||||
}
|
||||
|
||||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
|
||||
/** Returns a (possibly reused) {@link StandardTokenizer} filtered by a
|
||||
* {@link StandardFilter}, a {@link LowerCaseFilter},
|
||||
* a {@link StopFilter}, and a {@link SnowballFilter} */
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
if (overridesTokenStreamMethod) {
|
||||
// LUCENE-1678: force fallback to tokenStream() if we
|
||||
// have been subclassed and that subclass overrides
|
||||
// tokenStream but not reusableTokenStream
|
||||
return tokenStream(fieldName, reader);
|
||||
}
|
||||
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
streams.source = new StandardTokenizer(reader);
|
||||
streams.result = new StandardFilter(streams.source);
|
||||
streams.result = new LowerCaseFilter(streams.result);
|
||||
if (stopSet != null)
|
||||
streams.result = new StopFilter(streams.result, stopSet);
|
||||
streams.result = new SnowballFilter(streams.result, name);
|
||||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,11 +17,13 @@ package org.apache.lucene.analysis.snowball;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
|
@ -45,6 +47,18 @@ public class TestSnowball extends TestCase {
|
|||
assertFalse(ts.incrementToken());
|
||||
ts.close();
|
||||
}
|
||||
|
||||
public void assertAnalyzesToReuse(Analyzer a,
|
||||
String input,
|
||||
String[] output) throws Exception {
|
||||
TokenStream ts = a.reusableTokenStream("dummy", new StringReader(input));
|
||||
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(output[i], termAtt.term());
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
|
||||
public void testEnglish() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer("English");
|
||||
|
@ -52,7 +66,33 @@ public class TestSnowball extends TestCase {
|
|||
new String[]{"he", "abhor", "accent"});
|
||||
}
|
||||
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
Analyzer a = new SnowballAnalyzer("English");
|
||||
assertAnalyzesToReuse(a, "he abhorred accents",
|
||||
new String[]{"he", "abhor", "accent"});
|
||||
assertAnalyzesToReuse(a, "she abhorred him",
|
||||
new String[]{"she", "abhor", "him"});
|
||||
}
|
||||
|
||||
/**
|
||||
* subclass that acts just like whitespace analyzer for testing
|
||||
*/
|
||||
private class SnowballSubclassAnalyzer extends SnowballAnalyzer {
|
||||
public SnowballSubclassAnalyzer(String name) {
|
||||
super(name);
|
||||
}
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public void testLUCENE1678BWComp() throws Exception {
|
||||
Analyzer a = new SnowballSubclassAnalyzer("English");
|
||||
assertAnalyzesToReuse(a, "he abhorred accents",
|
||||
new String[]{"he", "abhorred", "accents"});
|
||||
}
|
||||
|
||||
public void testFilterTokens() throws Exception {
|
||||
SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
|
||||
TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
|
||||
|
|
Loading…
Reference in New Issue