mirror of https://github.com/apache/lucene.git
LUCENE-2100: Marked all contrib Analyzer subclasses as final. Analyzers should only act as a composition of TokenStreams; users should compose their own analyzers instead of subclassing existing ones.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@888799 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
43c475d296
commit
6c0c318218
|
@ -2,6 +2,13 @@ Lucene contrib change Log
|
|||
|
||||
======================= Trunk (not yet released) =======================
|
||||
|
||||
Changes in backwards compatibility policy
|
||||
|
||||
* LUCENE-2100: All Analyzers in Lucene-contrib have been marked as final.
|
||||
Analyzers should only act as a composition of TokenStreams, users should
|
||||
compose their own analyzers instead of subclassing existing ones.
|
||||
(Simon Willnauer)
|
||||
|
||||
Changes in runtime behavior
|
||||
|
||||
* LUCENE-2117: SnowballAnalyzer uses TurkishLowerCaseFilter instead of
|
||||
|
|
|
@ -35,7 +35,7 @@ import java.util.Set;
|
|||
* filters with {@link StopFilter}
|
||||
*
|
||||
*/
|
||||
public class CJKAnalyzer extends Analyzer {
|
||||
public final class CJKAnalyzer extends Analyzer {
|
||||
//~ Static fields/initializers ---------------------------------------------
|
||||
|
||||
/**
|
||||
|
|
|
@ -29,7 +29,7 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
*
|
||||
*/
|
||||
|
||||
public class ChineseAnalyzer extends Analyzer {
|
||||
public final class ChineseAnalyzer extends Analyzer {
|
||||
|
||||
public ChineseAnalyzer() {
|
||||
}
|
||||
|
|
|
@ -51,7 +51,7 @@ import org.apache.lucene.util.Version;
|
|||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||
*/
|
||||
public class GermanAnalyzer extends Analyzer {
|
||||
public final class GermanAnalyzer extends Analyzer {
|
||||
|
||||
/**
|
||||
* List of typical german stopwords.
|
||||
|
@ -133,7 +133,6 @@ public class GermanAnalyzer extends Analyzer {
|
|||
public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
||||
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
|
||||
setOverridesTokenStreamMethod(GermanAnalyzer.class);
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
|
@ -221,13 +220,6 @@ public class GermanAnalyzer extends Analyzer {
|
|||
*/
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
if (overridesTokenStreamMethod) {
|
||||
// LUCENE-1678: force fallback to tokenStream() if we
|
||||
// have been subclassed and that subclass overrides
|
||||
// tokenStream but not reusableTokenStream
|
||||
return tokenStream(fieldName, reader);
|
||||
}
|
||||
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
|
|
|
@ -64,7 +64,7 @@ import org.apache.lucene.util.Version;
|
|||
* </pre>
|
||||
*
|
||||
*/
|
||||
public class PatternAnalyzer extends Analyzer {
|
||||
public final class PatternAnalyzer extends Analyzer {
|
||||
|
||||
/** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
|
||||
public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
|
||||
|
|
|
@ -52,7 +52,7 @@ import java.util.Map;
|
|||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||
*/
|
||||
public class DutchAnalyzer extends Analyzer {
|
||||
public final class DutchAnalyzer extends Analyzer {
|
||||
/**
|
||||
* List of typical Dutch stopwords.
|
||||
* @deprecated use {@link #getDefaultStopSet()} instead
|
||||
|
@ -119,7 +119,6 @@ public class DutchAnalyzer extends Analyzer {
|
|||
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
|
||||
excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
|
||||
this.matchVersion = matchVersion;
|
||||
setOverridesTokenStreamMethod(DutchAnalyzer.class);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -151,7 +150,6 @@ public class DutchAnalyzer extends Analyzer {
|
|||
*/
|
||||
public DutchAnalyzer(Version matchVersion, File stopwords) {
|
||||
// this is completely broken!
|
||||
setOverridesTokenStreamMethod(DutchAnalyzer.class);
|
||||
try {
|
||||
stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
|
||||
} catch (IOException e) {
|
||||
|
@ -243,13 +241,6 @@ public class DutchAnalyzer extends Analyzer {
|
|||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
if (overridesTokenStreamMethod) {
|
||||
// LUCENE-1678: force fallback to tokenStream() if we
|
||||
// have been subclassed and that subclass overrides
|
||||
// tokenStream but not reusableTokenStream
|
||||
return tokenStream(fieldName, reader);
|
||||
}
|
||||
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
|
|
|
@ -43,7 +43,7 @@ import java.util.*;
|
|||
* stop words found in an already existing index.
|
||||
* </p>
|
||||
*/
|
||||
public class QueryAutoStopWordAnalyzer extends Analyzer {
|
||||
public final class QueryAutoStopWordAnalyzer extends Analyzer {
|
||||
Analyzer delegate;
|
||||
HashMap<String,HashSet<String>> stopWordsPerField = new HashMap<String,HashSet<String>>();
|
||||
//The default maximum percentage (40%) of index documents which
|
||||
|
@ -58,7 +58,6 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
|
|||
*/
|
||||
public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer delegate) {
|
||||
this.delegate = delegate;
|
||||
setOverridesTokenStreamMethod(QueryAutoStopWordAnalyzer.class);
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
|
@ -198,13 +197,6 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
|
|||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
if (overridesTokenStreamMethod) {
|
||||
// LUCENE-1678: force fallback to tokenStream() if we
|
||||
// have been subclassed and that subclass overrides
|
||||
// tokenStream but not reusableTokenStream
|
||||
return tokenStream(fieldName, reader);
|
||||
}
|
||||
|
||||
/* map of SavedStreams for each field */
|
||||
Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
|
||||
if (streamMap == null) {
|
||||
|
|
|
@ -31,7 +31,7 @@ import org.apache.lucene.util.Version;
|
|||
* A shingle is another name for a token based n-gram.
|
||||
* </p>
|
||||
*/
|
||||
public class ShingleAnalyzerWrapper extends Analyzer {
|
||||
public final class ShingleAnalyzerWrapper extends Analyzer {
|
||||
|
||||
protected Analyzer defaultAnalyzer;
|
||||
protected int maxShingleSize = 2;
|
||||
|
@ -40,7 +40,6 @@ public class ShingleAnalyzerWrapper extends Analyzer {
|
|||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
|
||||
super();
|
||||
this.defaultAnalyzer = defaultAnalyzer;
|
||||
setOverridesTokenStreamMethod(ShingleAnalyzerWrapper.class);
|
||||
}
|
||||
|
||||
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
|
||||
|
@ -54,7 +53,6 @@ public class ShingleAnalyzerWrapper extends Analyzer {
|
|||
public ShingleAnalyzerWrapper(Version matchVersion) {
|
||||
super();
|
||||
this.defaultAnalyzer = new StandardAnalyzer(matchVersion);
|
||||
setOverridesTokenStreamMethod(ShingleAnalyzerWrapper.class);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -119,13 +117,6 @@ public class ShingleAnalyzerWrapper extends Analyzer {
|
|||
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
if (overridesTokenStreamMethod) {
|
||||
// LUCENE-1678: force fallback to tokenStream() if we
|
||||
// have been subclassed and that subclass overrides
|
||||
// tokenStream but not reusableTokenStream
|
||||
return tokenStream(fieldName, reader);
|
||||
}
|
||||
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
|
|
|
@ -35,11 +35,10 @@ import org.apache.lucene.util.Version;
|
|||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||
*/
|
||||
public class ThaiAnalyzer extends Analyzer {
|
||||
public final class ThaiAnalyzer extends Analyzer {
|
||||
private final Version matchVersion;
|
||||
|
||||
public ThaiAnalyzer(Version matchVersion) {
|
||||
setOverridesTokenStreamMethod(ThaiAnalyzer.class);
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
|
@ -59,13 +58,6 @@ public class ThaiAnalyzer extends Analyzer {
|
|||
|
||||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
if (overridesTokenStreamMethod) {
|
||||
// LUCENE-1678: force fallback to tokenStream() if we
|
||||
// have been subclassed and that subclass overrides
|
||||
// tokenStream but not reusableTokenStream
|
||||
return tokenStream(fieldName, reader);
|
||||
}
|
||||
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
|
|
|
@ -21,13 +21,9 @@ import java.io.BufferedReader;
|
|||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
@ -68,24 +64,6 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
|
|||
checkReuse(a, "Tischen", "tisch");
|
||||
}
|
||||
|
||||
/**
|
||||
* subclass that acts just like whitespace analyzer for testing
|
||||
*/
|
||||
private class GermanSubclassAnalyzer extends GermanAnalyzer {
|
||||
public GermanSubclassAnalyzer(Version matchVersion) {
|
||||
super(matchVersion);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public void testLUCENE1678BWComp() throws Exception {
|
||||
checkReuse(new GermanSubclassAnalyzer(Version.LUCENE_CURRENT), "Tischen", "Tischen");
|
||||
}
|
||||
|
||||
/*
|
||||
* Test that changes to the exclusion table are applied immediately
|
||||
* when using reusable token streams.
|
||||
|
|
|
@ -18,12 +18,9 @@ package org.apache.lucene.analysis.nl;
|
|||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
@ -127,27 +124,6 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
|
|||
checkOneTermReuse(a, "lichamelijkheden", "licham");
|
||||
}
|
||||
|
||||
/**
|
||||
* subclass that acts just like whitespace analyzer for testing
|
||||
*/
|
||||
private class DutchSubclassAnalyzer extends DutchAnalyzer {
|
||||
public DutchSubclassAnalyzer(Version matchVersion) {
|
||||
super(matchVersion);
|
||||
}
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public void testLUCENE1678BWComp() throws Exception {
|
||||
Analyzer a = new DutchSubclassAnalyzer(Version.LUCENE_CURRENT);
|
||||
checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
|
||||
checkOneTermReuse(a, "lichamelijk", "lichamelijk");
|
||||
checkOneTermReuse(a, "lichamelijke", "lichamelijke");
|
||||
checkOneTermReuse(a, "lichamelijkheden", "lichamelijkheden");
|
||||
}
|
||||
|
||||
/*
|
||||
* Test that changes to the exclusion table are applied immediately
|
||||
* when using reusable token streams.
|
||||
|
|
|
@ -148,27 +148,6 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
|
|||
|
||||
}
|
||||
|
||||
/**
|
||||
* subclass that acts just like whitespace analyzer for testing
|
||||
*/
|
||||
private class QueryAutoStopWordSubclassAnalyzer extends QueryAutoStopWordAnalyzer {
|
||||
public QueryAutoStopWordSubclassAnalyzer(Version matchVersion) {
|
||||
super(matchVersion, new WhitespaceAnalyzer());
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public void testLUCENE1678BWComp() throws Exception {
|
||||
QueryAutoStopWordAnalyzer a = new QueryAutoStopWordSubclassAnalyzer(Version.LUCENE_CURRENT);
|
||||
a.addStopWords(reader, "repetitiveField", 10);
|
||||
int numHits = search(a, "repetitiveField:boring");
|
||||
assertFalse(numHits == 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* analyzer that does not support reuse
|
||||
* it is LetterTokenizer on odd invocations, WhitespaceTokenizer on even.
|
||||
|
|
|
@ -213,28 +213,6 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
new int[] { 1, 0, 1, 0, 1, 0, 1 });
|
||||
}
|
||||
|
||||
/**
|
||||
* subclass that acts just like whitespace analyzer for testing
|
||||
*/
|
||||
private class ShingleWrapperSubclassAnalyzer extends ShingleAnalyzerWrapper {
|
||||
public ShingleWrapperSubclassAnalyzer() {
|
||||
super(org.apache.lucene.util.Version.LUCENE_CURRENT);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
};
|
||||
|
||||
public void testLUCENE1678BWComp() throws Exception {
|
||||
Analyzer a = new ShingleWrapperSubclassAnalyzer();
|
||||
assertAnalyzesToReuse(a, "this is a test",
|
||||
new String[] { "this", "is", "a", "test" },
|
||||
new int[] { 0, 5, 8, 10 },
|
||||
new int[] { 4, 7, 9, 14 });
|
||||
}
|
||||
|
||||
/*
|
||||
* analyzer that does not support reuse
|
||||
* it is LetterTokenizer on odd invocations, WhitespaceTokenizer on even.
|
||||
|
|
|
@ -17,12 +17,7 @@ package org.apache.lucene.analysis.th;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
@ -124,22 +119,4 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
|
|||
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
|
||||
new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
|
||||
}
|
||||
|
||||
/**
|
||||
* subclass that acts just like whitespace analyzer for testing
|
||||
*/
|
||||
private class ThaiSubclassAnalyzer extends ThaiAnalyzer {
|
||||
public ThaiSubclassAnalyzer(Version matchVersion) {
|
||||
super(matchVersion);
|
||||
}
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public void testLUCENE1678BWComp() throws Exception {
|
||||
ThaiSubclassAnalyzer a = new ThaiSubclassAnalyzer(Version.LUCENE_CURRENT);
|
||||
assertAnalyzesToReuse(a, "การที่ได้ต้องแสดงว่างานดี", new String[] { "การที่ได้ต้องแสดงว่างานดี" });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,7 +58,7 @@ import org.apache.lucene.util.Version;
|
|||
* supported anymore in such a case.</font>
|
||||
* </p>
|
||||
*/
|
||||
public class SmartChineseAnalyzer extends Analyzer {
|
||||
public final class SmartChineseAnalyzer extends Analyzer {
|
||||
|
||||
private final Set<?> stopWords;
|
||||
|
||||
|
|
|
@ -69,7 +69,7 @@ import java.io.IOException;
|
|||
* java.text.Collator over several languages.
|
||||
* </p>
|
||||
*/
|
||||
public class ICUCollationKeyAnalyzer extends Analyzer {
|
||||
public final class ICUCollationKeyAnalyzer extends Analyzer {
|
||||
private Collator collator;
|
||||
|
||||
public ICUCollationKeyAnalyzer(Collator collator) {
|
||||
|
|
|
@ -40,7 +40,7 @@ import java.util.Set;
|
|||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
public class SnowballAnalyzer extends Analyzer {
|
||||
public final class SnowballAnalyzer extends Analyzer {
|
||||
private String name;
|
||||
private Set<?> stopSet;
|
||||
private final Version matchVersion;
|
||||
|
@ -48,7 +48,6 @@ public class SnowballAnalyzer extends Analyzer {
|
|||
/** Builds the named analyzer with no stop words. */
|
||||
public SnowballAnalyzer(Version matchVersion, String name) {
|
||||
this.name = name;
|
||||
setOverridesTokenStreamMethod(SnowballAnalyzer.class);
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
|
@ -80,7 +79,7 @@ public class SnowballAnalyzer extends Analyzer {
|
|||
private class SavedStreams {
|
||||
Tokenizer source;
|
||||
TokenStream result;
|
||||
};
|
||||
}
|
||||
|
||||
/** Returns a (possibly reused) {@link StandardTokenizer} filtered by a
|
||||
* {@link StandardFilter}, a {@link LowerCaseFilter},
|
||||
|
@ -88,13 +87,6 @@ public class SnowballAnalyzer extends Analyzer {
|
|||
@Override
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader)
|
||||
throws IOException {
|
||||
if (overridesTokenStreamMethod) {
|
||||
// LUCENE-1678: force fallback to tokenStream() if we
|
||||
// have been subclassed and that subclass overrides
|
||||
// tokenStream but not reusableTokenStream
|
||||
return tokenStream(fieldName, reader);
|
||||
}
|
||||
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
if (streams == null) {
|
||||
streams = new SavedStreams();
|
||||
|
|
|
@ -17,11 +17,8 @@ package org.apache.lucene.analysis.snowball;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
|
@ -86,26 +83,6 @@ public class TestSnowball extends BaseTokenStreamTestCase {
|
|||
new String[]{"she", "abhor", "him"});
|
||||
}
|
||||
|
||||
/**
|
||||
* subclass that acts just like whitespace analyzer for testing
|
||||
*/
|
||||
private class SnowballSubclassAnalyzer extends SnowballAnalyzer {
|
||||
public SnowballSubclassAnalyzer(String name) {
|
||||
super(Version.LUCENE_CURRENT, name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public void testLUCENE1678BWComp() throws Exception {
|
||||
Analyzer a = new SnowballSubclassAnalyzer("English");
|
||||
assertAnalyzesToReuse(a, "he abhorred accents",
|
||||
new String[]{"he", "abhorred", "accents"});
|
||||
}
|
||||
|
||||
public void testFilterTokens() throws Exception {
|
||||
SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
|
||||
TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
|
||||
|
|
Loading…
Reference in New Issue