LUCENE-2100: Marked all contrib Analyzer subclasses as final. Analyzers should only act as a composition of TokenStreams; users should compose their own analyzers instead of subclassing existing ones.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@888799 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2009-12-09 13:32:32 +00:00
parent 43c475d296
commit 6c0c318218
18 changed files with 23 additions and 201 deletions

View File

@ -2,12 +2,19 @@ Lucene contrib change Log
======================= Trunk (not yet released) =======================
Changes in backwards compatibility policy
* LUCENE-2100: All Analyzers in Lucene-contrib have been marked as final.
Analyzers should only act as a composition of TokenStreams; users should
compose their own analyzers instead of subclassing existing ones.
(Simon Willnauer)
Changes in runtime behavior
* LUCENE-2117: SnowballAnalyzer uses TurkishLowerCaseFilter instead of
LowerCaseFilter to correctly handle the unique Turkish casing behavior if
used with Version > 3.0 and the TurkishStemmer.
(Robert Muir via Simon Willnauer)
* LUCENE-2117: SnowballAnalyzer uses TurkishLowerCaseFilter instead of
LowerCaseFilter to correctly handle the unique Turkish casing behavior if
used with Version > 3.0 and the TurkishStemmer.
(Robert Muir via Simon Willnauer)
Bug fixes

View File

@ -35,7 +35,7 @@ import java.util.Set;
* filters with {@link StopFilter}
*
*/
public class CJKAnalyzer extends Analyzer {
public final class CJKAnalyzer extends Analyzer {
//~ Static fields/initializers ---------------------------------------------
/**

View File

@ -29,7 +29,7 @@ import org.apache.lucene.analysis.Tokenizer;
*
*/
public class ChineseAnalyzer extends Analyzer {
public final class ChineseAnalyzer extends Analyzer {
public ChineseAnalyzer() {
}

View File

@ -51,7 +51,7 @@ import org.apache.lucene.util.Version;
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
public class GermanAnalyzer extends Analyzer {
public final class GermanAnalyzer extends Analyzer {
/**
* List of typical german stopwords.
@ -133,7 +133,6 @@ public class GermanAnalyzer extends Analyzer {
public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
setOverridesTokenStreamMethod(GermanAnalyzer.class);
this.matchVersion = matchVersion;
}
@ -221,13 +220,6 @@ public class GermanAnalyzer extends Analyzer {
*/
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
if (overridesTokenStreamMethod) {
// LUCENE-1678: force fallback to tokenStream() if we
// have been subclassed and that subclass overrides
// tokenStream but not reusableTokenStream
return tokenStream(fieldName, reader);
}
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();

View File

@ -64,7 +64,7 @@ import org.apache.lucene.util.Version;
* </pre>
*
*/
public class PatternAnalyzer extends Analyzer {
public final class PatternAnalyzer extends Analyzer {
/** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");

View File

@ -52,7 +52,7 @@ import java.util.Map;
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
public class DutchAnalyzer extends Analyzer {
public final class DutchAnalyzer extends Analyzer {
/**
* List of typical Dutch stopwords.
* @deprecated use {@link #getDefaultStopSet()} instead
@ -119,7 +119,6 @@ public class DutchAnalyzer extends Analyzer {
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
this.matchVersion = matchVersion;
setOverridesTokenStreamMethod(DutchAnalyzer.class);
}
/**
@ -151,7 +150,6 @@ public class DutchAnalyzer extends Analyzer {
*/
public DutchAnalyzer(Version matchVersion, File stopwords) {
// this is completely broken!
setOverridesTokenStreamMethod(DutchAnalyzer.class);
try {
stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
} catch (IOException e) {
@ -243,13 +241,6 @@ public class DutchAnalyzer extends Analyzer {
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
if (overridesTokenStreamMethod) {
// LUCENE-1678: force fallback to tokenStream() if we
// have been subclassed and that subclass overrides
// tokenStream but not reusableTokenStream
return tokenStream(fieldName, reader);
}
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();

View File

@ -43,7 +43,7 @@ import java.util.*;
* stop words found in an already existing index.
* </p>
*/
public class QueryAutoStopWordAnalyzer extends Analyzer {
public final class QueryAutoStopWordAnalyzer extends Analyzer {
Analyzer delegate;
HashMap<String,HashSet<String>> stopWordsPerField = new HashMap<String,HashSet<String>>();
//The default maximum percentage (40%) of index documents which
@ -58,7 +58,6 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
*/
public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer delegate) {
this.delegate = delegate;
setOverridesTokenStreamMethod(QueryAutoStopWordAnalyzer.class);
this.matchVersion = matchVersion;
}
@ -198,13 +197,6 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
if (overridesTokenStreamMethod) {
// LUCENE-1678: force fallback to tokenStream() if we
// have been subclassed and that subclass overrides
// tokenStream but not reusableTokenStream
return tokenStream(fieldName, reader);
}
/* map of SavedStreams for each field */
Map<String,SavedStreams> streamMap = (Map<String,SavedStreams>) getPreviousTokenStream();
if (streamMap == null) {

View File

@ -31,7 +31,7 @@ import org.apache.lucene.util.Version;
* A shingle is another name for a token based n-gram.
* </p>
*/
public class ShingleAnalyzerWrapper extends Analyzer {
public final class ShingleAnalyzerWrapper extends Analyzer {
protected Analyzer defaultAnalyzer;
protected int maxShingleSize = 2;
@ -40,7 +40,6 @@ public class ShingleAnalyzerWrapper extends Analyzer {
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
super();
this.defaultAnalyzer = defaultAnalyzer;
setOverridesTokenStreamMethod(ShingleAnalyzerWrapper.class);
}
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
@ -54,7 +53,6 @@ public class ShingleAnalyzerWrapper extends Analyzer {
public ShingleAnalyzerWrapper(Version matchVersion) {
super();
this.defaultAnalyzer = new StandardAnalyzer(matchVersion);
setOverridesTokenStreamMethod(ShingleAnalyzerWrapper.class);
}
/**
@ -119,13 +117,6 @@ public class ShingleAnalyzerWrapper extends Analyzer {
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
if (overridesTokenStreamMethod) {
// LUCENE-1678: force fallback to tokenStream() if we
// have been subclassed and that subclass overrides
// tokenStream but not reusableTokenStream
return tokenStream(fieldName, reader);
}
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();

View File

@ -35,11 +35,10 @@ import org.apache.lucene.util.Version;
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
public class ThaiAnalyzer extends Analyzer {
public final class ThaiAnalyzer extends Analyzer {
private final Version matchVersion;
public ThaiAnalyzer(Version matchVersion) {
setOverridesTokenStreamMethod(ThaiAnalyzer.class);
this.matchVersion = matchVersion;
}
@ -59,13 +58,6 @@ public class ThaiAnalyzer extends Analyzer {
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
if (overridesTokenStreamMethod) {
// LUCENE-1678: force fallback to tokenStream() if we
// have been subclassed and that subclass overrides
// tokenStream but not reusableTokenStream
return tokenStream(fieldName, reader);
}
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();

View File

@ -21,13 +21,9 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
/**
@ -68,24 +64,6 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
checkReuse(a, "Tischen", "tisch");
}
/**
* subclass that acts just like whitespace analyzer for testing
*/
private class GermanSubclassAnalyzer extends GermanAnalyzer {
public GermanSubclassAnalyzer(Version matchVersion) {
super(matchVersion);
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new WhitespaceTokenizer(reader);
}
}
public void testLUCENE1678BWComp() throws Exception {
checkReuse(new GermanSubclassAnalyzer(Version.LUCENE_CURRENT), "Tischen", "Tischen");
}
/*
* Test that changes to the exclusion table are applied immediately
* when using reusable token streams.

View File

@ -18,12 +18,9 @@ package org.apache.lucene.analysis.nl;
*/
import java.io.File;
import java.io.Reader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
/**
@ -127,27 +124,6 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
checkOneTermReuse(a, "lichamelijkheden", "licham");
}
/**
* subclass that acts just like whitespace analyzer for testing
*/
private class DutchSubclassAnalyzer extends DutchAnalyzer {
public DutchSubclassAnalyzer(Version matchVersion) {
super(matchVersion);
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new WhitespaceTokenizer(reader);
}
}
public void testLUCENE1678BWComp() throws Exception {
Analyzer a = new DutchSubclassAnalyzer(Version.LUCENE_CURRENT);
checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
checkOneTermReuse(a, "lichamelijk", "lichamelijk");
checkOneTermReuse(a, "lichamelijke", "lichamelijke");
checkOneTermReuse(a, "lichamelijkheden", "lichamelijkheden");
}
/*
* Test that changes to the exclusion table are applied immediately
* when using reusable token streams.

View File

@ -148,27 +148,6 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
}
/**
* subclass that acts just like whitespace analyzer for testing
*/
private class QueryAutoStopWordSubclassAnalyzer extends QueryAutoStopWordAnalyzer {
public QueryAutoStopWordSubclassAnalyzer(Version matchVersion) {
super(matchVersion, new WhitespaceAnalyzer());
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new WhitespaceTokenizer(reader);
}
}
public void testLUCENE1678BWComp() throws Exception {
QueryAutoStopWordAnalyzer a = new QueryAutoStopWordSubclassAnalyzer(Version.LUCENE_CURRENT);
a.addStopWords(reader, "repetitiveField", 10);
int numHits = search(a, "repetitiveField:boring");
assertFalse(numHits == 0);
}
/*
* analyzer that does not support reuse
* it is LetterTokenizer on odd invocations, WhitespaceTokenizer on even.

View File

@ -213,28 +213,6 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
new int[] { 1, 0, 1, 0, 1, 0, 1 });
}
/**
* subclass that acts just like whitespace analyzer for testing
*/
private class ShingleWrapperSubclassAnalyzer extends ShingleAnalyzerWrapper {
public ShingleWrapperSubclassAnalyzer() {
super(org.apache.lucene.util.Version.LUCENE_CURRENT);
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new WhitespaceTokenizer(reader);
}
};
public void testLUCENE1678BWComp() throws Exception {
Analyzer a = new ShingleWrapperSubclassAnalyzer();
assertAnalyzesToReuse(a, "this is a test",
new String[] { "this", "is", "a", "test" },
new int[] { 0, 5, 8, 10 },
new int[] { 4, 7, 9, 14 });
}
/*
* analyzer that does not support reuse
* it is LetterTokenizer on odd invocations, WhitespaceTokenizer on even.

View File

@ -17,12 +17,7 @@ package org.apache.lucene.analysis.th;
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
/**
@ -124,22 +119,4 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
}
/**
* subclass that acts just like whitespace analyzer for testing
*/
private class ThaiSubclassAnalyzer extends ThaiAnalyzer {
public ThaiSubclassAnalyzer(Version matchVersion) {
super(matchVersion);
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new WhitespaceTokenizer(reader);
}
}
public void testLUCENE1678BWComp() throws Exception {
ThaiSubclassAnalyzer a = new ThaiSubclassAnalyzer(Version.LUCENE_CURRENT);
assertAnalyzesToReuse(a, "การที่ได้ต้องแสดงว่างานดี", new String[] { "การที่ได้ต้องแสดงว่างานดี" });
}
}

View File

@ -58,7 +58,7 @@ import org.apache.lucene.util.Version;
* supported anymore in such a case.</font>
* </p>
*/
public class SmartChineseAnalyzer extends Analyzer {
public final class SmartChineseAnalyzer extends Analyzer {
private final Set<?> stopWords;

View File

@ -69,7 +69,7 @@ import java.io.IOException;
* java.text.Collator over several languages.
* </p>
*/
public class ICUCollationKeyAnalyzer extends Analyzer {
public final class ICUCollationKeyAnalyzer extends Analyzer {
private Collator collator;
public ICUCollationKeyAnalyzer(Collator collator) {

View File

@ -40,7 +40,7 @@ import java.util.Set;
* </ul>
* </p>
*/
public class SnowballAnalyzer extends Analyzer {
public final class SnowballAnalyzer extends Analyzer {
private String name;
private Set<?> stopSet;
private final Version matchVersion;
@ -48,7 +48,6 @@ public class SnowballAnalyzer extends Analyzer {
/** Builds the named analyzer with no stop words. */
public SnowballAnalyzer(Version matchVersion, String name) {
this.name = name;
setOverridesTokenStreamMethod(SnowballAnalyzer.class);
this.matchVersion = matchVersion;
}
@ -80,7 +79,7 @@ public class SnowballAnalyzer extends Analyzer {
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
}
/** Returns a (possibly reused) {@link StandardTokenizer} filtered by a
* {@link StandardFilter}, a {@link LowerCaseFilter},
@ -88,13 +87,6 @@ public class SnowballAnalyzer extends Analyzer {
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
if (overridesTokenStreamMethod) {
// LUCENE-1678: force fallback to tokenStream() if we
// have been subclassed and that subclass overrides
// tokenStream but not reusableTokenStream
return tokenStream(fieldName, reader);
}
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();

View File

@ -17,11 +17,8 @@ package org.apache.lucene.analysis.snowball;
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.index.Payload;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
@ -86,26 +83,6 @@ public class TestSnowball extends BaseTokenStreamTestCase {
new String[]{"she", "abhor", "him"});
}
/**
* subclass that acts just like whitespace analyzer for testing
*/
private class SnowballSubclassAnalyzer extends SnowballAnalyzer {
public SnowballSubclassAnalyzer(String name) {
super(Version.LUCENE_CURRENT, name);
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new WhitespaceTokenizer(reader);
}
}
public void testLUCENE1678BWComp() throws Exception {
Analyzer a = new SnowballSubclassAnalyzer("English");
assertAnalyzesToReuse(a, "he abhorred accents",
new String[]{"he", "abhorred", "accents"});
}
public void testFilterTokens() throws Exception {
SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
TermAttribute termAtt = filter.getAttribute(TermAttribute.class);