LUCENE-2051: Contrib Analyzer Setters should be deprecated and replaced with ctor arguments, thanks to Simon Willnauer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@880715 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2009-11-16 11:48:37 +00:00
parent 7370094ead
commit 00f07ee460
13 changed files with 478 additions and 128 deletions

View File

@ -154,6 +154,10 @@ New features
* LUCENE-2041: Parallelize the rest of ParallelMultiSearcher. Lots of
code refactoring and Java 5 concurrent support in MultiSearcher.
(Joey Surls, Simon Willnauer via Uwe Schindler)
* LUCENE-2051: Add CharArraySet.copy() as a simple method to copy
any Set<?> to a CharArraySet; the copy is optimized if the given
Set<?> is already a CharArraySet, as sketched below.
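A minimal usage sketch of the new helper (variable names are illustrative and assume the usual java.util and org.apache.lucene.analysis imports):

    Set<String> userWords = new HashSet<String>(Arrays.asList("foo", "bar"));
    CharArraySet copied = CharArraySet.copy(userWords);  // plain Set: copied case-sensitively
    CharArraySet again = CharArraySet.copy(copied);      // already a CharArraySet: ignoreCase is preserved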
Optimizations

View File

@ -20,6 +20,10 @@ API Changes
text exactly the same as LowerCaseFilter. Please use LowerCaseFilter
instead, which has the same functionality. (Robert Muir)
* LUCENE-2051: Contrib Analyzer setters were deprecated and replaced
with ctor arguments and a Version number; stop word lists were also
unified (see the migration sketch below). (Simon Willnauer)
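A hedged before/after migration sketch, using BrazilianAnalyzer as an example (the Version constant and the sample words are assumptions, not taken from this patch):

    // Before: mutable, setter-based configuration (now deprecated)
    BrazilianAnalyzer before = new BrazilianAnalyzer(Version.LUCENE_CURRENT);
    before.setStemExclusionTable("quilometro");

    // After: stop words and stem exclusions are passed through the ctor and kept immutable
    Set<String> exclusions = new HashSet<String>(Arrays.asList("quilometro"));
    BrazilianAnalyzer after = new BrazilianAnalyzer(Version.LUCENE_CURRENT,
        BrazilianAnalyzer.getDefaultStopSet(), exclusions);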
Bug fixes
* LUCENE-1781: Fixed various issues with the lat/lng bounding box
@ -59,6 +63,7 @@ Optimizations
Previous versions were loading the stopword files each time a new
instance was created. This might improve performance for applications
creating lots of instances of these Analyzers. (Simon Willnauer)
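The mechanism behind this entry is the static holder idiom that the analyzers in this commit share; a condensed sketch of the pattern as it appears in the code below:

    private static class DefaultSetHolder {
      // built exactly once, when the holder class is first loaded
      static final Set<?> DEFAULT_STOP_SET = CharArraySet.unmodifiableSet(
          new CharArraySet(Arrays.asList(STOP_WORDS), false));
    }

    public static Set<?> getDefaultStopSet() {
      return DefaultSetHolder.DEFAULT_STOP_SET;  // no per-instance loading or set building
    }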
Documentation
* LUCENE-1916: Translated documentation in the smartcn hhmm package.
@ -72,7 +77,6 @@ Build
* LUCENE-2031: Moved PatternAnalyzer from contrib/memory into
contrib/analyzers/common, under miscellaneous. (Robert Muir)
Test Cases
======================= Release 2.9.1 2009-11-06 =======================
Changes in backwards compatibility policy

View File

@ -23,11 +23,11 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
@ -67,7 +67,8 @@ public final class ArabicAnalyzer extends Analyzer {
*/
private final Set<?> stoptable;
/**
* The comment character in the stopwords file. All lines prefixed with this will be ignored
* The comment character in the stopwords file. All lines prefixed with this will be ignored
* @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
*/
public static final String STOPWORDS_COMMENT = "#";
@ -116,32 +117,44 @@ public final class ArabicAnalyzer extends Analyzer {
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
public ArabicAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
this.matchVersion = matchVersion;
stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
*/
public ArabicAnalyzer( Version matchVersion, String... stopwords ) {
stoptable = StopFilter.makeStopSet( stopwords );
this.matchVersion = matchVersion;
this(matchVersion, StopFilter.makeStopSet( stopwords ));
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
*/
public ArabicAnalyzer( Version matchVersion, Hashtable<?,?> stopwords ) {
stoptable = new HashSet(stopwords.keySet());
this.matchVersion = matchVersion;
this(matchVersion, stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
* @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
*/
public ArabicAnalyzer( Version matchVersion, File stopwords ) throws IOException {
stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT);
this.matchVersion = matchVersion;
this(matchVersion, WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT));
}
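A hedged replacement for the deprecated File-based ctor above (the file name is an assumption; assumes java.io.File, java.util.Set, WordlistLoader and Version imports):

    Set<String> stopwords = WordlistLoader.getWordSet(new File("my-arabic-stopwords.txt"),
        ArabicAnalyzer.STOPWORDS_COMMENT);
    ArabicAnalyzer analyzer = new ArabicAnalyzer(Version.LUCENE_CURRENT, stopwords);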

View File

@ -20,12 +20,14 @@ package org.apache.lucene.analysis.br;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.Collections;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
@ -51,7 +53,9 @@ public final class BrazilianAnalyzer extends Analyzer {
/**
* List of typical Brazilian Portuguese stopwords.
* @deprecated use {@link #getDefaultStopSet()} instead
*/
// TODO make this private in 3.1
public final static String[] BRAZILIAN_STOP_WORDS = {
"a","ainda","alem","ambas","ambos","antes",
"ao","aonde","aos","apos","aquele","aqueles",
@ -73,52 +77,98 @@ public final class BrazilianAnalyzer extends Analyzer {
"suas","tal","tambem","teu","teus","toda","todas","todo",
"todos","tua","tuas","tudo","um","uma","umas","uns"};
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET = CharArraySet
.unmodifiableSet(new CharArraySet(Arrays.asList(BRAZILIAN_STOP_WORDS),
false));
}
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
private Set stoptable = Collections.emptySet();
private final Set<?> stoptable;
/**
* Contains words that should be indexed but not stemmed.
*/
private Set excltable = Collections.emptySet();
private final Version matchVersion;
// TODO make this private in 3.1
private Set<?> excltable = Collections.emptySet();
private final Version matchVersion;
/**
* Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
*/
public BrazilianAnalyzer(Version matchVersion) {
stoptable = StopFilter.makeStopSet( BRAZILIAN_STOP_WORDS );
this.matchVersion = matchVersion;
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
this.matchVersion = matchVersion;
}
/**
* Builds an analyzer with the given stop words and stemming exclusion words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
* @param stemExclusionSet
* a stemming exclusion set
*/
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords,
Set<?> stemExclusionSet) {
this(matchVersion, stopwords);
excltable = CharArraySet.unmodifiableSet(CharArraySet
.copy(stemExclusionSet));
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
*/
public BrazilianAnalyzer( Version matchVersion, String... stopwords ) {
stoptable = StopFilter.makeStopSet( stopwords );
this.matchVersion = matchVersion;
}
public BrazilianAnalyzer(Version matchVersion, String... stopwords) {
this(matchVersion, StopFilter.makeStopSet(stopwords));
}
/**
* Builds an analyzer with the given stop words.
*/
public BrazilianAnalyzer( Version matchVersion, Map stopwords ) {
stoptable = new HashSet(stopwords.keySet());
this.matchVersion = matchVersion;
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
*/
public BrazilianAnalyzer(Version matchVersion, Map<?,?> stopwords) {
this(matchVersion, stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words.
*/
public BrazilianAnalyzer( Version matchVersion, File stopwords ) throws IOException {
stoptable = WordlistLoader.getWordSet( stopwords );
this.matchVersion = matchVersion;
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
*/
public BrazilianAnalyzer(Version matchVersion, File stopwords)
throws IOException {
this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
/**
* Builds an exclusionlist from an array of Strings.
* @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable( String... exclusionlist ) {
excltable = StopFilter.makeStopSet( exclusionlist );
@ -126,13 +176,15 @@ public final class BrazilianAnalyzer extends Analyzer {
}
/**
* Builds an exclusionlist from a {@link Map}.
* @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable( Map exclusionlist ) {
excltable = new HashSet(exclusionlist.keySet());
public void setStemExclusionTable( Map<?,?> exclusionlist ) {
excltable = new HashSet<Object>(exclusionlist.keySet());
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
* Builds an exclusionlist from the words contained in the given file.
* @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable( File exclusionlist ) throws IOException {
excltable = WordlistLoader.getWordSet( exclusionlist );

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cjk;
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@ -25,6 +26,7 @@ import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Set;
@ -39,7 +41,10 @@ public class CJKAnalyzer extends Analyzer {
/**
* An array containing some common English words that are not usually
* useful for searching and some double-byte interpunctions.
* @deprecated use {@link #getDefaultStopSet()} instead
*/
// TODO make this final in 3.1 -
// this might be revised and merged with StopFilter stop words too
public final static String[] STOP_WORDS = {
"a", "and", "are", "as", "at", "be",
"but", "by", "for", "if", "in",
@ -53,10 +58,23 @@ public class CJKAnalyzer extends Analyzer {
//~ Instance fields --------------------------------------------------------
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET = CharArraySet
.unmodifiableSet(new CharArraySet(Arrays.asList(STOP_WORDS),
false));
}
/**
* stop word list
*/
private final Set stopTable;
private final Set<?> stopTable;
private final Version matchVersion;
//~ Constructors -----------------------------------------------------------
@ -65,7 +83,19 @@ public class CJKAnalyzer extends Analyzer {
* Builds an analyzer which removes words in {@link #STOP_WORDS}.
*/
public CJKAnalyzer(Version matchVersion) {
stopTable = StopFilter.makeStopSet(STOP_WORDS);
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public CJKAnalyzer(Version matchVersion, Set<?> stopwords){
stopTable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
this.matchVersion = matchVersion;
}
@ -73,6 +103,7 @@ public class CJKAnalyzer extends Analyzer {
* Builds an analyzer which removes words in the provided array.
*
* @param stopWords stop word array
* @deprecated use {@link #CJKAnalyzer(Version, Set)} instead
*/
public CJKAnalyzer(Version matchVersion, String... stopWords) {
stopTable = StopFilter.makeStopSet(stopWords);

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cz;
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
@ -29,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
import java.io.*;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.Collections;
@ -48,7 +50,9 @@ public final class CzechAnalyzer extends Analyzer {
/**
* List of typical stopwords.
* @deprecated use {@link #getDefaultStopSet()} instead
*/
// TODO make this private in 3.1
public final static String[] CZECH_STOP_WORDS = {
"a","s","k","o","i","u","v","z","dnes","cz","t\u00edmto","bude\u0161","budem",
"byli","jse\u0161","m\u016fj","sv\u00fdm","ta","tomto","tohle","tuto","tyto",
@ -69,51 +73,84 @@ public final class CzechAnalyzer extends Analyzer {
"j\u00ed","ji","m\u011b","mne","jemu","tomu","t\u011bm","t\u011bmu","n\u011bmu","n\u011bmu\u017e",
"jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e",
};
/**
* Returns a set of default Czech stopwords
* @return a set of default Czech stopwords
*/
public static final Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_SET;
}
private static class DefaultSetHolder {
private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
Arrays.asList(CZECH_STOP_WORDS), false));
}
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
private Set stoptable;
private final Version matchVersion;
// TODO make this final in 3.1
private Set<?> stoptable;
private final Version matchVersion;
/**
* Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}).
*/
public CzechAnalyzer(Version matchVersion) {
stoptable = StopFilter.makeStopSet( CZECH_STOP_WORDS );
this.matchVersion = matchVersion;
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
this.matchVersion = matchVersion;
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
*/
public CzechAnalyzer(Version matchVersion, String... stopwords) {
this(matchVersion, StopFilter.makeStopSet( stopwords ));
}
/**
* Builds an analyzer with the given stop words.
*
* @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
*/
public CzechAnalyzer(Version matchVersion, HashSet<?> stopwords) {
this(matchVersion, (Set<?>)stopwords);
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
*/
public CzechAnalyzer(Version matchVersion, String... stopwords) {
stoptable = StopFilter.makeStopSet( stopwords );
this.matchVersion = matchVersion;
}
public CzechAnalyzer(Version matchVersion, HashSet stopwords) {
stoptable = stopwords;
this.matchVersion = matchVersion;
}
/**
* Builds an analyzer with the given stop words.
*/
public CzechAnalyzer(Version matchVersion, File stopwords ) throws IOException {
stoptable = WordlistLoader.getWordSet( stopwords );
this.matchVersion = matchVersion;
public CzechAnalyzer(Version matchVersion, File stopwords ) throws IOException {
this(matchVersion, (Set<?>)WordlistLoader.getWordSet( stopwords ));
}
/**
* Loads stopwords hash from resource stream (file, database...).
* @param wordfile File containing the wordlist
* @param encoding Encoding used (win-1250, iso-8859-2, ...), null for default system encoding
* @deprecated use {@link WordlistLoader#getWordSet(Reader, String) }
* and {@link #CzechAnalyzer(Version, Set)} instead
*/
public void loadStopWords( InputStream wordfile, String encoding ) {
setPreviousTokenStream(null); // force a new stopfilter to be created
if ( wordfile == null ) {
stoptable = new HashSet();
stoptable = Collections.emptySet();
return;
}
try {
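// As the deprecation note above suggests, the stream-based loader can be replaced with
// WordlistLoader plus the new ctor; a hedged sketch (file name and encoding are assumptions):
//   Reader reader = new InputStreamReader(new FileInputStream("stopwords_cz.txt"), "ISO-8859-2");
//   Set<String> stopwords = WordlistLoader.getWordSet(reader, "#");
//   CzechAnalyzer analyzer = new CzechAnalyzer(matchVersion, stopwords);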

View File

@ -21,11 +21,13 @@ package org.apache.lucene.analysis.de;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
@ -53,7 +55,9 @@ public class GermanAnalyzer extends Analyzer {
/**
* List of typical german stopwords.
* @deprecated use {@link #getDefaultStopSet()} instead
*/
//TODO make this private in 3.1
public final static String[] GERMAN_STOP_WORDS = {
"einer", "eine", "eines", "einem", "einen",
"der", "die", "das", "dass", "daß",
@ -68,58 +72,99 @@ public class GermanAnalyzer extends Analyzer {
"mein", "sein", "kein",
"durch", "wegen", "wird"
};
/**
* Returns a set of default German stopwords
* @return a set of default German stopwords
*/
public static final Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_SET;
}
private static class DefaultSetHolder {
private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
Arrays.asList(GERMAN_STOP_WORDS), false));
}
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
private Set stopSet = new HashSet();
//TODO make this final in 3.1
private Set<?> stopSet;
/**
* Contains words that should be indexed but not stemmed.
*/
private Set exclusionSet = new HashSet();
// TODO make this final in 3.1
private Set<?> exclusionSet;
private final Version matchVersion;
/**
* Builds an analyzer with the default stop words:
* {@link #GERMAN_STOP_WORDS}.
* {@link #getDefaultStopSet()}.
*/
public GermanAnalyzer(Version matchVersion) {
stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public GermanAnalyzer(Version matchVersion, Set<?> stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
* @param stemExclusionSet
* a stemming exclusion set
*/
public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
setOverridesTokenStreamMethod(GermanAnalyzer.class);
this.matchVersion = matchVersion;
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #GermanAnalyzer(Version, Set)}
*/
public GermanAnalyzer(Version matchVersion, String... stopwords) {
stopSet = StopFilter.makeStopSet(stopwords);
setOverridesTokenStreamMethod(GermanAnalyzer.class);
this.matchVersion = matchVersion;
this(matchVersion, StopFilter.makeStopSet(stopwords));
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #GermanAnalyzer(Version, Set)}
*/
public GermanAnalyzer(Version matchVersion, Map stopwords) {
stopSet = new HashSet(stopwords.keySet());
setOverridesTokenStreamMethod(GermanAnalyzer.class);
this.matchVersion = matchVersion;
public GermanAnalyzer(Version matchVersion, Map<?,?> stopwords) {
this(matchVersion, stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #GermanAnalyzer(Version, Set)}
*/
public GermanAnalyzer(Version matchVersion, File stopwords) throws IOException {
stopSet = WordlistLoader.getWordSet(stopwords);
setOverridesTokenStreamMethod(GermanAnalyzer.class);
this.matchVersion = matchVersion;
this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
/**
* Builds an exclusionlist from an array of Strings.
* @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(String[] exclusionlist) {
exclusionSet = StopFilter.makeStopSet(exclusionlist);
@ -128,6 +173,7 @@ public class GermanAnalyzer extends Analyzer {
/**
* Builds an exclusionlist from a {@link Map}
* @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(Map exclusionlist) {
exclusionSet = new HashSet(exclusionlist.keySet());
@ -136,6 +182,7 @@ public class GermanAnalyzer extends Analyzer {
/**
* Builds an exclusionlist from the words contained in the given file.
* @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(File exclusionlist) throws IOException {
exclusionSet = WordlistLoader.getWordSet(exclusionlist);

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.el;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@ -27,7 +28,7 @@ import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;
@ -58,39 +59,61 @@ public final class GreekAnalyzer extends Analyzer
"εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ", "οπωσ", "ομωσ",
"ισωσ", "οσο", "οτι"
};
/**
* Returns a set of default Greek stopwords
* @return a set of default Greek stopwords
*/
public static final Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_SET;
}
private static class DefaultSetHolder {
private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
Arrays.asList(GREEK_STOP_WORDS), false));
}
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
private Set stopSet = new HashSet();
private final Set<?> stopSet;
private final Version matchVersion;
public GreekAnalyzer(Version matchVersion) {
super();
stopSet = StopFilter.makeStopSet(GREEK_STOP_WORDS);
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
this.matchVersion = matchVersion;
}
/**
* Builds an analyzer with the given stop words.
* @param stopwords Array of stopwords to use.
* @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
*/
public GreekAnalyzer(Version matchVersion, String... stopwords)
{
super();
stopSet = StopFilter.makeStopSet(stopwords);
this.matchVersion = matchVersion;
this(matchVersion, StopFilter.makeStopSet(stopwords));
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
*/
public GreekAnalyzer(Version matchVersion, Map stopwords)
public GreekAnalyzer(Version matchVersion, Map<?,?> stopwords)
{
super();
stopSet = new HashSet(stopwords.keySet());
this.matchVersion = matchVersion;
this(matchVersion, stopwords.keySet());
}
/**

View File

@ -23,11 +23,11 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
@ -60,7 +60,7 @@ public final class PersianAnalyzer extends Analyzer {
/**
* Contains the stopwords used with the StopFilter.
*/
private final Set stoptable;
private final Set<?> stoptable;
/**
* The comment character in the stopwords file. All lines prefixed with this
@ -72,7 +72,7 @@ public final class PersianAnalyzer extends Analyzer {
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
public static Set<String> getDefaultStopSet(){
public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
@ -81,7 +81,7 @@ public final class PersianAnalyzer extends Analyzer {
* accesses the static final set the first time.
*/
private static class DefaultSetHolder {
static final Set<String> DEFAULT_STOP_SET;
static final Set<?> DEFAULT_STOP_SET;
static {
try {
@ -114,33 +114,45 @@ public final class PersianAnalyzer extends Analyzer {
* {@link #DEFAULT_STOPWORD_FILE}.
*/
public PersianAnalyzer(Version matchVersion) {
stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public PersianAnalyzer(Version matchVersion, Set<?> stopwords){
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
this.matchVersion = matchVersion;
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
*/
public PersianAnalyzer(Version matchVersion, String... stopwords) {
stoptable = StopFilter.makeStopSet(stopwords);
this.matchVersion = matchVersion;
this(matchVersion, StopFilter.makeStopSet(stopwords));
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
*/
public PersianAnalyzer(Version matchVersion, Hashtable stopwords) {
stoptable = new HashSet(stopwords.keySet());
this.matchVersion = matchVersion;
public PersianAnalyzer(Version matchVersion, Hashtable<?, ?> stopwords) {
this(matchVersion, stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words. Lines can be commented out
* using {@link #STOPWORDS_COMMENT}
* @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
*/
public PersianAnalyzer(Version matchVersion, File stopwords) throws IOException {
stoptable = WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT);
this.matchVersion = matchVersion;
this(matchVersion, WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT));
}
/**

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.fr;
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
@ -31,6 +32,7 @@ import org.apache.lucene.util.Version;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
@ -60,7 +62,9 @@ public final class FrenchAnalyzer extends Analyzer {
/**
* Extended list of typical French stopwords.
* @deprecated use {@link #getDefaultStopSet()} instead
*/
// TODO make this final in 3.1
public final static String[] FRENCH_STOP_WORDS = {
"a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
"autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
@ -89,41 +93,87 @@ public final class FrenchAnalyzer extends Analyzer {
/**
* Contains the stopwords used with the {@link StopFilter}.
*/
private Set stoptable = new HashSet();
private final Set<?> stoptable;
/**
* Contains words that should be indexed but not stemmed.
*/
private Set excltable = new HashSet();
//TODO make this final in 3.0
private Set<?> excltable = new HashSet();
private final Version matchVersion;
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET = CharArraySet
.unmodifiableSet(new CharArraySet(Arrays.asList(FRENCH_STOP_WORDS),
false));
}
/**
* Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}).
*/
public FrenchAnalyzer(Version matchVersion) {
stoptable = StopFilter.makeStopSet(FRENCH_STOP_WORDS);
this.matchVersion = matchVersion;
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords){
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
* @param stemExclusionSet
* a stemming exclusion set
*/
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords,
Set<?> stemExclusionSet) {
this.matchVersion = matchVersion;
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
this.excltable = CharArraySet.unmodifiableSet(CharArraySet
.copy(stemExclusionSet));
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
*/
public FrenchAnalyzer(Version matchVersion, String... stopwords) {
stoptable = StopFilter.makeStopSet(stopwords);
this.matchVersion = matchVersion;
this(matchVersion, StopFilter.makeStopSet(stopwords));
}
/**
* Builds an analyzer with the given stop words.
* @throws IOException
* @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
*/
public FrenchAnalyzer(Version matchVersion, File stopwords) throws IOException {
stoptable = new HashSet(WordlistLoader.getWordSet(stopwords));
this.matchVersion = matchVersion;
this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
/**
* Builds an exclusionlist from an array of Strings.
* @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(String... exclusionlist) {
excltable = StopFilter.makeStopSet(exclusionlist);
@ -132,6 +182,7 @@ public final class FrenchAnalyzer extends Analyzer {
/**
* Builds an exclusionlist from a Map.
* @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(Map exclusionlist) {
excltable = new HashSet(exclusionlist.keySet());
@ -141,6 +192,7 @@ public final class FrenchAnalyzer extends Analyzer {
/**
* Builds an exclusionlist from the words contained in the given file.
* @throws IOException
* @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(File exclusionlist) throws IOException {
excltable = new HashSet(WordlistLoader.getWordSet(exclusionlist));

View File

@ -18,9 +18,11 @@ package org.apache.lucene.analysis.nl;
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
@ -29,6 +31,8 @@ import org.apache.lucene.util.Version;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
@ -51,6 +55,7 @@ import java.util.Map;
public class DutchAnalyzer extends Analyzer {
/**
* List of typical Dutch stopwords.
* @deprecated use {@link #getDefaultStopSet()} instead
*/
public final static String[] DUTCH_STOP_WORDS =
{
@ -65,19 +70,32 @@ public class DutchAnalyzer extends Analyzer {
"wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
"uw", "iemand", "geweest", "andere"
};
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET = CharArraySet
.unmodifiableSet(new CharArraySet(Arrays.asList(DUTCH_STOP_WORDS),
false));
}
/**
* Contains the stopwords used with the StopFilter.
*/
private Set stoptable = new HashSet();
private final Set<?> stoptable;
/**
* Contains words that should be indexed but not stemmed.
*/
private Set excltable = new HashSet();
private Set<?> excltable = Collections.emptySet();
private Map stemdict = new HashMap();
private Map<String, String> stemdict = new HashMap<String, String>();
private final Version matchVersion;
/**
@ -86,13 +104,22 @@ public class DutchAnalyzer extends Analyzer {
*
*/
public DutchAnalyzer(Version matchVersion) {
setOverridesTokenStreamMethod(DutchAnalyzer.class);
stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
stemdict.put("fiets", "fiets"); //otherwise fiet
stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
stemdict.put("ei", "eier");
stemdict.put("kind", "kinder");
}
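/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/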
public DutchAnalyzer(Version matchVersion, Set<?> stopwords){
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
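/**
* Builds an analyzer with the given stop words and a stem exclusion set
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
* @param stemExclusionTable
* a stemming exclusion set
*/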
public DutchAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable){
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionTable));
this.matchVersion = matchVersion;
setOverridesTokenStreamMethod(DutchAnalyzer.class);
}
/**
@ -100,30 +127,30 @@ public class DutchAnalyzer extends Analyzer {
*
* @param matchVersion
* @param stopwords
* @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
*/
public DutchAnalyzer(Version matchVersion, String... stopwords) {
setOverridesTokenStreamMethod(DutchAnalyzer.class);
stoptable = StopFilter.makeStopSet(stopwords);
this.matchVersion = matchVersion;
this(matchVersion, StopFilter.makeStopSet(stopwords));
}
/**
* Builds an analyzer with the given stop words.
*
* @param stopwords
* @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
*/
public DutchAnalyzer(Version matchVersion, HashSet stopwords) {
setOverridesTokenStreamMethod(DutchAnalyzer.class);
stoptable = stopwords;
this.matchVersion = matchVersion;
public DutchAnalyzer(Version matchVersion, HashSet<?> stopwords) {
this(matchVersion, (Set<?>)stopwords);
}
/**
* Builds an analyzer with the given stop words.
*
* @param stopwords
* @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
*/
public DutchAnalyzer(Version matchVersion, File stopwords) {
// this is completely broken!
setOverridesTokenStreamMethod(DutchAnalyzer.class);
try {
stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
@ -138,6 +165,7 @@ public class DutchAnalyzer extends Analyzer {
* Builds an exclusionlist from an array of Strings.
*
* @param exclusionlist
* @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(String... exclusionlist) {
excltable = StopFilter.makeStopSet(exclusionlist);
@ -146,14 +174,16 @@ public class DutchAnalyzer extends Analyzer {
/**
* Builds an exclusionlist from a Hashtable.
* @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(HashSet exclusionlist) {
public void setStemExclusionTable(HashSet<?> exclusionlist) {
excltable = exclusionlist;
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
* Builds an exclusionlist from the words contained in the given file.
* @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(File exclusionlist) {
try {
@ -172,7 +202,7 @@ public class DutchAnalyzer extends Analyzer {
*/
public void setStemDictionary(File stemdictFile) {
try {
stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
stemdict = WordlistLoader.getStemDict(stemdictFile);
setPreviousTokenStream(null); // force a new stemmer to be created
} catch (IOException e) {
// TODO: throw IOException
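// A hedged sketch of supplying stem overrides from a file (the file name is an assumption,
// and the tab-separated "word<TAB>stem" line format is my reading of getStemDict, not stated here):
//   DutchAnalyzer analyzer = new DutchAnalyzer(matchVersion);
//   analyzer.setStemDictionary(new File("dutch-stem-overrides.txt"));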

View File

@ -19,11 +19,12 @@ package org.apache.lucene.analysis.ru;
import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
@ -55,37 +56,53 @@ public final class RussianAnalyzer extends Analyzer
"тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей",
"чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
};
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET = CharArraySet
.unmodifiableSet(new CharArraySet(Arrays.asList(RUSSIAN_STOP_WORDS),
false));
}
/**
* Contains the stopwords used with the StopFilter.
*/
private Set stopSet = new HashSet();
private final Set<?> stopSet;
private final Version matchVersion;
public RussianAnalyzer(Version matchVersion) {
this(matchVersion, RUSSIAN_STOP_WORDS);
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words.
* @deprecated use {@link #RussianAnalyzer(Version, Set)} instead
*/
public RussianAnalyzer(Version matchVersion, String... stopwords)
{
super();
stopSet = StopFilter.makeStopSet(stopwords);
public RussianAnalyzer(Version matchVersion, String... stopwords) {
this(matchVersion, StopFilter.makeStopSet(stopwords));
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
this.matchVersion = matchVersion;
}
/**
* Builds an analyzer with the given stop words.
* TODO: create a Set version of this ctor
* @deprecated use {@link #RussianAnalyzer(Version, Set)} instead
*/
public RussianAnalyzer(Version matchVersion, Map stopwords)
public RussianAnalyzer(Version matchVersion, Map<?,?> stopwords)
{
super();
stopSet = new HashSet(stopwords.keySet());
this.matchVersion = matchVersion;
this(matchVersion, stopwords.keySet());
}
/**

View File

@ -4,6 +4,7 @@ import java.util.AbstractSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.Set;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -47,6 +48,7 @@ public class CharArraySet extends AbstractSet<Object> {
private char[][] entries;
private int count;
private final boolean ignoreCase;
public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet(new CharArraySet(0, false));
/** Create set with enough capacity to hold startSize
* terms */
@ -263,6 +265,11 @@ public class CharArraySet extends AbstractSet<Object> {
public static CharArraySet unmodifiableSet(CharArraySet set) {
if (set == null)
throw new NullPointerException("Given set is null");
if (set == EMPTY_SET)
return EMPTY_SET;
if (set instanceof UnmodifiableCharArraySet)
return set;
/*
* Instead of delegating calls to the given set copy the low-level values to
* the unmodifiable Subclass
@ -270,6 +277,27 @@ public class CharArraySet extends AbstractSet<Object> {
return new UnmodifiableCharArraySet(set.entries, set.ignoreCase, set.count);
}
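// A short sketch of the new EMPTY_SET handling above (values are illustrative):
//   CharArraySet empty = CharArraySet.EMPTY_SET;
//   assert CharArraySet.unmodifiableSet(empty) == empty;        // short-circuits, no extra wrapper
//   CharArraySet frozen = CharArraySet.unmodifiableSet(new CharArraySet(Arrays.asList("the"), false));
//   frozen.add("and");                                          // would throw UnsupportedOperationException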
/**
* Returns a copy of the given set as a {@link CharArraySet}. If the given set
* is a {@link CharArraySet} the ignoreCase property will be preserved.
*
* @param set
* a set to copy
* @return a copy of the given set as a {@link CharArraySet}. If the given set
* is a {@link CharArraySet} the ignoreCase property will be
* preserved.
*/
public static CharArraySet copy(Set<?> set) {
if (set == null)
throw new NullPointerException("Given set is null");
if(set == EMPTY_SET)
return EMPTY_SET;
final boolean ignoreCase = set instanceof CharArraySet ? ((CharArraySet) set).ignoreCase
: false;
return new CharArraySet(set, ignoreCase);
}
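// And a sketch of copy() itself, illustrating the ignoreCase preservation described above (values assumed):
//   CharArraySet ciSet = new CharArraySet(Arrays.asList("Stopword"), true);  // ignoreCase = true
//   CharArraySet ciCopy = CharArraySet.copy(ciSet);
//   ciCopy.contains("sToPwOrD");                          // true: ignoreCase carried over from the source
//   CharArraySet plainCopy = CharArraySet.copy(new HashSet<String>(Arrays.asList("Stopword")));
//   plainCopy.contains("stopword");                       // false: plain Sets are copied case-sensitively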
/** The Iterator<String> for this set. Strings are constructed on the fly, so
* use <code>nextCharArray</code> for more efficient access. */
public class CharArraySetIterator implements Iterator<String> {