mirror of
https://github.com/apache/lucene.git
synced 2025-02-06 10:08:58 +00:00
LUCENE-2051: Contrib Analyzer Setters should be deprecated and replaced with ctor arguments, thanks to Simon Willnauer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@880715 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7370094ead
commit
00f07ee460
@ -154,6 +154,10 @@ New features
|
||||
* LUCENE-2041: Parallelize the rest of ParallelMultiSearcher. Lots of
|
||||
code refactoring and Java 5 concurrent support in MultiSearcher.
|
||||
(Joey Surls, Simon Willnauer via Uwe Schindler)
|
||||
|
||||
* LUCENE-2051: Add CharArraySet.copy() as a simple method to copy
|
||||
any Set<?> to a CharArraySet that is optimized, if Set<?> is already
|
||||
a CharArraySet.
|
||||
|
||||
Optimizations
|
||||
|
||||
|
@ -20,6 +20,10 @@ API Changes
|
||||
text exactly the same as LowerCaseFilter. Please use LowerCaseFilter
|
||||
instead, which has the same functionality. (Robert Muir)
|
||||
|
||||
* LUCENE-2051: Contrib Analyzer setters were deprecated and replaced
|
||||
with ctor arguments / Version number. Also stop word lists
|
||||
were unified. (Simon Willnauer)
|
||||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-1781: Fixed various issues with the lat/lng bounding box
|
||||
@ -59,6 +63,7 @@ Optimizations
|
||||
Previous versions were loading the stopword files each time a new
|
||||
instance was created. This might improve performance for applications
|
||||
creating lots of instances of these Analyzers. (Simon Willnauer)
|
||||
|
||||
Documentation
|
||||
|
||||
* LUCENE-1916: Translated documentation in the smartcn hhmm package.
|
||||
@ -72,7 +77,6 @@ Build
|
||||
* LUCENE-2031: Moved PatternAnalyzer from contrib/memory into
|
||||
contrib/analyzers/common, under miscellaneous. (Robert Muir)
|
||||
|
||||
Test Cases
|
||||
======================= Release 2.9.1 2009-11-06 =======================
|
||||
|
||||
Changes in backwards compatibility policy
|
||||
|
@ -23,11 +23,11 @@ import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
@ -67,7 +67,8 @@ public final class ArabicAnalyzer extends Analyzer {
|
||||
*/
|
||||
private final Set<?> stoptable;
|
||||
/**
|
||||
* The comment character in the stopwords file. All lines prefixed with this will be ignored
|
||||
* The comment character in the stopwords file. All lines prefixed with this will be ignored
|
||||
* @deprecated use {@link WordlistLoader#getWordSet(File, String)} directly
|
||||
*/
|
||||
public static final String STOPWORDS_COMMENT = "#";
|
||||
|
||||
@ -116,32 +117,44 @@ public final class ArabicAnalyzer extends Analyzer {
|
||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
|
||||
*/
|
||||
public ArabicAnalyzer(Version matchVersion) {
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public ArabicAnalyzer( Version matchVersion, String... stopwords ) {
|
||||
stoptable = StopFilter.makeStopSet( stopwords );
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, StopFilter.makeStopSet( stopwords ));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public ArabicAnalyzer( Version matchVersion, Hashtable<?,?> stopwords ) {
|
||||
stoptable = new HashSet(stopwords.keySet());
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
|
||||
* @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public ArabicAnalyzer( Version matchVersion, File stopwords ) throws IOException {
|
||||
stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT);
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT));
|
||||
}
|
||||
|
||||
|
||||
|
@ -20,12 +20,14 @@ package org.apache.lucene.analysis.br;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
@ -51,7 +53,9 @@ public final class BrazilianAnalyzer extends Analyzer {
|
||||
|
||||
/**
|
||||
* List of typical Brazilian Portuguese stopwords.
|
||||
* @deprecated use {@link #getDefaultStopSet()} instead
|
||||
*/
|
||||
// TODO make this private in 3.1
|
||||
public final static String[] BRAZILIAN_STOP_WORDS = {
|
||||
"a","ainda","alem","ambas","ambos","antes",
|
||||
"ao","aonde","aos","apos","aquele","aqueles",
|
||||
@ -73,52 +77,98 @@ public final class BrazilianAnalyzer extends Analyzer {
|
||||
"suas","tal","tambem","teu","teus","toda","todas","todo",
|
||||
"todos","tua","tuas","tudo","um","uma","umas","uns"};
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET = CharArraySet
|
||||
.unmodifiableSet(new CharArraySet(Arrays.asList(BRAZILIAN_STOP_WORDS),
|
||||
false));
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the {@link StopFilter}.
|
||||
*/
|
||||
private Set stoptable = Collections.emptySet();
|
||||
private final Set<?> stoptable;
|
||||
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private Set excltable = Collections.emptySet();
|
||||
private final Version matchVersion;
|
||||
// TODO make this private in 3.1
|
||||
private Set<?> excltable = Collections.emptySet();
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
|
||||
*/
|
||||
public BrazilianAnalyzer(Version matchVersion) {
|
||||
stoptable = StopFilter.makeStopSet( BRAZILIAN_STOP_WORDS );
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words and stemming exclusion words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopset
|
||||
* a stopword set
|
||||
* @param stemExclusionSet
|
||||
* a stemming exclusion set
|
||||
*/
|
||||
public BrazilianAnalyzer(Version matchVersion, Set<?> stopset,
|
||||
Set<?> stemExclusionSet) {
|
||||
this(matchVersion, stopset);
|
||||
excltable = CharArraySet.unmodifiableSet(CharArraySet
|
||||
.copy(stemExclusionSet));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public BrazilianAnalyzer( Version matchVersion, String... stopwords ) {
|
||||
stoptable = StopFilter.makeStopSet( stopwords );
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
public BrazilianAnalyzer(Version matchVersion, String... stopwords) {
|
||||
this(matchVersion, StopFilter.makeStopSet(stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public BrazilianAnalyzer( Version matchVersion, Map stopwords ) {
|
||||
stoptable = new HashSet(stopwords.keySet());
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public BrazilianAnalyzer(Version matchVersion, Map<?,?> stopwords) {
|
||||
this(matchVersion, stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public BrazilianAnalyzer( Version matchVersion, File stopwords ) throws IOException {
|
||||
stoptable = WordlistLoader.getWordSet( stopwords );
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public BrazilianAnalyzer(Version matchVersion, File stopwords)
|
||||
throws IOException {
|
||||
this(matchVersion, WordlistLoader.getWordSet(stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from an array of Strings.
|
||||
* @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
|
||||
*/
|
||||
public void setStemExclusionTable( String... exclusionlist ) {
|
||||
excltable = StopFilter.makeStopSet( exclusionlist );
|
||||
@ -126,13 +176,15 @@ public final class BrazilianAnalyzer extends Analyzer {
|
||||
}
|
||||
/**
|
||||
* Builds an exclusionlist from a {@link Map}.
|
||||
* @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
|
||||
*/
|
||||
public void setStemExclusionTable( Map exclusionlist ) {
|
||||
excltable = new HashSet(exclusionlist.keySet());
|
||||
public void setStemExclusionTable( Map<?,?> exclusionlist ) {
|
||||
excltable = new HashSet<Object>(exclusionlist.keySet());
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
/**
|
||||
* Builds an exclusionlist from the words contained in the given file.
|
||||
* @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
|
||||
*/
|
||||
public void setStemExclusionTable( File exclusionlist ) throws IOException {
|
||||
excltable = WordlistLoader.getWordSet( exclusionlist );
|
||||
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cjk;
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
@ -25,6 +26,7 @@ import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
@ -39,7 +41,10 @@ public class CJKAnalyzer extends Analyzer {
|
||||
/**
|
||||
* An array containing some common English words that are not usually
|
||||
* useful for searching and some double-byte interpunctions.
|
||||
* @deprecated use {@link #getDefaultStopSet()} instead
|
||||
*/
|
||||
// TODO make this final in 3.1 -
|
||||
// this might be revised and merged with StopFilter stop words too
|
||||
public final static String[] STOP_WORDS = {
|
||||
"a", "and", "are", "as", "at", "be",
|
||||
"but", "by", "for", "if", "in",
|
||||
@ -53,10 +58,23 @@ public class CJKAnalyzer extends Analyzer {
|
||||
|
||||
//~ Instance fields --------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET = CharArraySet
|
||||
.unmodifiableSet(new CharArraySet(Arrays.asList(STOP_WORDS),
|
||||
false));
|
||||
}
|
||||
/**
|
||||
* stop word list
|
||||
*/
|
||||
private final Set stopTable;
|
||||
private final Set<?> stopTable;
|
||||
private final Version matchVersion;
|
||||
|
||||
//~ Constructors -----------------------------------------------------------
|
||||
@ -65,7 +83,19 @@ public class CJKAnalyzer extends Analyzer {
|
||||
* Builds an analyzer which removes words in {@link #STOP_WORDS}.
|
||||
*/
|
||||
public CJKAnalyzer(Version matchVersion) {
|
||||
stopTable = StopFilter.makeStopSet(STOP_WORDS);
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public CJKAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
stopTable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
@ -73,6 +103,7 @@ public class CJKAnalyzer extends Analyzer {
|
||||
* Builds an analyzer which removes words in the provided array.
|
||||
*
|
||||
* @param stopWords stop word array
|
||||
* @deprecated use {@link #CJKAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public CJKAnalyzer(Version matchVersion, String... stopWords) {
|
||||
stopTable = StopFilter.makeStopSet(stopWords);
|
||||
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cz;
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
@ -29,6 +30,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.Collections;
|
||||
@ -48,7 +50,9 @@ public final class CzechAnalyzer extends Analyzer {
|
||||
|
||||
/**
|
||||
* List of typical stopwords.
|
||||
* @deprecated use {@link #getDefaultStopSet()} instead
|
||||
*/
|
||||
// TODO make this private in 3.1
|
||||
public final static String[] CZECH_STOP_WORDS = {
|
||||
"a","s","k","o","i","u","v","z","dnes","cz","t\u00edmto","bude\u0161","budem",
|
||||
"byli","jse\u0161","m\u016fj","sv\u00fdm","ta","tomto","tohle","tuto","tyto",
|
||||
@ -69,51 +73,84 @@ public final class CzechAnalyzer extends Analyzer {
|
||||
"j\u00ed","ji","m\u011b","mne","jemu","tomu","t\u011bm","t\u011bmu","n\u011bmu","n\u011bmu\u017e",
|
||||
"jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e",
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a set of default Czech-stopwords
|
||||
* @return a set of default Czech-stopwords
|
||||
*/
|
||||
public static final Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
|
||||
Arrays.asList(CZECH_STOP_WORDS), false));
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the {@link StopFilter}.
|
||||
*/
|
||||
private Set stoptable;
|
||||
private final Version matchVersion;
|
||||
// TODO make this final in 3.1
|
||||
private Set<?> stoptable;
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}).
|
||||
*/
|
||||
public CzechAnalyzer(Version matchVersion) {
|
||||
stoptable = StopFilter.makeStopSet( CZECH_STOP_WORDS );
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
this.matchVersion = matchVersion;
|
||||
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public CzechAnalyzer(Version matchVersion, String... stopwords) {
|
||||
this(matchVersion, StopFilter.makeStopSet( stopwords ));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public CzechAnalyzer(Version matchVersion, HashSet<?> stopwords) {
|
||||
this(matchVersion, (Set<?>)stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public CzechAnalyzer(Version matchVersion, String... stopwords) {
|
||||
stoptable = StopFilter.makeStopSet( stopwords );
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
public CzechAnalyzer(Version matchVersion, HashSet stopwords) {
|
||||
stoptable = stopwords;
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*/
|
||||
public CzechAnalyzer(Version matchVersion, File stopwords ) throws IOException {
|
||||
stoptable = WordlistLoader.getWordSet( stopwords );
|
||||
this.matchVersion = matchVersion;
|
||||
public CzechAnalyzer(Version matchVersion, File stopwords ) throws IOException {
|
||||
this(matchVersion, (Set<?>)WordlistLoader.getWordSet( stopwords ));
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads stopwords hash from resource stream (file, database...).
|
||||
* @param wordfile File containing the wordlist
|
||||
* @param encoding Encoding used (win-1250, iso-8859-2, ...), null for default system encoding
|
||||
* @deprecated use {@link WordlistLoader#getWordSet(Reader, String) }
|
||||
* and {@link #CzechAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public void loadStopWords( InputStream wordfile, String encoding ) {
|
||||
setPreviousTokenStream(null); // force a new stopfilter to be created
|
||||
if ( wordfile == null ) {
|
||||
stoptable = new HashSet();
|
||||
stoptable = Collections.emptySet();
|
||||
return;
|
||||
}
|
||||
try {
|
||||
|
@ -21,11 +21,13 @@ package org.apache.lucene.analysis.de;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
@ -53,7 +55,9 @@ public class GermanAnalyzer extends Analyzer {
|
||||
|
||||
/**
|
||||
* List of typical german stopwords.
|
||||
* @deprecated use {@link #getDefaultStopSet()} instead
|
||||
*/
|
||||
//TODO make this private in 3.1
|
||||
public final static String[] GERMAN_STOP_WORDS = {
|
||||
"einer", "eine", "eines", "einem", "einen",
|
||||
"der", "die", "das", "dass", "daß",
|
||||
@ -68,58 +72,99 @@ public class GermanAnalyzer extends Analyzer {
|
||||
"mein", "sein", "kein",
|
||||
"durch", "wegen", "wird"
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a set of default German-stopwords
|
||||
* @return a set of default German-stopwords
|
||||
*/
|
||||
public static final Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
|
||||
Arrays.asList(GERMAN_STOP_WORDS), false));
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the {@link StopFilter}.
|
||||
*/
|
||||
private Set stopSet = new HashSet();
|
||||
//TODO make this final in 3.1
|
||||
private Set<?> stopSet;
|
||||
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private Set exclusionSet = new HashSet();
|
||||
// TODO make this final in 3.1
|
||||
private Set<?> exclusionSet;
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words:
|
||||
* {@link #GERMAN_STOP_WORDS}.
|
||||
* {@link #getDefaultStopSet()}.
|
||||
*/
|
||||
public GermanAnalyzer(Version matchVersion) {
|
||||
stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public GermanAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words and stemming exclusion words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopwords
|
||||
* a stopword set
|
||||
* @param stemExclusionSet
|
||||
* a stemming exclusion set
|
||||
*/
|
||||
public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
|
||||
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
|
||||
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
|
||||
setOverridesTokenStreamMethod(GermanAnalyzer.class);
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #GermanAnalyzer(Version, Set)}
|
||||
*/
|
||||
public GermanAnalyzer(Version matchVersion, String... stopwords) {
|
||||
stopSet = StopFilter.makeStopSet(stopwords);
|
||||
setOverridesTokenStreamMethod(GermanAnalyzer.class);
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, StopFilter.makeStopSet(stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #GermanAnalyzer(Version, Set)}
|
||||
*/
|
||||
public GermanAnalyzer(Version matchVersion, Map stopwords) {
|
||||
stopSet = new HashSet(stopwords.keySet());
|
||||
setOverridesTokenStreamMethod(GermanAnalyzer.class);
|
||||
this.matchVersion = matchVersion;
|
||||
public GermanAnalyzer(Version matchVersion, Map<?,?> stopwords) {
|
||||
this(matchVersion, stopwords.keySet());
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #GermanAnalyzer(Version, Set)}
|
||||
*/
|
||||
public GermanAnalyzer(Version matchVersion, File stopwords) throws IOException {
|
||||
stopSet = WordlistLoader.getWordSet(stopwords);
|
||||
setOverridesTokenStreamMethod(GermanAnalyzer.class);
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, WordlistLoader.getWordSet(stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from an array of Strings.
|
||||
* @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
|
||||
*/
|
||||
public void setStemExclusionTable(String[] exclusionlist) {
|
||||
exclusionSet = StopFilter.makeStopSet(exclusionlist);
|
||||
@ -128,6 +173,7 @@ public class GermanAnalyzer extends Analyzer {
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from a {@link Map}
|
||||
* @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
|
||||
*/
|
||||
public void setStemExclusionTable(Map exclusionlist) {
|
||||
exclusionSet = new HashSet(exclusionlist.keySet());
|
||||
@ -136,6 +182,7 @@ public class GermanAnalyzer extends Analyzer {
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from the words contained in the given file.
|
||||
* @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
|
||||
*/
|
||||
public void setStemExclusionTable(File exclusionlist) throws IOException {
|
||||
exclusionSet = WordlistLoader.getWordSet(exclusionlist);
|
||||
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.el;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
@ -27,7 +28,7 @@ import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@ -58,39 +59,61 @@ public final class GreekAnalyzer extends Analyzer
|
||||
"εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ", "οπωσ", "ομωσ",
|
||||
"ισωσ", "οσο", "οτι"
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a set of default Greek-stopwords
|
||||
* @return a set of default Greek-stopwords
|
||||
*/
|
||||
public static final Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
|
||||
Arrays.asList(GREEK_STOP_WORDS), false));
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the {@link StopFilter}.
|
||||
*/
|
||||
private Set stopSet = new HashSet();
|
||||
private final Set<?> stopSet;
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
public GreekAnalyzer(Version matchVersion) {
|
||||
super();
|
||||
stopSet = StopFilter.makeStopSet(GREEK_STOP_WORDS);
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
|
||||
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @param stopwords Array of stopwords to use.
|
||||
* @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public GreekAnalyzer(Version matchVersion, String... stopwords)
|
||||
{
|
||||
super();
|
||||
stopSet = StopFilter.makeStopSet(stopwords);
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, StopFilter.makeStopSet(stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #GreekAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public GreekAnalyzer(Version matchVersion, Map stopwords)
|
||||
public GreekAnalyzer(Version matchVersion, Map<?,?> stopwords)
|
||||
{
|
||||
super();
|
||||
stopSet = new HashSet(stopwords.keySet());
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -23,11 +23,11 @@ import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Hashtable;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
@ -60,7 +60,7 @@ public final class PersianAnalyzer extends Analyzer {
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private final Set stoptable;
|
||||
private final Set<?> stoptable;
|
||||
|
||||
/**
|
||||
* The comment character in the stopwords file. All lines prefixed with this
|
||||
@ -72,7 +72,7 @@ public final class PersianAnalyzer extends Analyzer {
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<String> getDefaultStopSet(){
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
@ -81,7 +81,7 @@ public final class PersianAnalyzer extends Analyzer {
|
||||
* accesses the static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<String> DEFAULT_STOP_SET;
|
||||
static final Set<?> DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
@ -114,33 +114,45 @@ public final class PersianAnalyzer extends Analyzer {
|
||||
* {@link #DEFAULT_STOPWORD_FILE}.
|
||||
*/
|
||||
public PersianAnalyzer(Version matchVersion) {
|
||||
stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public PersianAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public PersianAnalyzer(Version matchVersion, String... stopwords) {
|
||||
stoptable = StopFilter.makeStopSet(stopwords);
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, StopFilter.makeStopSet(stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public PersianAnalyzer(Version matchVersion, Hashtable stopwords) {
|
||||
stoptable = new HashSet(stopwords.keySet());
|
||||
this.matchVersion = matchVersion;
|
||||
public PersianAnalyzer(Version matchVersion, Hashtable<?, ?> stopwords) {
|
||||
this(matchVersion, stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words. Lines can be commented out
|
||||
* using {@link #STOPWORDS_COMMENT}
|
||||
* @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public PersianAnalyzer(Version matchVersion, File stopwords) throws IOException {
|
||||
stoptable = WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT);
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.fr;
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
@ -31,6 +32,7 @@ import org.apache.lucene.util.Version;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
@ -60,7 +62,9 @@ public final class FrenchAnalyzer extends Analyzer {
|
||||
|
||||
/**
|
||||
* Extended list of typical French stopwords.
|
||||
* @deprecated use {@link #getDefaultStopSet()} instead
|
||||
*/
|
||||
// TODO make this final in 3.1
|
||||
public final static String[] FRENCH_STOP_WORDS = {
|
||||
"a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
|
||||
"autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
|
||||
@ -89,41 +93,87 @@ public final class FrenchAnalyzer extends Analyzer {
|
||||
/**
|
||||
* Contains the stopwords used with the {@link StopFilter}.
|
||||
*/
|
||||
private Set stoptable = new HashSet();
|
||||
private final Set<?> stoptable;
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private Set excltable = new HashSet();
|
||||
//TODO make this final in 3.0
|
||||
private Set<?> excltable = new HashSet();
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET = CharArraySet
|
||||
.unmodifiableSet(new CharArraySet(Arrays.asList(FRENCH_STOP_WORDS),
|
||||
false));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}).
|
||||
*/
|
||||
public FrenchAnalyzer(Version matchVersion) {
|
||||
stoptable = StopFilter.makeStopSet(FRENCH_STOP_WORDS);
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopwords
|
||||
* a stopword set
|
||||
* @param stemExclutionSet
|
||||
* a stemming exclusion set
|
||||
*/
|
||||
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords,
|
||||
Set<?> stemExclutionSet) {
|
||||
this.matchVersion = matchVersion;
|
||||
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
|
||||
this.excltable = CharArraySet.unmodifiableSet(CharArraySet
|
||||
.copy(stemExclutionSet));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public FrenchAnalyzer(Version matchVersion, String... stopwords) {
|
||||
stoptable = StopFilter.makeStopSet(stopwords);
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, StopFilter.makeStopSet(stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @throws IOException
|
||||
* @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public FrenchAnalyzer(Version matchVersion, File stopwords) throws IOException {
|
||||
stoptable = new HashSet(WordlistLoader.getWordSet(stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, WordlistLoader.getWordSet(stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from an array of Strings.
|
||||
* @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
|
||||
*/
|
||||
public void setStemExclusionTable(String... exclusionlist) {
|
||||
excltable = StopFilter.makeStopSet(exclusionlist);
|
||||
@ -132,6 +182,7 @@ public final class FrenchAnalyzer extends Analyzer {
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from a Map.
|
||||
* @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
|
||||
*/
|
||||
public void setStemExclusionTable(Map exclusionlist) {
|
||||
excltable = new HashSet(exclusionlist.keySet());
|
||||
@ -141,6 +192,7 @@ public final class FrenchAnalyzer extends Analyzer {
|
||||
/**
|
||||
* Builds an exclusionlist from the words contained in the given file.
|
||||
* @throws IOException
|
||||
* @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
|
||||
*/
|
||||
public void setStemExclusionTable(File exclusionlist) throws IOException {
|
||||
excltable = new HashSet(WordlistLoader.getWordSet(exclusionlist));
|
||||
|
@ -18,9 +18,11 @@ package org.apache.lucene.analysis.nl;
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WordlistLoader;
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
|
||||
@ -29,6 +31,8 @@ import org.apache.lucene.util.Version;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
@ -51,6 +55,7 @@ import java.util.Map;
|
||||
public class DutchAnalyzer extends Analyzer {
|
||||
/**
|
||||
* List of typical Dutch stopwords.
|
||||
* @deprecated use {@link #getDefaultStopSet()} instead
|
||||
*/
|
||||
public final static String[] DUTCH_STOP_WORDS =
|
||||
{
|
||||
@ -65,19 +70,32 @@ public class DutchAnalyzer extends Analyzer {
|
||||
"wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
|
||||
"uw", "iemand", "geweest", "andere"
|
||||
};
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static Set<?> getDefaultStopSet(){
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET = CharArraySet
|
||||
.unmodifiableSet(new CharArraySet(Arrays.asList(DUTCH_STOP_WORDS),
|
||||
false));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private Set stoptable = new HashSet();
|
||||
private final Set<?> stoptable;
|
||||
|
||||
/**
|
||||
* Contains words that should be indexed but not stemmed.
|
||||
*/
|
||||
private Set excltable = new HashSet();
|
||||
private Set<?> excltable = Collections.emptySet();
|
||||
|
||||
private Map stemdict = new HashMap();
|
||||
private Map<String, String> stemdict = new HashMap<String, String>();
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
@ -86,13 +104,22 @@ public class DutchAnalyzer extends Analyzer {
|
||||
*
|
||||
*/
|
||||
public DutchAnalyzer(Version matchVersion) {
|
||||
setOverridesTokenStreamMethod(DutchAnalyzer.class);
|
||||
stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
stemdict.put("fiets", "fiets"); //otherwise fiet
|
||||
stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
|
||||
stemdict.put("ei", "eier");
|
||||
stemdict.put("kind", "kinder");
|
||||
}
|
||||
|
||||
public DutchAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
public DutchAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable){
|
||||
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
|
||||
excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionTable));
|
||||
this.matchVersion = matchVersion;
|
||||
setOverridesTokenStreamMethod(DutchAnalyzer.class);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -100,30 +127,30 @@ public class DutchAnalyzer extends Analyzer {
|
||||
*
|
||||
* @param matchVersion
|
||||
* @param stopwords
|
||||
* @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public DutchAnalyzer(Version matchVersion, String... stopwords) {
|
||||
setOverridesTokenStreamMethod(DutchAnalyzer.class);
|
||||
stoptable = StopFilter.makeStopSet(stopwords);
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, StopFilter.makeStopSet(stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @param stopwords
|
||||
* @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public DutchAnalyzer(Version matchVersion, HashSet stopwords) {
|
||||
setOverridesTokenStreamMethod(DutchAnalyzer.class);
|
||||
stoptable = stopwords;
|
||||
this.matchVersion = matchVersion;
|
||||
public DutchAnalyzer(Version matchVersion, HashSet<?> stopwords) {
|
||||
this(matchVersion, (Set<?>)stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
*
|
||||
* @param stopwords
|
||||
* @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public DutchAnalyzer(Version matchVersion, File stopwords) {
|
||||
// this is completely broken!
|
||||
setOverridesTokenStreamMethod(DutchAnalyzer.class);
|
||||
try {
|
||||
stoptable = org.apache.lucene.analysis.WordlistLoader.getWordSet(stopwords);
|
||||
@ -138,6 +165,7 @@ public class DutchAnalyzer extends Analyzer {
|
||||
* Builds an exclusionlist from an array of Strings.
|
||||
*
|
||||
* @param exclusionlist
|
||||
* @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
|
||||
*/
|
||||
public void setStemExclusionTable(String... exclusionlist) {
|
||||
excltable = StopFilter.makeStopSet(exclusionlist);
|
||||
@ -146,14 +174,16 @@ public class DutchAnalyzer extends Analyzer {
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from a HashSet.
|
||||
* @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
|
||||
*/
|
||||
public void setStemExclusionTable(HashSet exclusionlist) {
|
||||
public void setStemExclusionTable(HashSet<?> exclusionlist) {
|
||||
excltable = exclusionlist;
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an exclusionlist from the words contained in the given file.
|
||||
* @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
|
||||
*/
|
||||
public void setStemExclusionTable(File exclusionlist) {
|
||||
try {
|
||||
@ -172,7 +202,7 @@ public class DutchAnalyzer extends Analyzer {
|
||||
*/
|
||||
public void setStemDictionary(File stemdictFile) {
|
||||
try {
|
||||
stemdict = org.apache.lucene.analysis.WordlistLoader.getStemDict(stemdictFile);
|
||||
stemdict = WordlistLoader.getStemDict(stemdictFile);
|
||||
setPreviousTokenStream(null); // force a new stemmer to be created
|
||||
} catch (IOException e) {
|
||||
// TODO: throw IOException
|
||||
|
@ -19,11 +19,12 @@ package org.apache.lucene.analysis.ru;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
@ -55,37 +56,53 @@ public final class RussianAnalyzer extends Analyzer
|
||||
"тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей",
|
||||
"чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
|
||||
};
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
static final Set<?> DEFAULT_STOP_SET = CharArraySet
|
||||
.unmodifiableSet(new CharArraySet(Arrays.asList(RUSSIAN_STOP_WORDS),
|
||||
false));
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains the stopwords used with the StopFilter.
|
||||
*/
|
||||
private Set stopSet = new HashSet();
|
||||
private final Set<?> stopSet;
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
public RussianAnalyzer(Version matchVersion) {
|
||||
this(matchVersion, RUSSIAN_STOP_WORDS);
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated use {@link #RussianAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public RussianAnalyzer(Version matchVersion, String... stopwords)
|
||||
{
|
||||
super();
|
||||
stopSet = StopFilter.makeStopSet(stopwords);
|
||||
public RussianAnalyzer(Version matchVersion, String... stopwords) {
|
||||
this(matchVersion, StopFilter.makeStopSet(stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param matchVersion
|
||||
* lucene compatibility version
|
||||
* @param stopwords
|
||||
* a stopword set
|
||||
*/
|
||||
public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
|
||||
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* TODO: create a Set version of this ctor
|
||||
* @deprecated use {@link #RussianAnalyzer(Version, Set)} instead
|
||||
*/
|
||||
public RussianAnalyzer(Version matchVersion, Map stopwords)
|
||||
public RussianAnalyzer(Version matchVersion, Map<?,?> stopwords)
|
||||
{
|
||||
super();
|
||||
stopSet = new HashSet(stopwords.keySet());
|
||||
this.matchVersion = matchVersion;
|
||||
this(matchVersion, stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -4,6 +4,7 @@ import java.util.AbstractSet;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
@ -47,6 +48,7 @@ public class CharArraySet extends AbstractSet<Object> {
|
||||
private char[][] entries;
|
||||
private int count;
|
||||
private final boolean ignoreCase;
|
||||
public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet(new CharArraySet(0, false));
|
||||
|
||||
/** Create set with enough capacity to hold startSize
|
||||
* terms */
|
||||
@ -263,6 +265,11 @@ public class CharArraySet extends AbstractSet<Object> {
|
||||
public static CharArraySet unmodifiableSet(CharArraySet set) {
|
||||
if (set == null)
|
||||
throw new NullPointerException("Given set is null");
|
||||
if (set == EMPTY_SET)
|
||||
return EMPTY_SET;
|
||||
if (set instanceof UnmodifiableCharArraySet)
|
||||
return set;
|
||||
|
||||
/*
|
||||
* Instead of delegating calls to the given set copy the low-level values to
|
||||
* the unmodifiable Subclass
|
||||
@ -270,6 +277,27 @@ public class CharArraySet extends AbstractSet<Object> {
|
||||
return new UnmodifiableCharArraySet(set.entries, set.ignoreCase, set.count);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a copy of the given set as a {@link CharArraySet}. If the given set
|
||||
* is a {@link CharArraySet} the ignoreCase property will be preserved.
|
||||
*
|
||||
* @param set
|
||||
* a set to copy
|
||||
* @return a copy of the given set as a {@link CharArraySet}. If the given set
|
||||
* is a {@link CharArraySet} the ignoreCase property will be
|
||||
* preserved.
|
||||
*/
|
||||
public static CharArraySet copy(Set<?> set) {
|
||||
if (set == null)
|
||||
throw new NullPointerException("Given set is null");
|
||||
if(set == EMPTY_SET)
|
||||
return EMPTY_SET;
|
||||
final boolean ignoreCase = set instanceof CharArraySet ? ((CharArraySet) set).ignoreCase
|
||||
: false;
|
||||
return new CharArraySet(set, ignoreCase);
|
||||
}
|
||||
|
||||
|
||||
/** The Iterator<String> for this set. Strings are constructed on the fly, so
|
||||
* use <code>nextCharArray</code> for more efficient access. */
|
||||
public class CharArraySetIterator implements Iterator<String> {
|
||||
|
Loading…
x
Reference in New Issue
Block a user