LUCENE-2094: Prepare CharArraySet for Unicode 4.0

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@885592 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2009-11-30 21:49:21 +00:00
parent c155a0c477
commit 9edfb3b66a
33 changed files with 953 additions and 275 deletions

View File

@ -25,6 +25,13 @@ Bug fixes
New features
* LUCENE-2069: Added Unicode 4 support to CharArraySet. Due to the switch
to Java 5, supplementary characters are now lowercased correctly if the
set is created as case insensitive.
CharArraySet now requires a Version argument to preserve
backwards compatibility. If Version < 3.1 is passed to the constructor,
CharArraySet yields the old behavior. (Simon Willnauer)
* LUCENE-2069: Added Unicode 4 support to LowerCaseFilter. Due to the switch
to Java 5, supplementary characters are now lowercased correctly.
LowerCaseFilter now requires a Version argument to preserve
backwards compatibility. If Version < 3.1 is passed to the constructor,
LowerCaseFilter yields the old behavior. (Simon Willnauer)
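
A minimal sketch of the Version-dependent behavior described in the two entries above (illustration only, not code from this commit; the class name is made up and the Deseret characters are an arbitrary supplementary-character example):

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;

class CharArraySetUnicodeDemo {
  static boolean supplementaryMatch() {
    // ignoreCase=true makes the set lowercase entries and lookups.
    CharArraySet set = new CharArraySet(Version.LUCENE_31, 8, true);
    set.add("\uD801\uDC00"); // U+10400 DESERET CAPITAL LETTER LONG I, a surrogate pair
    // With Version >= 3.1 the pair is lowercased as one code point, so the
    // lowercase form U+10428 matches; with Version.LUCENE_30 it would not.
    return set.contains("\uD801\uDC28");
  }
}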

View File

@ -129,7 +129,7 @@ public final class ArabicAnalyzer extends Analyzer {
* a stopword set
*/
public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.matchVersion = matchVersion;
}
@ -138,7 +138,7 @@ public final class ArabicAnalyzer extends Analyzer {
* @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
*/
public ArabicAnalyzer( Version matchVersion, String... stopwords ) {
this(matchVersion, StopFilter.makeStopSet( stopwords ));
this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords ));
}
/**
@ -170,8 +170,7 @@ public final class ArabicAnalyzer extends Analyzer {
TokenStream result = new ArabicLetterTokenizer( reader );
result = new LowerCaseFilter(matchVersion, result);
// the order here is important: the stopword list is not normalized!
result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stoptable );
result = new StopFilter( matchVersion, result, stoptable );
result = new ArabicNormalizationFilter( result );
result = new ArabicStemFilter( result );
@ -200,8 +199,7 @@ public final class ArabicAnalyzer extends Analyzer {
streams.source = new ArabicLetterTokenizer(reader);
streams.result = new LowerCaseFilter(matchVersion, streams.source);
// the order here is important: the stopword list is not normalized!
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stoptable);
streams.result = new StopFilter( matchVersion, streams.result, stoptable);
streams.result = new ArabicNormalizationFilter(streams.result);
streams.result = new ArabicStemFilter(streams.result);
setPreviousTokenStream(streams);
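
The same mechanical migration recurs in every analyzer this commit touches; a before/after sketch for caller code (the class name, tokenizer, and stop word are illustrative, not from this commit):

import java.io.StringReader;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

class StopFilterMigrationDemo {
  static TokenStream stopFiltered(Version matchVersion) {
    TokenStream input = new WhitespaceTokenizer(new StringReader("the quick fox"));
    // Before: new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
    //                        input, StopFilter.makeStopSet("the"));
    // After: matchVersion both selects the position-increment default and
    // builds the stop set with correct Unicode 4.0 case folding.
    return new StopFilter(matchVersion, input, StopFilter.makeStopSet(matchVersion, "the"));
  }
}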

View File

@ -87,8 +87,8 @@ public final class BrazilianAnalyzer extends Analyzer {
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET = CharArraySet
.unmodifiableSet(new CharArraySet(Arrays.asList(BRAZILIAN_STOP_WORDS),
false));
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(BRAZILIAN_STOP_WORDS), false));
}
/**
@ -120,7 +120,7 @@ public final class BrazilianAnalyzer extends Analyzer {
* a stopword set
*/
public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.matchVersion = matchVersion;
}
@ -136,7 +136,7 @@ public final class BrazilianAnalyzer extends Analyzer {
Set<?> stemExclusionSet) {
this(matchVersion, stopwords);
excltable = CharArraySet.unmodifiableSet(CharArraySet
.copy(stemExclusionSet));
.copy(matchVersion, stemExclusionSet));
}
/**
@ -144,7 +144,7 @@ public final class BrazilianAnalyzer extends Analyzer {
* @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
*/
public BrazilianAnalyzer(Version matchVersion, String... stopwords) {
this(matchVersion, StopFilter.makeStopSet(stopwords));
this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
}
/**
@ -169,7 +169,7 @@ public final class BrazilianAnalyzer extends Analyzer {
* @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable( String... exclusionlist ) {
excltable = StopFilter.makeStopSet( exclusionlist );
excltable = StopFilter.makeStopSet( matchVersion, exclusionlist );
setPreviousTokenStream(null); // force a new stemmer to be created
}
/**
@ -201,8 +201,7 @@ public final class BrazilianAnalyzer extends Analyzer {
TokenStream result = new StandardTokenizer( matchVersion, reader );
result = new LowerCaseFilter( matchVersion, result );
result = new StandardFilter( result );
result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stoptable );
result = new StopFilter( matchVersion, result, stoptable );
result = new BrazilianStemFilter( result, excltable );
return result;
}
@ -229,8 +228,7 @@ public final class BrazilianAnalyzer extends Analyzer {
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new LowerCaseFilter(matchVersion, streams.source);
streams.result = new StandardFilter(streams.result);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stoptable);
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
streams.result = new BrazilianStemFilter(streams.result, excltable);
setPreviousTokenStream(streams);
} else {

View File

@ -68,7 +68,7 @@ public class CJKAnalyzer extends Analyzer {
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET = CharArraySet
.unmodifiableSet(new CharArraySet(Arrays.asList(STOP_WORDS),
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS),
false));
}
/**
@ -95,7 +95,7 @@ public class CJKAnalyzer extends Analyzer {
* a stopword set
*/
public CJKAnalyzer(Version matchVersion, Set<?> stopwords){
stopTable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
stopTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.matchVersion = matchVersion;
}
@ -106,7 +106,7 @@ public class CJKAnalyzer extends Analyzer {
* @deprecated use {@link #CJKAnalyzer(Version, Set)} instead
*/
public CJKAnalyzer(Version matchVersion, String... stopWords) {
stopTable = StopFilter.makeStopSet(stopWords);
stopTable = StopFilter.makeStopSet(matchVersion, stopWords);
this.matchVersion = matchVersion;
}
@ -122,8 +122,7 @@ public class CJKAnalyzer extends Analyzer {
*/
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
new CJKTokenizer(reader), stopTable);
return new StopFilter(matchVersion, new CJKTokenizer(reader), stopTable);
}
private class SavedStreams {
@ -147,8 +146,7 @@ public class CJKAnalyzer extends Analyzer {
if (streams == null) {
streams = new SavedStreams();
streams.source = new CJKTokenizer(reader);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.source, stopTable);
streams.result = new StopFilter(matchVersion, streams.source, stopTable);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
/**
* A {@link TokenFilter} with a stop word table.
@ -63,7 +64,7 @@ public final class ChineseFilter extends TokenFilter {
public ChineseFilter(TokenStream in) {
super(in);
stopTable = new CharArraySet(Arrays.asList(STOP_WORDS), false);
stopTable = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS), false);
termAtt = addAttribute(TermAttribute.class);
}

View File

@ -20,7 +20,6 @@ package org.apache.lucene.analysis.compound;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
@ -34,9 +33,18 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;
/**
* Base class for decomposition token filters.
* Base class for decomposition token filters. <a name="version"/>
* <p>
* You must specify the required {@link Version} compatibility when creating
* CompoundWordTokenFilterBase:
* <ul>
* <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
* supplementary characters in strings and char arrays provided as compound word
* dictionaries.
* </ul>
*/
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
/**
@ -55,7 +63,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
protected final CharArraySet dictionary;
protected final LinkedList tokens;
protected final LinkedList<Token> tokens;
protected final int minWordSize;
protected final int minSubwordSize;
protected final int maxSubwordSize;
@ -69,31 +77,72 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
private PayloadAttribute payloadAtt;
private final Token wrapper = new Token();
/**
* @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], int, int, int, boolean)} instead
*/
protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
this(input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
this(Version.LUCENE_30, input, makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
}
/**
* @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], boolean)} instead
*/
protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
this(input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
this(Version.LUCENE_30, input, makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
}
protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, boolean onlyLongestMatch) {
this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
/**
* @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, boolean)} instead
*/
protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
this(Version.LUCENE_30, input, dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
}
/**
* @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[])} instead
*/
protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
this(input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
this(Version.LUCENE_30, input, makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
}
/**
* @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set)} instead
*/
protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary) {
this(Version.LUCENE_30, input, dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
}
protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary) {
this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
/**
* @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, int, int, int, boolean)} instead
*/
protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
this(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
this(matchVersion, input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
}
protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary, boolean onlyLongestMatch) {
this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary) {
this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(input);
this.tokens=new LinkedList();
this.tokens=new LinkedList<Token>();
this.minWordSize=minWordSize;
this.minSubwordSize=minSubwordSize;
this.maxSubwordSize=maxSubwordSize;
@ -102,7 +151,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
if (dictionary instanceof CharArraySet) {
this.dictionary = (CharArraySet) dictionary;
} else {
this.dictionary = new CharArraySet(dictionary.size(), false);
this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
addAllLowerCase(this.dictionary, dictionary);
}
@ -121,9 +170,13 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
* @param dictionary
* @return {@link Set} of lowercased terms
*/
public static final Set makeDictionary(final String[] dictionary) {
public static final Set<?> makeDictionary(final String[] dictionary) {
return makeDictionary(Version.LUCENE_30, dictionary);
}
public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
// is the below really case insensitive?
CharArraySet dict = new CharArraySet(dictionary.length, false);
CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
addAllLowerCase(dict, Arrays.asList(dictionary));
return dict;
}
@ -140,11 +193,11 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
@Override
public final boolean incrementToken() throws IOException {
if (tokens.size() > 0) {
setToken((Token)tokens.removeFirst());
setToken(tokens.removeFirst());
return true;
}
if (input.incrementToken() == false)
if (!input.incrementToken())
return false;
wrapper.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
@ -158,18 +211,16 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
decompose(wrapper);
if (tokens.size() > 0) {
setToken((Token)tokens.removeFirst());
setToken(tokens.removeFirst());
return true;
} else {
return false;
}
}
protected static final void addAllLowerCase(Set target, Collection col) {
Iterator iter=col.iterator();
while (iter.hasNext()) {
target.add(((String)iter.next()).toLowerCase());
protected static final void addAllLowerCase(Set<Object> target, Collection<String> col) {
for (String string : col) {
target.add(string.toLowerCase());
}
}
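
A small sketch of the new Version-aware dictionary construction (illustrative class name and word list, not from this commit):

import java.util.Set;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.apache.lucene.util.Version;

class CompoundDictionaryDemo {
  static Set<?> dictionary() {
    // Internally this builds a CharArraySet(matchVersion, ...), so with
    // Version >= 3.1 supplementary characters in entries are lowercased correctly.
    return CompoundWordTokenFilterBase.makeDictionary(Version.LUCENE_31,
        new String[] { "donau", "dampf", "schiff", "fahrt" });
  }
}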

View File

@ -23,6 +23,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter; // for javadocs
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
* A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
@ -33,7 +34,9 @@ import org.apache.lucene.analysis.TokenStream;
* </p>
*/
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
/**
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
* @param input the {@link TokenStream} to process
* @param dictionary the word dictionary to match against
@ -41,33 +44,39 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
* @param minSubwordSize only subwords longer than this get to the output stream
* @param maxSubwordSize only subwords shorter than this get to the output stream
* @param onlyLongestMatch Add only the longest matching subword to the stream
* @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[], int, int, int, boolean)} instead
*/
public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary,
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
/**
*
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
* @param input the {@link TokenStream} to process
* @param dictionary the word dictionary to match against
* @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[])} instead
*/
public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary) {
super(input, dictionary);
super(Version.LUCENE_30, input, dictionary);
}
/**
*
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
* @param input the {@link TokenStream} to process
* @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
* lower case strings.
* lower case strings.
* @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set)} instead
*/
public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary) {
super(input, dictionary);
super(Version.LUCENE_30, input, dictionary);
}
/**
*
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
* @param input the {@link TokenStream} to process
* @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
* lower case strings.
@ -75,10 +84,104 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
* @param minSubwordSize only subwords longer than this get to the output stream
* @param maxSubwordSize only subwords shorter than this get to the output stream
* @param onlyLongestMatch Add only the longest matching subword to the stream
* @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set, int, int, int, boolean)} instead
*/
public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary,
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
/**
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
* @param matchVersion
* Lucene version to enable correct Unicode 4.0 behavior in the
* dictionaries if Version > 3.0. See <a
* href="CompoundWordTokenFilterBase#version"
* >CompoundWordTokenFilterBase</a> for details.
* @param input
* the {@link TokenStream} to process
* @param dictionary
* the word dictionary to match against
* @param minWordSize
* only words longer than this get processed
* @param minSubwordSize
* only subwords longer than this get to the output stream
* @param maxSubwordSize
* only subwords shorter than this get to the output stream
* @param onlyLongestMatch
* Add only the longest matching subword to the stream
*/
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary,
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
/**
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
* @param matchVersion
* Lucene version to enable correct Unicode 4.0 behavior in the
* dictionaries if Version > 3.0. See <a
* href="CompoundWordTokenFilterBase#version"
* >CompoundWordTokenFilterBase</a> for details.
*
* @param input
* the {@link TokenStream} to process
* @param dictionary
* the word dictionary to match against
*/
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) {
super(matchVersion, input, dictionary);
}
/**
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
* @param matchVersion
* Lucene version to enable correct Unicode 4.0 behavior in the
* dictionaries if Version > 3.0. See <a
* href="CompoundWordTokenFilterBase#version"
* >CompoundWordTokenFilterBase</a> for details.
* @param input
* the {@link TokenStream} to process
* @param dictionary
* the word dictionary to match against. If this is a
* {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
* must have set ignoreCase=false and only contain lower case
* strings.
*/
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary) {
super(matchVersion, input, dictionary);
}
/**
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
* @param matchVersion
* Lucene version to enable correct Unicode 4.0 behavior in the
* dictionaries if Version > 3.0. See <a
* href="CompoundWordTokenFilterBase#version"
* >CompoundWordTokenFilterBase</a> for details.
* @param input
* the {@link TokenStream} to process
* @param dictionary
* the word dictionary to match against. If this is a
* {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
* must have set ignoreCase=false and only contain lower case
* strings.
* @param minWordSize
* only words longer than this get processed
* @param minSubwordSize
* only subwords longer than this get to the output stream
* @param maxSubwordSize
* only subwords shorter than this get to the output stream
* @param onlyLongestMatch
* Add only the longest matching subword to the stream
*/
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary,
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
@Override
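
Putting the new constructors together, a hedged usage sketch (class name, version, and word list are illustrative; the three-argument constructor applies the default size limits):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.util.Version;

class CompoundDecompositionDemo {
  static TokenStream decompose(String text) {
    // The Version argument is forwarded to the dictionary handling in
    // CompoundWordTokenFilterBase.
    return new DictionaryCompoundWordTokenFilter(Version.LUCENE_31,
        new WhitespaceTokenizer(new StringReader(text)),
        new String[] { "donau", "dampf", "schiff" });
  }
}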

View File

@ -28,6 +28,7 @@ import org.apache.lucene.analysis.TokenFilter; // for javadocs
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.util.Version;
import org.xml.sax.InputSource;
/**
@ -41,20 +42,31 @@ import org.xml.sax.InputSource;
public class HyphenationCompoundWordTokenFilter extends
CompoundWordTokenFilterBase {
private HyphenationTree hyphenator;
/**
*
* @param input the {@link TokenStream} to process
* @param hyphenator the hyphenation pattern tree to use for hyphenation
* @param dictionary the word dictionary to match against
* @param minWordSize only words longer than this get processed
* @param minSubwordSize only subwords longer than this get to the output
* stream
* @param maxSubwordSize only subwords shorter than this get to the output
* stream
* @param onlyLongestMatch Add only the longest matching subword to the stream
* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
*
* @param matchVersion
* Lucene version to enable correct Unicode 4.0 behavior in the
* dictionaries if Version > 3.0. See <a
* href="CompoundWordTokenFilterBase#version"
* >CompoundWordTokenFilterBase</a> for details.
* @param input
* the {@link TokenStream} to process
* @param hyphenator
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
* the word dictionary to match against
* @param minWordSize
* only words longer than this get processed
* @param minSubwordSize
* only subwords longer than this get to the output stream
* @param maxSubwordSize
* only subwords shorter than this get to the output stream
* @param onlyLongestMatch
* Add only the longest matching subword to the stream
*/
public HyphenationCompoundWordTokenFilter(TokenStream input,
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, String[] dictionary, int minWordSize,
int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
this(input, hyphenator, makeDictionary(dictionary), minWordSize,
@ -62,32 +74,138 @@ public class HyphenationCompoundWordTokenFilter extends
}
/**
*
* @param input the {@link TokenStream} to process
* @param hyphenator the hyphenation pattern tree to use for hyphenation
* @param dictionary the word dictionary to match against
* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
*
* @param matchVersion
* Lucene version to enable correct Unicode 4.0 behavior in the
* dictionaries if Version > 3.0. See <a
* href="CompoundWordTokenFilterBase#version"
* >CompoundWordTokenFilterBase</a> for details.
* @param input
* the {@link TokenStream} to process
* @param hyphenator
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
* the word dictionary to match against
*/
public HyphenationCompoundWordTokenFilter(TokenStream input,
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, String[] dictionary) {
this(input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
/**
* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
*
* @param input the {@link TokenStream} to process
* @param hyphenator the hyphenation pattern tree to use for hyphenation
* @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
* lower case strings.
* @param matchVersion
* Lucene version to enable correct Unicode 4.0 behavior in the
* dictionaries if Version > 3.0. See <a
* href="CompoundWordTokenFilterBase#version"
* >CompoundWordTokenFilterBase</a> for details.
* @param input
* the {@link TokenStream} to process
* @param hyphenator
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
* the word dictionary to match against. If this is a
* {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
* must have set ignoreCase=false and only contain lower case
* strings.
*/
public HyphenationCompoundWordTokenFilter(TokenStream input,
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, Set dictionary) {
this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
/**
* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
*
* @param matchVersion
* Lucene version to enable correct Unicode 4.0 behavior in the
* dictionaries if Version > 3.0. See <a
* href="CompoundWordTokenFilterBase#version"
* >CompoundWordTokenFilterBase</a> for details.
* @param input
* the {@link TokenStream} to process
* @param hyphenator
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
* the word dictionary to match against. If this is a
* {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
* must have set ignoreCase=false and only contain lower case
* strings.
* @param minWordSize
* only words longer than this get processed
* @param minSubwordSize
* only subwords longer than this get to the output stream
* @param maxSubwordSize
* only subwords shorter than this get to the output stream
* @param onlyLongestMatch
* Add only the longest matching subword to the stream
*/
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, Set dictionary, int minWordSize,
int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
onlyLongestMatch);
this.hyphenator = hyphenator;
}
/**
* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
*
* @param input the {@link TokenStream} to process
* @param hyphenator the hyphenation pattern tree to use for hyphenation
* @param dictionary the word dictionary to match against
* @param minWordSize only words longer than this get processed
* @param minSubwordSize only subwords longer than this get to the output
* stream
* @param maxSubwordSize only subwords shorter than this get to the output
* stream
* @param onlyLongestMatch Add only the longest matching subword to the stream
* @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[], int, int, int, boolean)} instead.
*/
public HyphenationCompoundWordTokenFilter(TokenStream input,
HyphenationTree hyphenator, String[] dictionary, int minWordSize,
int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), minWordSize,
minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
/**
* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
*
* @param input the {@link TokenStream} to process
* @param hyphenator the hyphenation pattern tree to use for hyphenation
* @param dictionary the word dictionary to match against
* @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[])} instead.
*/
public HyphenationCompoundWordTokenFilter(TokenStream input,
HyphenationTree hyphenator, String[] dictionary) {
this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
/**
* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
*
* @param input the {@link TokenStream} to process
* @param hyphenator the hyphenation pattern tree to use for hyphenation
* @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
* lower case strings.
* @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set)} instead.
*/
public HyphenationCompoundWordTokenFilter(TokenStream input,
HyphenationTree hyphenator, Set dictionary) {
this(Version.LUCENE_30, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
/**
* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
*
* @param input the {@link TokenStream} to process
* @param hyphenator the hyphenation pattern tree to use for hyphenation
* @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
@ -98,11 +216,12 @@ public class HyphenationCompoundWordTokenFilter extends
* @param maxSubwordSize only subwords shorter than this get to the output
* stream
* @param onlyLongestMatch Add only the longest matching subword to the stream
* @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)} instead.
*/
public HyphenationCompoundWordTokenFilter(TokenStream input,
HyphenationTree hyphenator, Set dictionary, int minWordSize,
int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
onlyLongestMatch);
this.hyphenator = hyphenator;
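
A hedged construction sketch for the hyphenation variant, assuming the long-standing getHyphenationTree(String) helper (which declares throws Exception); the class name, pattern-file path, and word list are illustrative:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.util.Version;

class HyphenationDecompositionDemo {
  static TokenStream decompose(String text) throws Exception {
    // "de_DR.xml" stands in for a real hyphenation grammar file (hypothetical path).
    HyphenationTree hyphenator =
        HyphenationCompoundWordTokenFilter.getHyphenationTree("de_DR.xml");
    return new HyphenationCompoundWordTokenFilter(Version.LUCENE_31,
        new WhitespaceTokenizer(new StringReader(text)),
        hyphenator, new String[] { "fuss", "ball", "pumpe" });
  }
}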

View File

@ -92,7 +92,7 @@ public final class CzechAnalyzer extends Analyzer {
private static class DefaultSetHolder {
private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
Arrays.asList(CZECH_STOP_WORDS), false));
Version.LUCENE_CURRENT, Arrays.asList(CZECH_STOP_WORDS), false));
}
/**
@ -121,7 +121,7 @@ public final class CzechAnalyzer extends Analyzer {
*/
public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
this.matchVersion = matchVersion;
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
}
@ -134,7 +134,7 @@ public final class CzechAnalyzer extends Analyzer {
* @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
*/
public CzechAnalyzer(Version matchVersion, String... stopwords) {
this(matchVersion, StopFilter.makeStopSet( stopwords ));
this(matchVersion, StopFilter.makeStopSet( matchVersion, stopwords ));
}
/**
@ -206,8 +206,7 @@ public final class CzechAnalyzer extends Analyzer {
TokenStream result = new StandardTokenizer( matchVersion, reader );
result = new StandardFilter( result );
result = new LowerCaseFilter( matchVersion, result );
result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stoptable );
result = new StopFilter( matchVersion, result, stoptable );
if (matchVersion.onOrAfter(Version.LUCENE_31))
result = new CzechStemFilter(result);
return result;
@ -236,8 +235,7 @@ public final class CzechAnalyzer extends Analyzer {
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
streams.result = new LowerCaseFilter(matchVersion, streams.result);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stoptable);
streams.result = new StopFilter( matchVersion, streams.result, stoptable);
if (matchVersion.onOrAfter(Version.LUCENE_31))
streams.result = new CzechStemFilter(streams.result);
setPreviousTokenStream(streams);

View File

@ -83,7 +83,7 @@ public class GermanAnalyzer extends Analyzer {
private static class DefaultSetHolder {
private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
Arrays.asList(GERMAN_STOP_WORDS), false));
Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
}
/**
@ -131,8 +131,8 @@ public class GermanAnalyzer extends Analyzer {
* a stemming exclusion set
*/
public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
setOverridesTokenStreamMethod(GermanAnalyzer.class);
this.matchVersion = matchVersion;
}
@ -142,7 +142,7 @@ public class GermanAnalyzer extends Analyzer {
* @deprecated use {@link #GermanAnalyzer(Version, Set)}
*/
public GermanAnalyzer(Version matchVersion, String... stopwords) {
this(matchVersion, StopFilter.makeStopSet(stopwords));
this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
}
/**
@ -167,7 +167,7 @@ public class GermanAnalyzer extends Analyzer {
* @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(String[] exclusionlist) {
exclusionSet = StopFilter.makeStopSet(exclusionlist);
exclusionSet = StopFilter.makeStopSet(matchVersion, exclusionlist);
setPreviousTokenStream(null); // force a new stemmer to be created
}
@ -175,8 +175,8 @@ public class GermanAnalyzer extends Analyzer {
* Builds an exclusionlist from a {@link Map}
* @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(Map exclusionlist) {
exclusionSet = new HashSet(exclusionlist.keySet());
public void setStemExclusionTable(Map<?,?> exclusionlist) {
exclusionSet = new HashSet<Object>(exclusionlist.keySet());
setPreviousTokenStream(null); // force a new stemmer to be created
}
@ -201,8 +201,7 @@ public class GermanAnalyzer extends Analyzer {
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stopSet);
result = new StopFilter( matchVersion, result, stopSet);
result = new GermanStemFilter(result, exclusionSet);
return result;
}
@ -235,8 +234,7 @@ public class GermanAnalyzer extends Analyzer {
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
streams.result = new LowerCaseFilter(matchVersion, streams.result);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stopSet);
streams.result = new StopFilter( matchVersion, streams.result, stopSet);
streams.result = new GermanStemFilter(streams.result, exclusionSet);
setPreviousTokenStream(streams);
} else {

View File

@ -70,7 +70,7 @@ public final class GreekAnalyzer extends Analyzer
private static class DefaultSetHolder {
private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
Arrays.asList(GREEK_STOP_WORDS), false));
Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false));
}
/**
@ -93,7 +93,7 @@ public final class GreekAnalyzer extends Analyzer
* a stopword set
*/
public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.matchVersion = matchVersion;
}
@ -104,7 +104,7 @@ public final class GreekAnalyzer extends Analyzer
*/
public GreekAnalyzer(Version matchVersion, String... stopwords)
{
this(matchVersion, StopFilter.makeStopSet(stopwords));
this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
}
/**
@ -127,8 +127,7 @@ public final class GreekAnalyzer extends Analyzer
{
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new GreekLowerCaseFilter(result);
result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stopSet);
result = new StopFilter(matchVersion, result, stopSet);
return result;
}
@ -152,8 +151,7 @@ public final class GreekAnalyzer extends Analyzer
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new GreekLowerCaseFilter(streams.source);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stopSet);
streams.result = new StopFilter(matchVersion, streams.result, stopSet);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);

View File

@ -126,7 +126,7 @@ public final class PersianAnalyzer extends Analyzer {
* a stopword set
*/
public PersianAnalyzer(Version matchVersion, Set<?> stopwords){
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.matchVersion = matchVersion;
}
@ -135,7 +135,7 @@ public final class PersianAnalyzer extends Analyzer {
* @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
*/
public PersianAnalyzer(Version matchVersion, String... stopwords) {
this(matchVersion, StopFilter.makeStopSet(stopwords));
this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
}
/**
@ -175,8 +175,7 @@ public final class PersianAnalyzer extends Analyzer {
* the order here is important: the stopword list is normalized with the
* above!
*/
result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stoptable);
result = new StopFilter(matchVersion, result, stoptable);
return result;
}
@ -209,8 +208,7 @@ public final class PersianAnalyzer extends Analyzer {
* the order here is important: the stopword list is normalized with the
* above!
*/
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stoptable);
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
/**
* Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
@ -35,44 +36,77 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
* @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>
*/
public final class ElisionFilter extends TokenFilter {
private CharArraySet articles = null;
private TermAttribute termAtt;
private CharArraySet articles = CharArraySet.EMPTY_SET;
private final TermAttribute termAtt;
private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(
"l", "m", "t", "qu", "n", "s", "j"), true));
private static char[] apostrophes = {'\'', '’'};
private static char[] apostrophes = {'\'', '\u2019'};
/**
* Set the stopword articles
* @param matchVersion the lucene backwards compatibility version
* @param articles a set of articles
* @deprecated use {@link #ElisionFilter(Version, TokenStream, Set)} instead
*/
public void setArticles(Version matchVersion, Set<?> articles) {
this.articles = CharArraySet.unmodifiableSet(
CharArraySet.copy(matchVersion, articles));
}
/**
* Set the stopword articles
* @param articles a set of articles
* @deprecated use {@link #setArticles(Version, Set)} instead
*/
public void setArticles(Set<?> articles) {
if (articles instanceof CharArraySet)
this.articles = (CharArraySet) articles;
else
this.articles = new CharArraySet(articles, true);
setArticles(Version.LUCENE_CURRENT, articles);
}
/**
* Constructs an elision filter with standard stop words
*/
protected ElisionFilter(Version matchVersion, TokenStream input) {
this(matchVersion, input, DEFAULT_ARTICLES);
}
/**
* Constructs an elision filter with standard stop words
* @deprecated use {@link #ElisionFilter(Version, TokenStream)} instead
*/
protected ElisionFilter(TokenStream input) {
super(input);
this.articles = new CharArraySet(Arrays.asList(
"l", "m", "t", "qu", "n", "s", "j"), true);
termAtt = addAttribute(TermAttribute.class);
this(Version.LUCENE_30, input);
}
/**
* Constructs an elision filter with a Set of stop words
* @deprecated use {@link #ElisionFilter(Version, TokenStream, Set)} instead
*/
public ElisionFilter(TokenStream input, Set<?> articles) {
this(Version.LUCENE_30, input, articles);
}
/**
* Constructs an elision filter with a Set of stop words
* @param matchVersion the lucene backwards compatibility version
* @param input the source {@link TokenStream}
* @param articles a set of stopword articles
*/
public ElisionFilter(Version matchVersion, TokenStream input, Set<?> articles) {
super(input);
setArticles(articles);
this.articles = CharArraySet.unmodifiableSet(
new CharArraySet(matchVersion, articles, true));
termAtt = addAttribute(TermAttribute.class);
}
/**
* Constructs an elision filter with an array of stop words
* @deprecated use {@link #ElisionFilter(Version, TokenStream, Set)} instead
*/
public ElisionFilter(TokenStream input, String[] articles) {
super(input);
this.articles = new CharArraySet(Arrays.asList(articles), true);
termAtt = addAttribute(TermAttribute.class);
this(Version.LUCENE_CURRENT, input,
new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(articles), true));
}
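
A usage sketch for the new public Version-aware constructor (class name, input text, and article set are illustrative, not from this commit):

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

class ElisionDemo {
  static TokenFilter elide(String text) {
    CharArraySet articles = new CharArraySet(Version.LUCENE_31,
        Arrays.asList("l", "d"), true); // ignoreCase=true
    // "l'avion" -> "avion": the article before the apostrophe is stripped.
    return new ElisionFilter(Version.LUCENE_31,
        new StandardTokenizer(Version.LUCENE_31, new StringReader(text)),
        articles);
  }
}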
/**

View File

@ -34,6 +34,7 @@ import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
@ -98,7 +99,7 @@ public final class FrenchAnalyzer extends Analyzer {
* Contains words that should be indexed but not stemmed.
*/
//TODO make this final in 3.0
private Set<?> excltable = new HashSet();
private Set<?> excltable = Collections.<Object>emptySet();
private final Version matchVersion;
@ -112,7 +113,7 @@ public final class FrenchAnalyzer extends Analyzer {
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET = CharArraySet
.unmodifiableSet(new CharArraySet(Arrays.asList(FRENCH_STOP_WORDS),
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
false));
}
@ -148,9 +149,10 @@ public final class FrenchAnalyzer extends Analyzer {
public FrenchAnalyzer(Version matchVersion, Set<?> stopwords,
Set<?> stemExclutionSet) {
this.matchVersion = matchVersion;
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
this.stoptable = CharArraySet.unmodifiableSet(CharArraySet
.copy(matchVersion, stopwords));
this.excltable = CharArraySet.unmodifiableSet(CharArraySet
.copy(stemExclutionSet));
.copy(matchVersion, stemExclutionSet));
}
@ -159,7 +161,7 @@ public final class FrenchAnalyzer extends Analyzer {
* @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
*/
public FrenchAnalyzer(Version matchVersion, String... stopwords) {
this(matchVersion, StopFilter.makeStopSet(stopwords));
this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
}
/**
@ -176,7 +178,7 @@ public final class FrenchAnalyzer extends Analyzer {
* @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(String... exclusionlist) {
excltable = StopFilter.makeStopSet(exclusionlist);
excltable = StopFilter.makeStopSet(matchVersion, exclusionlist);
setPreviousTokenStream(null); // force a new stemmer to be created
}
@ -184,8 +186,8 @@ public final class FrenchAnalyzer extends Analyzer {
* Builds an exclusionlist from a Map.
* @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(Map exclusionlist) {
excltable = new HashSet(exclusionlist.keySet());
public void setStemExclusionTable(Map<?,?> exclusionlist) {
excltable = new HashSet<Object>(exclusionlist.keySet());
setPreviousTokenStream(null); // force a new stemmer to be created
}
@ -195,7 +197,7 @@ public final class FrenchAnalyzer extends Analyzer {
* @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(File exclusionlist) throws IOException {
excltable = new HashSet(WordlistLoader.getWordSet(exclusionlist));
excltable = new HashSet<Object>(WordlistLoader.getWordSet(exclusionlist));
setPreviousTokenStream(null); // force a new stemmer to be created
}
@ -211,8 +213,7 @@ public final class FrenchAnalyzer extends Analyzer {
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new StandardFilter(result);
result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stoptable);
result = new StopFilter(matchVersion, result, stoptable);
result = new FrenchStemFilter(result, excltable);
// Convert to lowercase after stemming!
result = new LowerCaseFilter(matchVersion, result);
@ -240,8 +241,7 @@ public final class FrenchAnalyzer extends Analyzer {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stoptable);
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
streams.result = new FrenchStemFilter(streams.result, excltable);
// Convert to lowercase after stemming!
streams.result = new LowerCaseFilter(matchVersion, streams.result);

View File

@ -73,7 +73,8 @@ public class PatternAnalyzer extends Analyzer {
public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
private static final CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
CharArraySet.unmodifiableSet(new CharArraySet(Arrays.asList(
CharArraySet.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(
"a", "about", "above", "across", "adj", "after", "afterwards",
"again", "against", "albeit", "all", "almost", "alone", "along",
"already", "also", "although", "always", "among", "amongst", "an",
@ -153,7 +154,7 @@ public class PatternAnalyzer extends Analyzer {
* if non-null, ignores all tokens that are contained in the
* given stop set (after previously having applied toLowerCase()
* if applicable). For example, created via
* {@link StopFilter#makeStopSet(String[])}and/or
* {@link StopFilter#makeStopSet(Version, String[])}and/or
* {@link org.apache.lucene.analysis.WordlistLoader}as in
* <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")</code>
* or <a href="http://www.unine.ch/info/clef/">other stop words
@ -199,7 +200,7 @@ public class PatternAnalyzer extends Analyzer {
}
else {
stream = new PatternTokenizer(text, pattern, toLowerCase);
if (stopWords != null) stream = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), stream, stopWords);
if (stopWords != null) stream = new StopFilter(matchVersion, stream, stopWords);
}
return stream;
@ -387,12 +388,12 @@ public class PatternAnalyzer extends Analyzer {
private int pos;
private final boolean isLetter;
private final boolean toLowerCase;
private final Set stopWords;
private final Set<?> stopWords;
private static final Locale locale = Locale.getDefault();
private TermAttribute termAtt = addAttribute(TermAttribute.class);
private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
this.str = str;
this.isLetter = isLetter;
this.toLowerCase = toLowerCase;

View File

@ -80,8 +80,8 @@ public class DutchAnalyzer extends Analyzer {
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET = CharArraySet
.unmodifiableSet(new CharArraySet(Arrays.asList(DUTCH_STOP_WORDS),
false));
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(DUTCH_STOP_WORDS), false));
}
@ -116,8 +116,8 @@ public class DutchAnalyzer extends Analyzer {
}
public DutchAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable){
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionTable));
stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
this.matchVersion = matchVersion;
setOverridesTokenStreamMethod(DutchAnalyzer.class);
}
@ -130,7 +130,7 @@ public class DutchAnalyzer extends Analyzer {
* @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
*/
public DutchAnalyzer(Version matchVersion, String... stopwords) {
this(matchVersion, StopFilter.makeStopSet(stopwords));
this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
}
/**
@ -168,7 +168,7 @@ public class DutchAnalyzer extends Analyzer {
* @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
*/
public void setStemExclusionTable(String... exclusionlist) {
excltable = StopFilter.makeStopSet(exclusionlist);
excltable = StopFilter.makeStopSet(matchVersion, exclusionlist);
setPreviousTokenStream(null); // force a new stemmer to be created
}
@ -222,8 +222,7 @@ public class DutchAnalyzer extends Analyzer {
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new StandardFilter(result);
result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stoptable);
result = new StopFilter(matchVersion, result, stoptable);
result = new DutchStemFilter(result, excltable, stemdict);
return result;
}
@ -256,8 +255,7 @@ public class DutchAnalyzer extends Analyzer {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stoptable);
streams.result = new StopFilter(matchVersion, streams.result, stoptable);
streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
setPreviousTokenStream(streams);
} else {

View File

@ -179,8 +179,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
}
HashSet<String> stopWords = stopWordsPerField.get(fieldName);
if (stopWords != null) {
result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stopWords);
result = new StopFilter(matchVersion, result, stopWords);
}
return result;
}
@ -223,8 +222,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
/* if there are any stopwords for the field, save the stopfilter */
HashSet<String> stopWords = stopWordsPerField.get(fieldName);
if (stopWords != null)
streams.withStopFilter = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.wrapped, stopWords);
streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
else
streams.withStopFilter = streams.wrapped;
@ -245,8 +243,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
streams.wrapped = result;
HashSet<String> stopWords = stopWordsPerField.get(fieldName);
if (stopWords != null)
streams.withStopFilter = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.wrapped, stopWords);
streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
else
streams.withStopFilter = streams.wrapped;
}

View File

@ -59,8 +59,8 @@ public final class RussianAnalyzer extends Analyzer
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET = CharArraySet
.unmodifiableSet(new CharArraySet(Arrays.asList(RUSSIAN_STOP_WORDS),
false));
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(RUSSIAN_STOP_WORDS), false));
}
/**
@ -79,7 +79,7 @@ public final class RussianAnalyzer extends Analyzer
* @deprecated use {@link #RussianAnalyzer(Version, Set)} instead
*/
public RussianAnalyzer(Version matchVersion, String... stopwords) {
this(matchVersion, StopFilter.makeStopSet(stopwords));
this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
}
/**
@ -91,7 +91,7 @@ public final class RussianAnalyzer extends Analyzer
* a stopword set
*/
public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
this.matchVersion = matchVersion;
}
@ -119,8 +119,7 @@ public final class RussianAnalyzer extends Analyzer
{
TokenStream result = new RussianLetterTokenizer(reader);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stopSet);
result = new StopFilter(matchVersion, result, stopSet);
result = new RussianStemFilter(result);
return result;
}
@ -147,8 +146,7 @@ public final class RussianAnalyzer extends Analyzer
streams = new SavedStreams();
streams.source = new RussianLetterTokenizer(reader);
streams.result = new LowerCaseFilter(matchVersion, streams.source);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stopSet);
streams.result = new StopFilter(matchVersion, streams.result, stopSet);
streams.result = new RussianStemFilter(streams.result);
setPreviousTokenStream(streams);
} else {

View File

@ -48,8 +48,7 @@ public class ThaiAnalyzer extends Analyzer {
TokenStream ts = new StandardTokenizer(matchVersion, reader);
ts = new StandardFilter(ts);
ts = new ThaiWordFilter(ts);
ts = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
ts = new StopFilter(matchVersion, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
return ts;
}
@ -73,8 +72,7 @@ public class ThaiAnalyzer extends Analyzer {
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
streams.result = new ThaiWordFilter(streams.result);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
streams.result = new StopFilter(matchVersion, streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);

View File

@ -42,7 +42,7 @@ public class TestElision extends BaseTokenStreamTestCase {
Set articles = new HashSet();
articles.add("l");
articles.add("M");
TokenFilter filter = new ElisionFilter(tokenizer, articles);
TokenFilter filter = new ElisionFilter(Version.LUCENE_CURRENT, tokenizer, articles);
List tas = filtre(filter);
assertEquals("embrouille", tas.get(4));
assertEquals("O'brian", tas.get(6));

View File

@ -153,8 +153,7 @@ public class SmartChineseAnalyzer extends Analyzer {
// The porter stemming is too strict, this is not a bug, this is a feature:)
result = new PorterStemFilter(result);
if (!stopWords.isEmpty()) {
result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stopWords, false);
result = new StopFilter(matchVersion, result, stopWords, false);
}
return result;
}
@ -175,8 +174,7 @@ public class SmartChineseAnalyzer extends Analyzer {
streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream);
streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream);
if (!stopWords.isEmpty()) {
streams.filteredTokenStream = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.filteredTokenStream, stopWords, false);
streams.filteredTokenStream = new StopFilter(matchVersion, streams.filteredTokenStream, stopWords, false);
}
} else {
streams.tokenStream.reset(reader);

View File

@ -1076,7 +1076,7 @@ public class TestQPHelper extends LocalizedTestCase {
public void testStopwords() throws Exception {
StandardQueryParser qp = new StandardQueryParser();
qp.setAnalyzer(
new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet("the", "foo" )));
new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "foo" )));
Query result = qp.parse("a:the OR a:foo", "a");
assertNotNull("result is null and it shouldn't be", result);
@ -1099,7 +1099,7 @@ public class TestQPHelper extends LocalizedTestCase {
public void testPositionIncrement() throws Exception {
StandardQueryParser qp = new StandardQueryParser();
qp.setAnalyzer(
new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet("the", "in", "are", "this" )));
new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "in", "are", "this" )));
qp.setEnablePositionIncrements(true);

View File

@ -1056,7 +1056,7 @@ public class TestQueryParserWrapper extends LocalizedTestCase {
}
public void testStopwords() throws Exception {
QueryParserWrapper qp = new QueryParserWrapper("a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet("the", "foo")));
QueryParserWrapper qp = new QueryParserWrapper("a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "foo")));
Query result = qp.parse("a:the OR a:foo");
assertNotNull("result is null and it shouldn't be", result);
assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
@ -1075,7 +1075,7 @@ public class TestQueryParserWrapper extends LocalizedTestCase {
}
public void testPositionIncrement() throws Exception {
QueryParserWrapper qp = new QueryParserWrapper("a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet("the", "in", "are", "this")));
QueryParserWrapper qp = new QueryParserWrapper("a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "in", "are", "this")));
qp.setEnablePositionIncrements(true);
String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
// 0 2 5 7 8

View File

@ -50,7 +50,7 @@ public class SnowballAnalyzer extends Analyzer {
/** Builds the named analyzer with the given stop words. */
public SnowballAnalyzer(Version matchVersion, String name, String[] stopWords) {
this(matchVersion, name);
stopSet = StopFilter.makeStopSet(stopWords);
stopSet = StopFilter.makeStopSet(matchVersion, stopWords);
}
/** Constructs a {@link StandardTokenizer} filtered by a {@link
@ -62,7 +62,7 @@ public class SnowballAnalyzer extends Analyzer {
result = new StandardFilter(result);
result = new LowerCaseFilter(matchVersion, result);
if (stopSet != null)
result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result = new StopFilter(matchVersion,
result, stopSet);
result = new SnowballFilter(result, name);
return result;
@ -93,7 +93,7 @@ public class SnowballAnalyzer extends Analyzer {
streams.result = new StandardFilter(streams.source);
streams.result = new LowerCaseFilter(matchVersion, streams.result);
if (stopSet != null)
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result = new StopFilter(matchVersion,
streams.result, stopSet);
streams.result = new SnowballFilter(streams.result, name);
setPreviousTokenStream(streams);
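
SnowballAnalyzer follows the same pattern, threading matchVersion into both makeStopSet and the StopFilter. A small usage sketch (class name and sample text invented; the exact stems depend on the Snowball "English" stemmer):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
import org.apache.lucene.util.Version;

public class SnowballAnalyzerDemo {
  public static void main(String[] args) {
    SnowballAnalyzer analyzer = new SnowballAnalyzer(Version.LUCENE_31,
        "English", new String[] {"the", "a"});
    // "the" is stopped before stemming; "running"/"dogs" are stemmed.
    TokenStream ts = analyzer.tokenStream("body",
        new StringReader("the running dogs"));
    // consume ts via incrementToken() as in the tests elsewhere in this patch
  }
}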

View File

@ -6,6 +6,9 @@ import java.util.Collections;
import java.util.Iterator;
import java.util.Set;
import org.apache.lucene.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -32,45 +35,113 @@ import java.util.Set;
* etc. It is designed to be quick to test if a char[]
* is in the set without the necessity of converting it
* to a String first.
* <p>You must specify the required {@link Version}
* compatibility when creating {@link CharArraySet}:
* <ul>
* <li> As of 3.1, supplementary characters are
* properly lowercased.</li>
* </ul>
* Before 3.1 supplementary characters could not be
* lowercased correctly due to the lack of Unicode 4
* support in JDK 1.4. To use instances of
* {@link CharArraySet} with the behavior before Lucene
 * 3.1, pass a {@link Version} &lt; 3.1 to the constructors.
 * <p>
* <em>Please note:</em> This class implements {@link java.util.Set Set} but
* does not behave like it should in all cases. The generic type is
* {@code Set<Object>}, because you can add any object to it,
* that has a string representation. The add methods will use
* {@link Object#toString} and store the result using a {@code char[]}
* buffer. The same behaviour have the {@code contains()} methods.
 * buffer. The {@code contains()} methods behave in the same way.
* The {@link #iterator()} returns an {@code Iterator<String>}.
* For type safety also {@link #stringIterator()} is provided.
*/
public class CharArraySet extends AbstractSet<Object> {
private final static int INIT_SIZE = 8;
private char[][] entries;
private int count;
private final boolean ignoreCase;
public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet(new CharArraySet(0, false));
public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet(
new CharArraySet(Version.LUCENE_CURRENT, 0, false));
private final CharacterUtils charUtils;
private final Version matchVersion;
/** Create set with enough capacity to hold startSize
* terms */
public CharArraySet(int startSize, boolean ignoreCase) {
/**
 * Creates a set with enough capacity to hold startSize terms
*
* @param matchVersion
 *          compatibility match version; see <a href="#version">Version
* note</a> above for details.
* @param startSize
* the initial capacity
* @param ignoreCase
 *          <code>false</code> if and only if the set should be case sensitive,
* otherwise <code>true</code>.
*/
public CharArraySet(Version matchVersion, int startSize, boolean ignoreCase) {
this.ignoreCase = ignoreCase;
int size = INIT_SIZE;
while(startSize + (startSize>>2) > size)
size <<= 1;
entries = new char[size][];
this.charUtils = CharacterUtils.getInstance(matchVersion);
this.matchVersion = matchVersion;
}
/** Create set from a Collection of char[] or String */
/**
* Creates a set from a Collection of objects.
*
* @param matchVersion
 *          compatibility match version; see <a href="#version">Version
* note</a> above for details.
* @param c
 *          a collection whose elements are to be placed into the set
* @param ignoreCase
 *          <code>false</code> if and only if the set should be case sensitive,
* otherwise <code>true</code>.
*/
public CharArraySet(Version matchVersion, Collection<? extends Object> c, boolean ignoreCase) {
this(matchVersion, c.size(), ignoreCase);
addAll(c);
}
/**
* Creates a set with enough capacity to hold startSize terms
*
* @param startSize
* the initial capacity
* @param ignoreCase
 *          <code>false</code> if and only if the set should be case sensitive,
* otherwise <code>true</code>.
* @deprecated use {@link #CharArraySet(Version, int, boolean)} instead
*/
public CharArraySet(int startSize, boolean ignoreCase) {
this(Version.LUCENE_30, startSize, ignoreCase);
}
/**
* Creates a set from a Collection of objects.
*
* @param c
 *          a collection whose elements are to be placed into the set
* @param ignoreCase
 *          <code>false</code> if and only if the set should be case sensitive,
* otherwise <code>true</code>.
* @deprecated use {@link #CharArraySet(Version, Collection, boolean)} instead
*/
public CharArraySet(Collection<? extends Object> c, boolean ignoreCase) {
this(c.size(), ignoreCase);
this(Version.LUCENE_30, c.size(), ignoreCase);
addAll(c);
}
/** Create set from entries */
private CharArraySet(char[][] entries, boolean ignoreCase, int count){
private CharArraySet(Version matchVersion, char[][] entries, boolean ignoreCase, int count){
this.entries = entries;
this.ignoreCase = ignoreCase;
this.count = count;
this.charUtils = CharacterUtils.getInstance(matchVersion);
this.matchVersion = matchVersion;
}
/** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>
@ -131,8 +202,11 @@ public class CharArraySet extends AbstractSet<Object> {
*/
public boolean add(char[] text) {
if (ignoreCase)
for(int i=0;i<text.length;i++)
text[i] = Character.toLowerCase(text[i]);
for(int i=0;i<text.length;){
i += Character.toChars(
Character.toLowerCase(
charUtils.codePointAt(text, i)), text, i);
}
int slot = getSlot(text, 0, text.length);
if (entries[slot] != null) return false;
entries[slot] = text;
@ -148,10 +222,13 @@ public class CharArraySet extends AbstractSet<Object> {
private boolean equals(char[] text1, int off, int len, char[] text2) {
if (len != text2.length)
return false;
final int limit = off+len;
if (ignoreCase) {
for(int i=0;i<len;i++) {
if (Character.toLowerCase(text1[off+i]) != text2[i])
for(int i=0;i<len;) {
final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
return false;
i += Character.charCount(codePointAt);
}
} else {
for(int i=0;i<len;i++) {
@ -167,9 +244,11 @@ public class CharArraySet extends AbstractSet<Object> {
if (len != text2.length)
return false;
if (ignoreCase) {
for(int i=0;i<len;i++) {
if (Character.toLowerCase(text1.charAt(i)) != text2[i])
for(int i=0;i<len;) {
final int codePointAt = charUtils.codePointAt(text1, i);
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
return false;
i += Character.charCount(codePointAt);
}
} else {
for(int i=0;i<len;i++) {
@ -179,6 +258,8 @@ public class CharArraySet extends AbstractSet<Object> {
}
return true;
}
private void rehash() {
final int newSize = 2*entries.length;
@ -198,8 +279,10 @@ public class CharArraySet extends AbstractSet<Object> {
int code = 0;
final int stop = offset + len;
if (ignoreCase) {
for (int i=offset; i<stop; i++) {
code = code*31 + Character.toLowerCase(text[i]);
for (int i=offset; i<stop;) {
final int codePointAt = charUtils.codePointAt(text, i, stop);
code = code*31 + Character.toLowerCase(codePointAt);
i += Character.charCount(codePointAt);
}
} else {
for (int i=offset; i<stop; i++) {
@ -213,8 +296,10 @@ public class CharArraySet extends AbstractSet<Object> {
int code = 0;
int len = text.length();
if (ignoreCase) {
for (int i=0; i<len; i++) {
code = code*31 + Character.toLowerCase(text.charAt(i));
for (int i=0; i<len;) {
int codePointAt = charUtils.codePointAt(text, i);
code = code*31 + Character.toLowerCase(codePointAt);
i += Character.charCount(codePointAt);
}
} else {
for (int i=0; i<len; i++) {
@ -274,7 +359,7 @@ public class CharArraySet extends AbstractSet<Object> {
* Instead of delegating calls to the given set copy the low-level values to
* the unmodifiable Subclass
*/
return new UnmodifiableCharArraySet(set.entries, set.ignoreCase, set.count);
return new UnmodifiableCharArraySet(set.matchVersion, set.entries, set.ignoreCase, set.count);
}
/**
@ -286,15 +371,33 @@ public class CharArraySet extends AbstractSet<Object> {
* @return a copy of the given set as a {@link CharArraySet}. If the given set
* is a {@link CharArraySet} the ignoreCase property will be
* preserved.
* @deprecated use {@link #copy(Version, Set)} instead
*/
public static CharArraySet copy(Set<?> set) {
return copy(Version.LUCENE_30, set);
}
/**
* Returns a copy of the given set as a {@link CharArraySet}. If the given set
* is a {@link CharArraySet} the ignoreCase property will be preserved.
*
* @param matchVersion
 *          compatibility match version; see <a href="#version">Version
* note</a> above for details.
* @param set
* a set to copy
* @return a copy of the given set as a {@link CharArraySet}. If the given set
* is a {@link CharArraySet} the ignoreCase property will be
* preserved.
*/
public static CharArraySet copy(Version matchVersion, Set<?> set) {
if (set == null)
throw new NullPointerException("Given set is null");
if(set == EMPTY_SET)
return EMPTY_SET;
final boolean ignoreCase = set instanceof CharArraySet ? ((CharArraySet) set).ignoreCase
: false;
return new CharArraySet(set, ignoreCase);
return new CharArraySet(matchVersion, set, ignoreCase);
}
@ -356,9 +459,9 @@ public class CharArraySet extends AbstractSet<Object> {
*/
private static final class UnmodifiableCharArraySet extends CharArraySet {
private UnmodifiableCharArraySet(char[][] entries, boolean ignoreCase,
private UnmodifiableCharArraySet(Version matchVersion, char[][] entries, boolean ignoreCase,
int count) {
super(entries, ignoreCase, count);
super(matchVersion, entries, ignoreCase, count);
}
@Override
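
Taken together, the new Version-aware constructors make case-insensitive lookups of supplementary characters behave differently depending on the requested compatibility level. A small sketch of the observable difference (class name invented; U+1041C is a Deseret capital letter whose lowercase form is U+10444, the same pair used by the tests later in this patch):

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;

public class CharArraySetUnicodeDemo {
  public static void main(String[] args) {
    // ignoreCase == true, Unicode 4.0 semantics: the surrogate pair is
    // lowercased as one code point, so the lowercase probe matches.
    CharArraySet set31 = new CharArraySet(Version.LUCENE_31,
        Arrays.asList("Abc\ud801\udc1c"), true);
    System.out.println(set31.contains("abc\ud801\udc44")); // true
    // Pre-3.1 semantics: each surrogate char is "lowercased" individually
    // (a no-op for surrogates), so the probe misses.
    CharArraySet set30 = new CharArraySet(Version.LUCENE_30,
        Arrays.asList("Abc\ud801\udc1c"), true);
    System.out.println(set30.contains("abc\ud801\udc44")); // false
  }
}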

View File

@ -32,13 +32,15 @@ import org.apache.lucene.util.Version;
* <p>You must specify the required {@link Version}
* compatibility when creating StopAnalyzer:
* <ul>
* <li> As of 3.1, StopFilter correctly handles Unicode 4.0
* supplementary characters in stopwords
* <li> As of 2.9, position increments are preserved
* </ul>
*/
public final class StopAnalyzer extends Analyzer {
private final Set<?> stopWords;
private final boolean enablePositionIncrements;
private final Version matchVersion;
/** An unmodifiable set containing some common English words that are not usually useful
for searching.*/
@ -52,7 +54,8 @@ public final class StopAnalyzer extends Analyzer {
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
);
final CharArraySet stopSet = new CharArraySet(stopWords.size(), false);
final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT,
stopWords.size(), false);
stopSet.addAll(stopWords);
ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
}
@ -63,7 +66,7 @@ public final class StopAnalyzer extends Analyzer {
*/
public StopAnalyzer(Version matchVersion) {
stopWords = ENGLISH_STOP_WORDS_SET;
enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
this.matchVersion = matchVersion;
}
/** Builds an analyzer with the stop words from the given set.
@ -71,7 +74,7 @@ public final class StopAnalyzer extends Analyzer {
* @param stopWords Set of stop words */
public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
this.stopWords = stopWords;
enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
this.matchVersion = matchVersion;
}
/** Builds an analyzer with the stop words from the given file.
@ -80,7 +83,7 @@ public final class StopAnalyzer extends Analyzer {
* @param stopwordsFile File to load stop words from */
public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
stopWords = WordlistLoader.getWordSet(stopwordsFile);
this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
this.matchVersion = matchVersion;
}
/** Builds an analyzer with the stop words from the given reader.
@ -89,13 +92,14 @@ public final class StopAnalyzer extends Analyzer {
* @param stopwords Reader to load stop words from */
public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
stopWords = WordlistLoader.getWordSet(stopwords);
this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
this.matchVersion = matchVersion;
}
/** Filters LowerCaseTokenizer with StopFilter. */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords);
return new StopFilter(matchVersion,
new LowerCaseTokenizer(reader), stopWords);
}
/** Filters LowerCaseTokenizer with StopFilter. */
@ -109,7 +113,8 @@ public final class StopAnalyzer extends Analyzer {
if (streams == null) {
streams = new SavedStreams();
streams.source = new LowerCaseTokenizer(reader);
streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords);
streams.result = new StopFilter(matchVersion,
streams.source, stopWords);
setPreviousTokenStream(streams);
} else
streams.source.reset(reader);
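
With the cached enablePositionIncrements field replaced by matchVersion, a single Version argument now selects both behaviors listed in the class javadoc. A minimal usage sketch (class name and sample text invented):

import java.io.StringReader;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class StopAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    StopAnalyzer analyzer = new StopAnalyzer(Version.LUCENE_31);
    TokenStream ts = analyzer.tokenStream("body",
        new StringReader("The Quick Brown Fox"));
    TermAttribute term = ts.getAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term()); // quick, brown, fox ("the" is stopped)
    }
  }
}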

View File

@ -29,8 +29,16 @@ import org.apache.lucene.util.Version;
/**
* Removes stop words from a token stream.
*
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating StopFilter:
* <ul>
* <li> As of 3.1, StopFilter correctly handles Unicode 4.0
* supplementary characters in stopwords and position
* increments are preserved
* </ul>
*/
public final class StopFilter extends TokenFilter {
private final CharArraySet stopWords;
@ -54,16 +62,46 @@ public final class StopFilter extends TokenFilter {
* @param input Input TokenStream
* @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
* @param ignoreCase if true, all words are lower cased first
* @deprecated use {@link #StopFilter(Version, TokenStream, Set, boolean)} instead
*/
public StopFilter(boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase)
{
this(Version.LUCENE_30, enablePositionIncrements, input, stopWords, ignoreCase);
}
/**
* Construct a token stream filtering the given input. If
* <code>stopWords</code> is an instance of {@link CharArraySet} (true if
* <code>makeStopSet()</code> was used to construct the set) it will be
* directly used and <code>ignoreCase</code> will be ignored since
* <code>CharArraySet</code> directly controls case sensitivity.
* <p/>
* If <code>stopWords</code> is not an instance of {@link CharArraySet}, a new
* CharArraySet will be constructed and <code>ignoreCase</code> will be used
* to specify the case sensitivity of that set.
*
* @param matchVersion
* Lucene version to enable correct Unicode 4.0 behavior in the stop
* set if Version > 3.0. See <a href="#version">above</a> for details.
* @param input
* Input TokenStream
* @param stopWords
* A Set of Strings or char[] or any other toString()-able set
* representing the stopwords
* @param ignoreCase
* if true, all words are lower cased first
*/
public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
{
this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_29), input, stopWords, ignoreCase);
}
/*
* convenience ctor to enable deprecated ctors to set posInc explicitly
*/
private StopFilter(Version matchVersion, boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase){
super(input);
if (stopWords instanceof CharArraySet) {
this.stopWords = (CharArraySet)stopWords;
} else {
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
this.stopWords.addAll(stopWords);
}
this.stopWords = CharArraySet.unmodifiableSet(new CharArraySet(matchVersion, stopWords, ignoreCase));
this.enablePositionIncrements = enablePositionIncrements;
termAtt = addAttribute(TermAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@ -76,10 +114,29 @@ public final class StopFilter extends TokenFilter {
* @param enablePositionIncrements true if token positions should record the removed stop words
* @param in Input stream
* @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
* @see #makeStopSet(java.lang.String[])
* @see #makeStopSet(Version, java.lang.String[])
* @deprecated use {@link #StopFilter(Version, TokenStream, Set)} instead
*/
public StopFilter(boolean enablePositionIncrements, TokenStream in, Set<?> stopWords) {
this(enablePositionIncrements, in, stopWords, false);
this(Version.LUCENE_CURRENT, enablePositionIncrements, in, stopWords, false);
}
/**
* Constructs a filter which removes words from the input TokenStream that are
* named in the Set.
*
* @param matchVersion
* Lucene version to enable correct Unicode 4.0 behavior in the stop
* set if Version > 3.0. See <a href="#version">above</a> for details.
* @param in
* Input stream
* @param stopWords
* A Set of Strings or char[] or any other toString()-able set
* representing the stopwords
* @see #makeStopSet(Version, java.lang.String[])
*/
public StopFilter(Version matchVersion, TokenStream in, Set<?> stopWords) {
this(matchVersion, in, stopWords, false);
}
/**
@ -88,12 +145,27 @@ public final class StopFilter extends TokenFilter {
* This permits this stopWords construction to be cached once when
* an Analyzer is constructed.
*
* @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
* @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
* @deprecated use {@link #makeStopSet(Version, String...)} instead
*/
public static final Set<Object> makeStopSet(String... stopWords) {
return makeStopSet(stopWords, false);
return makeStopSet(Version.LUCENE_30, stopWords, false);
}
/**
* Builds a Set from an array of stop words,
* appropriate for passing into the StopFilter constructor.
* This permits this stopWords construction to be cached once when
* an Analyzer is constructed.
*
* @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
* @param stopWords An array of stopwords
* @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
*/
public static final Set<Object> makeStopSet(Version matchVersion, String... stopWords) {
return makeStopSet(matchVersion, stopWords, false);
}
/**
* Builds a Set from an array of stop words,
* appropriate for passing into the StopFilter constructor.
@ -101,32 +173,72 @@ public final class StopFilter extends TokenFilter {
* an Analyzer is constructed.
* @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
* @return A Set ({@link CharArraySet}) containing the words
* @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
* @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
* @deprecated use {@link #makeStopSet(Version, List)} instead
*/
public static final Set<Object> makeStopSet(List<?> stopWords) {
return makeStopSet(stopWords, false);
return makeStopSet(Version.LUCENE_30, stopWords, false);
}
/**
* Builds a Set from an array of stop words,
* appropriate for passing into the StopFilter constructor.
* This permits this stopWords construction to be cached once when
* an Analyzer is constructed.
*
* @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
* @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
* @return A Set ({@link CharArraySet}) containing the words
* @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
*/
public static final Set<Object> makeStopSet(Version matchVersion, List<?> stopWords) {
return makeStopSet(matchVersion, stopWords, false);
}
/**
* Creates a stopword set from the given stopword array.
* @param stopWords An array of stopwords
* @param ignoreCase If true, all words are lower cased first.
* @return a Set containing the words
* @deprecated use {@link #makeStopSet(Version, String[], boolean)} instead;
*/
public static final Set<Object> makeStopSet(String[] stopWords, boolean ignoreCase) {
return makeStopSet(Version.LUCENE_30, stopWords, ignoreCase);
}
/**
* Creates a stopword set from the given stopword array.
*
* @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
* @param stopWords An array of stopwords
* @param ignoreCase If true, all words are lower cased first.
* @return a Set containing the words
*/
public static final Set<Object> makeStopSet(String[] stopWords, boolean ignoreCase) {
CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
public static final Set<Object> makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
stopSet.addAll(Arrays.asList(stopWords));
return stopSet;
}
/**
* Creates a stopword set from the given stopword list.
* @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
* @param ignoreCase if true, all words are lower cased first
* @return A Set ({@link CharArraySet}) containing the words
* @deprecated use {@link #makeStopSet(Version, List, boolean)} instead
*/
public static final Set<Object> makeStopSet(List<?> stopWords, boolean ignoreCase){
return makeStopSet(Version.LUCENE_30, stopWords, ignoreCase);
}
/**
 * Creates a stopword set from the given stopword list.
* @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
* @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
* @param ignoreCase if true, all words are lower cased first
* @return A Set ({@link CharArraySet}) containing the words
*/
public static final Set<Object> makeStopSet(List<?> stopWords, boolean ignoreCase){
CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase);
public static final Set<Object> makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase){
CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
stopSet.addAll(stopWords);
return stopSet;
}
@ -157,13 +269,14 @@ public final class StopFilter extends TokenFilter {
* StopFilter use this method when creating the
* StopFilter. Prior to 2.9, this returns false. On 2.9
* or later, it returns true.
* @deprecated use {@link #StopFilter(Version, TokenStream, Set)} instead
*/
public static boolean getEnablePositionIncrementsVersionDefault(Version matchVersion) {
return matchVersion.onOrAfter(Version.LUCENE_29);
}
/**
* @see #setEnablePositionIncrements(boolean).
* @see #setEnablePositionIncrements(boolean)
*/
public boolean getEnablePositionIncrements() {
return enablePositionIncrements;
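
The deprecated boolean-first constructors and the String[]/List makeStopSet overloads now all funnel into the Version-aware variants. A compact sketch of the new entry points (class name and sample words invented):

import java.io.StringReader;
import java.util.Set;

import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class StopFilterDemo {
  public static void main(String[] args) throws Exception {
    Set<Object> stopSet = StopFilter.makeStopSet(Version.LUCENE_31, "is", "the");
    StopFilter stream = new StopFilter(Version.LUCENE_31,
        new WhitespaceTokenizer(new StringReader("now is the time")), stopSet);
    TermAttribute term = stream.getAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncr =
        stream.getAttribute(PositionIncrementAttribute.class);
    while (stream.incrementToken()) {
      // Version >= 2.9 records the removed stop words in the increment:
      // expected output is now/1 and time/3.
      System.out.println(term.term() + "/" + posIncr.getPositionIncrement());
    }
  }
}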

View File

@ -34,6 +34,8 @@ import java.util.Set;
* <p>You must specify the required {@link Version}
* compatibility when creating StandardAnalyzer:
* <ul>
* <li> As of 3.1, StopFilter correctly handles Unicode 4.0
* supplementary characters in stopwords
* <li> As of 2.9, StopFilter preserves position
* increments
* <li> As of 2.4, Tokens incorrectly identified as acronyms
@ -47,7 +49,7 @@ public class StandardAnalyzer extends Analyzer {
* Specifies whether deprecated acronyms should be replaced with HOST type.
 * See <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>
*/
private final boolean replaceInvalidAcronym,enableStopPositionIncrements;
private final boolean replaceInvalidAcronym;
/** An unmodifiable set containing some common English words that are usually not
useful for searching. */
@ -70,7 +72,6 @@ public class StandardAnalyzer extends Analyzer {
public StandardAnalyzer(Version matchVersion, Set<?> stopWords) {
stopSet = stopWords;
setOverridesTokenStreamMethod(StandardAnalyzer.class);
enableStopPositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
this.matchVersion = matchVersion;
}
@ -101,7 +102,7 @@ public class StandardAnalyzer extends Analyzer {
tokenStream.setMaxTokenLength(maxTokenLength);
TokenStream result = new StandardFilter(tokenStream);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(enableStopPositionIncrements, result, stopSet);
result = new StopFilter(matchVersion, result, stopSet);
return result;
}
@ -148,8 +149,7 @@ public class StandardAnalyzer extends Analyzer {
streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
streams.filteredTokenStream = new LowerCaseFilter(matchVersion,
streams.filteredTokenStream);
streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements,
streams.filteredTokenStream, stopSet);
streams.filteredTokenStream = new StopFilter(matchVersion, streams.filteredTokenStream, stopSet);
} else {
streams.tokenStream.reset(reader);
}
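
For StandardAnalyzer the change is the same one-liner: the cached enableStopPositionIncrements flag disappears and matchVersion flows into the StopFilter directly, so one Version argument drives acronym handling (2.4+), stop-word position increments (2.9+) and Unicode 4.0 stop sets (3.1+). A usage sketch (class name and sample text invented):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

public class StandardAnalyzerDemo {
  public static void main(String[] args) {
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    TokenStream ts = analyzer.tokenStream("body",
        new StringReader("The U.S.A. team is hiring"));
    // consume ts via incrementToken() as usual
  }
}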

View File

@ -35,7 +35,7 @@ public abstract class CharacterUtils {
* @return a {@link CharacterUtils} implementation according to the given
* {@link Version} instance.
*/
public static CharacterUtils getInstance(Version matchVersion) {
public static CharacterUtils getInstance(final Version matchVersion) {
return matchVersion.onOrAfter(Version.LUCENE_31) ? JAVA_5 : JAVA_4;
}
@ -58,7 +58,7 @@ public abstract class CharacterUtils {
* - if the value offset is negative or not less than the length of
* the char array.
*/
public abstract int codePointAt(char[] chars, int offset);
public abstract int codePointAt(final char[] chars, final int offset);
/**
* Returns the code point at the given index of the {@link CharSequence}.
@ -79,21 +79,52 @@ public abstract class CharacterUtils {
* - if the value offset is negative or not less than the length of
* the character sequence.
*/
public abstract int codePointAt(CharSequence seq, int offset);
public abstract int codePointAt(final CharSequence seq, final int offset);
/**
* Returns the code point at the given index of the char array where only elements
* with index less than the limit are used.
* Depending on the {@link Version} passed to
* {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
 * of {@link Character#codePointAt(char[], int, int)} as it would have been
* available on a Java 1.4 JVM or on a later virtual machine version.
*
* @param chars
* a character array
* @param offset
* the offset to the char values in the chars array to be converted
 * @param limit the index after the last element that should be used to calculate
 *        the code point.
*
* @return the Unicode code point at the given index
* @throws NullPointerException
* - if the array is null.
* @throws IndexOutOfBoundsException
 *           - if the value offset is negative or not less than the given limit.
*/
public abstract int codePointAt(final char[] chars, final int offset, final int limit);
private static final class Java5CharacterUtils extends CharacterUtils {
Java5CharacterUtils() {
}
@Override
public final int codePointAt(char[] chars, int offset) {
public final int codePointAt(final char[] chars, final int offset) {
return Character.codePointAt(chars, offset);
}
@Override
public int codePointAt(CharSequence seq, int offset) {
public int codePointAt(final CharSequence seq, final int offset) {
return Character.codePointAt(seq, offset);
}
@Override
public int codePointAt(final char[] chars, final int offset, final int limit) {
return Character.codePointAt(chars, offset, limit);
}
}
private static final class Java4CharacterUtils extends CharacterUtils {
@ -101,14 +132,22 @@ public abstract class CharacterUtils {
}
@Override
public final int codePointAt(char[] chars, int offset) {
public final int codePointAt(final char[] chars, final int offset) {
return chars[offset];
}
@Override
public int codePointAt(CharSequence seq, int offset) {
public int codePointAt(final CharSequence seq, final int offset) {
return seq.charAt(offset);
}
@Override
public int codePointAt(final char[] chars, final int offset, final int limit) {
if(offset >= limit)
throw new IndexOutOfBoundsException("offset must be less than limit");
return chars[offset];
}
}
}
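
The new limit-aware codePointAt keeps CharArraySet from reading past a token's length when a high surrogate is the last char in range. A small sketch of the three variants (class name invented; the printed values follow from the code-point arithmetic):

import org.apache.lucene.util.CharacterUtils;
import org.apache.lucene.util.Version;

public class CharacterUtilsDemo {
  public static void main(String[] args) {
    char[] chars = "a\ud801\udc1cb".toCharArray(); // 'a', surrogate pair, 'b'
    CharacterUtils java5 = CharacterUtils.getInstance(Version.LUCENE_31);
    CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30);
    // Java 5 semantics: the pair at index 1 combines into one code point.
    System.out.println(java5.codePointAt(chars, 1)); // 66588 (U+1041C)
    // Java 4 semantics: every char is its own "code point".
    System.out.println(java4.codePointAt(chars, 1)); // 55297 (the high surrogate)
    // With limit == 2 the low surrogate is out of range, so even the Java 5
    // implementation must not combine the pair.
    System.out.println(java5.codePointAt(chars, 1, 2)); // 55297
  }
}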

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis;
import java.util.Arrays;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
public class TestCharArraySet extends LuceneTestCase {
@ -33,7 +34,7 @@ public class TestCharArraySet extends LuceneTestCase {
public void testRehash() throws Exception {
CharArraySet cas = new CharArraySet(0, true);
CharArraySet cas = new CharArraySet(Version.LUCENE_CURRENT, 0, true);
for(int i=0;i<TEST_STOP_WORDS.length;i++)
cas.add(TEST_STOP_WORDS[i]);
assertEquals(TEST_STOP_WORDS.length, cas.size());
@ -44,7 +45,7 @@ public class TestCharArraySet extends LuceneTestCase {
public void testNonZeroOffset() {
String[] words={"Hello","World","this","is","a","test"};
char[] findme="xthisy".toCharArray();
CharArraySet set=new CharArraySet(10,true);
CharArraySet set=new CharArraySet(Version.LUCENE_CURRENT, 10,true);
set.addAll(Arrays.asList(words));
assertTrue(set.contains(findme, 1, 4));
assertTrue(set.contains(new String(findme,1,4)));
@ -56,7 +57,7 @@ public class TestCharArraySet extends LuceneTestCase {
}
public void testObjectContains() {
CharArraySet set = new CharArraySet(10, true);
CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 10, true);
Integer val = Integer.valueOf(1);
set.add(val);
assertTrue(set.contains(val));
@ -68,7 +69,7 @@ public class TestCharArraySet extends LuceneTestCase {
}
public void testClear(){
CharArraySet set=new CharArraySet(10,true);
CharArraySet set=new CharArraySet(Version.LUCENE_CURRENT, 10,true);
set.addAll(Arrays.asList(TEST_STOP_WORDS));
assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
try{
@ -81,7 +82,7 @@ public class TestCharArraySet extends LuceneTestCase {
}
public void testModifyOnUnmodifiable(){
CharArraySet set=new CharArraySet(10,true);
CharArraySet set=new CharArraySet(Version.LUCENE_CURRENT, 10,true);
set.addAll(Arrays.asList(TEST_STOP_WORDS));
final int size = set.size();
set = CharArraySet.unmodifiableSet(set);
@ -162,7 +163,7 @@ public class TestCharArraySet extends LuceneTestCase {
}
public void testUnmodifiableSet(){
CharArraySet set=new CharArraySet(10,true);
CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 10,true);
set.addAll(Arrays.asList(TEST_STOP_WORDS));
final int size = set.size();
set = CharArraySet.unmodifiableSet(set);
@ -175,4 +176,129 @@ public class TestCharArraySet extends LuceneTestCase {
// expected
}
}
public void testSupplementaryChars() {
String missing = "Term %s is missing in the set";
String falsePos = "Term %s is in the set but shouldn't be";
// for reference see
// http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
String[] upperArr = new String[] {"Abc\ud801\udc1c",
"\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
String[] lowerArr = new String[] {"abc\ud801\udc44",
"\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
CharArraySet set = new CharArraySet(Version.LUCENE_31, Arrays.asList(TEST_STOP_WORDS), true);
for (String upper : upperArr) {
set.add(upper);
}
for (int i = 0; i < upperArr.length; i++) {
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
assertTrue(String.format(missing, lowerArr[i]), set.contains(lowerArr[i]));
}
set = new CharArraySet(Version.LUCENE_31, Arrays.asList(TEST_STOP_WORDS), false);
for (String upper : upperArr) {
set.add(upper);
}
for (int i = 0; i < upperArr.length; i++) {
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
}
}
public void testSingleHighSurrogate() {
String missing = "Term %s is missing in the set";
String falsePos = "Term %s is in the set but shouldn't be";
String[] upperArr = new String[] { "ABC\uD800", "ABC\uD800EfG",
"\uD800EfG", "\uD800\ud801\udc1cB" };
String[] lowerArr = new String[] { "abc\uD800", "abc\uD800efg",
"\uD800efg", "\uD800\ud801\udc44b" };
CharArraySet set = new CharArraySet(Version.LUCENE_31, Arrays
.asList(TEST_STOP_WORDS), true);
for (String upper : upperArr) {
set.add(upper);
}
for (int i = 0; i < upperArr.length; i++) {
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
assertTrue(String.format(missing, lowerArr[i]), set.contains(lowerArr[i]));
}
set = new CharArraySet(Version.LUCENE_31, Arrays.asList(TEST_STOP_WORDS),
false);
for (String upper : upperArr) {
set.add(upper);
}
for (int i = 0; i < upperArr.length; i++) {
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
assertFalse(String.format(falsePos, upperArr[i]), set
.contains(lowerArr[i]));
}
}
/**
* @deprecated remove this test when lucene 3.0 "broken unicode 4" support is
* no longer needed.
*/
public void testSupplementaryCharsBWCompat() {
String missing = "Term %s is missing in the set";
String falsePos = "Term %s is in the set but shouldn't be";
// for reference see
// http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
String[] upperArr = new String[] {"Abc\ud801\udc1c",
"\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
String[] lowerArr = new String[] {"abc\ud801\udc44",
"\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
CharArraySet set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS), true);
for (String upper : upperArr) {
set.add(upper);
}
for (int i = 0; i < upperArr.length; i++) {
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
}
set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS), false);
for (String upper : upperArr) {
set.add(upper);
}
for (int i = 0; i < upperArr.length; i++) {
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
}
}
/**
* @deprecated remove this test when lucene 3.0 "broken unicode 4" support is
* no longer needed.
*/
public void testSingleHighSurrogateBWCompat() {
String missing = "Term %s is missing in the set";
String falsePos = "Term %s is in the set but shouldn't be";
String[] upperArr = new String[] { "ABC\uD800", "ABC\uD800EfG",
"\uD800EfG", "\uD800\ud801\udc1cB" };
String[] lowerArr = new String[] { "abc\uD800", "abc\uD800efg",
"\uD800efg", "\uD800\ud801\udc44b" };
CharArraySet set = new CharArraySet(Version.LUCENE_30, Arrays
.asList(TEST_STOP_WORDS), true);
for (String upper : upperArr) {
set.add(upper);
}
for (int i = 0; i < upperArr.length; i++) {
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
if (i == lowerArr.length - 1)
assertFalse(String.format(falsePos, lowerArr[i]), set
.contains(lowerArr[i]));
else
assertTrue(String.format(missing, lowerArr[i]), set
.contains(lowerArr[i]));
}
set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS),
false);
for (String upper : upperArr) {
set.add(upper);
}
for (int i = 0; i < upperArr.length; i++) {
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
assertFalse(String.format(falsePos, lowerArr[i]), set
.contains(lowerArr[i]));
}
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.English;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.StringReader;
@ -37,7 +38,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
public void testExactCase() throws IOException {
StringReader reader = new StringReader("Now is The Time");
Set<String> stopWords = new HashSet(Arrays.asList("is", "the", "Time"));
TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopWords, false);
TokenStream stream = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopWords, false);
final TermAttribute termAtt = stream.getAttribute(TermAttribute.class);
assertTrue(stream.incrementToken());
assertEquals("Now", termAtt.term());
@ -49,7 +50,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
public void testIgnoreCase() throws IOException {
StringReader reader = new StringReader("Now is The Time");
Set<String> stopWords = new HashSet(Arrays.asList( "is", "the", "Time" ));
TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopWords, true);
TokenStream stream = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopWords, true);
final TermAttribute termAtt = stream.getAttribute(TermAttribute.class);
assertTrue(stream.incrementToken());
assertEquals("Now", termAtt.term());
@ -59,8 +60,8 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
public void testStopFilt() throws IOException {
StringReader reader = new StringReader("Now is The Time");
String[] stopWords = new String[] { "is", "the", "Time" };
Set stopSet = StopFilter.makeStopSet(stopWords);
TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet);
Set stopSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords);
TokenStream stream = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopSet);
final TermAttribute termAtt = stream.getAttribute(TermAttribute.class);
assertTrue(stream.incrementToken());
assertEquals("Now", termAtt.term());
@ -83,14 +84,14 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
log(sb.toString());
String stopWords[] = (String[]) a.toArray(new String[0]);
for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]);
Set stopSet = StopFilter.makeStopSet(stopWords);
Set stopSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords);
// with increments
StringReader reader = new StringReader(sb.toString());
StopFilter stpf = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet);
StopFilter stpf = new StopFilter(Version.LUCENE_24, new WhitespaceTokenizer(reader), stopSet);
doTestStopPositons(stpf,true);
// without increments
reader = new StringReader(sb.toString());
stpf = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet);
stpf = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopSet);
doTestStopPositons(stpf,false);
// with increments, concatenating two stop filters
ArrayList a0 = new ArrayList();
@ -106,12 +107,12 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]);
String stopWords1[] = (String[]) a1.toArray(new String[0]);
for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]);
Set stopSet0 = StopFilter.makeStopSet(stopWords0);
Set stopSet1 = StopFilter.makeStopSet(stopWords1);
Set stopSet0 = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords0);
Set stopSet1 = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords1);
reader = new StringReader(sb.toString());
StopFilter stpf0 = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet0); // first part of the set
StopFilter stpf0 = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopSet0); // first part of the set
stpf0.setEnablePositionIncrements(true);
StopFilter stpf01 = new StopFilter(false, stpf0, stopSet1); // two stop filters concatenated!
StopFilter stpf01 = new StopFilter(Version.LUCENE_CURRENT, stpf0, stopSet1); // two stop filters concatenated!
doTestStopPositons(stpf01,true);
}
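
The tests above rely on the Version-derived default that the private StopFilter constructor computes via matchVersion.onOrAfter(Version.LUCENE_29). A tiny sketch of that default (class name invented):

import java.io.StringReader;
import java.util.Set;

import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

public class PosIncrDefaultDemo {
  public static void main(String[] args) {
    Set<Object> stopSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the");
    StopFilter pre29 = new StopFilter(Version.LUCENE_24,
        new WhitespaceTokenizer(new StringReader("")), stopSet);
    StopFilter post29 = new StopFilter(Version.LUCENE_CURRENT,
        new WhitespaceTokenizer(new StringReader("")), stopSet);
    System.out.println(pre29.getEnablePositionIncrements());  // false
    System.out.println(post29.getEnablePositionIncrements()); // true
    pre29.setEnablePositionIncrements(true); // the setter still overrides
  }
}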

View File

@ -956,7 +956,7 @@ public class TestQueryParser extends LocalizedTestCase {
}
public void testStopwords() throws Exception {
QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet("the", "foo")));
QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "foo")));
Query result = qp.parse("a:the OR a:foo");
assertNotNull("result is null and it shouldn't be", result);
assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
@ -972,7 +972,7 @@ public class TestQueryParser extends LocalizedTestCase {
}
public void testPositionIncrement() throws Exception {
QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet("the", "in", "are", "this")));
QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "in", "are", "this")));
qp.setEnablePositionIncrements(true);
String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
// 0 2 5 7 8

View File

@ -22,7 +22,6 @@ import java.io.IOException;
import java.io.StringReader;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
@ -232,7 +231,8 @@ public class TestPositionIncrement extends LuceneTestCase {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream ts = a.tokenStream(fieldName,reader);
return new StopFilter(enablePositionIncrements, ts, new CharArraySet(Collections.singleton("stop"), true));
return new StopFilter(enablePositionIncrements ? Version.LUCENE_CURRENT : Version.LUCENE_24, ts,
new CharArraySet(Version.LUCENE_CURRENT, Collections.singleton("stop"), true));
}
}
@ -275,12 +275,12 @@ public class TestPositionIncrement extends LuceneTestCase {
Spans pspans = snq.getSpans(is.getIndexReader());
while (pspans.next()) {
//System.out.println(pspans.doc() + " - " + pspans.start() + " - "+ pspans.end());
Collection payloads = pspans.getPayload();
Collection<byte[]> payloads = pspans.getPayload();
sawZero |= pspans.start() == 0;
for (Iterator it = payloads.iterator(); it.hasNext();) {
for (@SuppressWarnings("unused") byte[] bytes : payloads) {
count++;
it.next();
//System.out.println(new String((byte[]) it.next()));
//System.out.println(new String(bytes));
}
}
assertEquals(5, count);
@ -302,10 +302,10 @@ public class TestPositionIncrement extends LuceneTestCase {
sawZero = false;
PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader());
Collection pls = psu.getPayloadsForQuery(snq);
Collection<byte[]> pls = psu.getPayloadsForQuery(snq);
count = pls.size();
for (Iterator it = pls.iterator(); it.hasNext();) {
String s = new String((byte[]) it.next());
for (byte[] bytes : pls) {
String s = new String(bytes);
//System.out.println(s);
sawZero |= s.equals("pos: 0");
}