mirror of https://github.com/apache/lucene.git

LUCENE-2094: Prepare CharArraySet for Unicode 4.0

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@885592 13f79535-47bb-0310-9956-ffa450edef68

commit 9edfb3b66a (parent c155a0c477)
@@ -25,6 +25,13 @@ Bug fixes

 New features

+* LUCENE-2069: Added Unicode 4 support to CharArraySet. Due to the switch
+  to Java 5, supplementary characters are now lowercased correctly if the
+  set is created as case insensitive.
+  CharArraySet now requires a Version argument to preserve
+  backwards compatibility. If Version < 3.1 is passed to the constructor,
+  CharArraySet yields the old behavior. (Simon Willnauer)
+
 * LUCENE-2069: Added Unicode 4 support to LowerCaseFilter. Due to the switch
   to Java 5, supplementary characters are now lowercased correctly.
   LowerCaseFilter now requires a Version argument to preserve
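An illustrative sketch (not part of the patch) of the behavior this entry describes, assuming the 3.1 constants and the CharArraySet constructor this commit introduces; U+10400/U+10428 are a Deseret upper/lowercase pair used purely as an example:

    import java.util.Arrays;
    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.util.Version;

    public class CharArraySetUnicodeDemo {
      public static void main(String[] args) {
        // U+10400 DESERET CAPITAL LONG I; its lowercase form is U+10428.
        String upper = "\uD801\uDC00";
        String lower = "\uD801\uDC28";

        // Version >= 3.1: supplementary characters are lowercased by code point.
        CharArraySet set31 = new CharArraySet(Version.LUCENE_31,
            Arrays.asList(upper), true /* ignoreCase */);
        System.out.println(set31.contains(lower)); // expected per the entry: true

        // Version < 3.1 keeps the old per-char behavior for back compat.
        CharArraySet set30 = new CharArraySet(Version.LUCENE_30,
            Arrays.asList(upper), true);
        System.out.println(set30.contains(lower)); // expected per the entry: false
      }
    }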
@@ -129,7 +129,7 @@ public final class ArabicAnalyzer extends Analyzer {
    * a stopword set
    */
   public ArabicAnalyzer(Version matchVersion, Set<?> stopwords){
-    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
+    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
     this.matchVersion = matchVersion;
   }
@@ -138,7 +138,7 @@ public final class ArabicAnalyzer extends Analyzer {
    * @deprecated use {@link #ArabicAnalyzer(Version, Set)} instead
    */
   public ArabicAnalyzer( Version matchVersion, String... stopwords ) {
-    this(matchVersion, StopFilter.makeStopSet( stopwords ));
+    this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords ));
   }

   /**
@@ -170,8 +170,7 @@ public final class ArabicAnalyzer extends Analyzer {
     TokenStream result = new ArabicLetterTokenizer( reader );
     result = new LowerCaseFilter(matchVersion, result);
     // the order here is important: the stopword list is not normalized!
-    result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                             result, stoptable );
+    result = new StopFilter( matchVersion, result, stoptable );
     result = new ArabicNormalizationFilter( result );
     result = new ArabicStemFilter( result );
@@ -200,8 +199,7 @@ public final class ArabicAnalyzer extends Analyzer {
       streams.source = new ArabicLetterTokenizer(reader);
       streams.result = new LowerCaseFilter(matchVersion, streams.source);
       // the order here is important: the stopword list is not normalized!
-      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                                      streams.result, stoptable);
+      streams.result = new StopFilter( matchVersion, streams.result, stoptable);
       streams.result = new ArabicNormalizationFilter(streams.result);
       streams.result = new ArabicStemFilter(streams.result);
       setPreviousTokenStream(streams);
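The recurring mechanical change across every analyzer below: the StopFilter constructor that took the enable-position-increments flag (derived via getEnablePositionIncrementsVersionDefault) is replaced by one taking the match Version directly. A minimal before/after sketch (illustrative, not from the patch; the tokenizer and stop set are placeholders):

    import java.io.StringReader;
    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;

    class StopFilterMigration {
      static TokenStream filter(Version matchVersion, CharArraySet stopSet) {
        TokenStream tokens = new WhitespaceTokenizer(new StringReader("a test"));
        // Old (deprecated) form, as removed throughout this commit:
        //   new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
        //                  tokens, stopSet);
        // New form: the Version argument selects the right defaults internally.
        return new StopFilter(matchVersion, tokens, stopSet);
      }
    }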
@@ -87,8 +87,8 @@ public final class BrazilianAnalyzer extends Analyzer {

   private static class DefaultSetHolder {
     static final Set<?> DEFAULT_STOP_SET = CharArraySet
-        .unmodifiableSet(new CharArraySet(Arrays.asList(BRAZILIAN_STOP_WORDS),
-            false));
+        .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
+            Arrays.asList(BRAZILIAN_STOP_WORDS), false));
   }

   /**
@@ -120,7 +120,7 @@ public final class BrazilianAnalyzer extends Analyzer {
    * a stopword set
    */
   public BrazilianAnalyzer(Version matchVersion, Set<?> stopwords) {
-    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
+    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
     this.matchVersion = matchVersion;
   }
@@ -136,7 +136,7 @@ public final class BrazilianAnalyzer extends Analyzer {
       Set<?> stemExclusionSet) {
     this(matchVersion, stopwords);
     excltable = CharArraySet.unmodifiableSet(CharArraySet
-        .copy(stemExclusionSet));
+        .copy(matchVersion, stemExclusionSet));
   }

   /**
@@ -144,7 +144,7 @@ public final class BrazilianAnalyzer extends Analyzer {
    * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
    */
   public BrazilianAnalyzer(Version matchVersion, String... stopwords) {
-    this(matchVersion, StopFilter.makeStopSet(stopwords));
+    this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
   }

   /**
@@ -169,7 +169,7 @@ public final class BrazilianAnalyzer extends Analyzer {
    * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
    */
   public void setStemExclusionTable( String... exclusionlist ) {
-    excltable = StopFilter.makeStopSet( exclusionlist );
+    excltable = StopFilter.makeStopSet( matchVersion, exclusionlist );
     setPreviousTokenStream(null); // force a new stemmer to be created
   }
   /**
@@ -201,8 +201,7 @@ public final class BrazilianAnalyzer extends Analyzer {
     TokenStream result = new StandardTokenizer( matchVersion, reader );
     result = new LowerCaseFilter( matchVersion, result );
     result = new StandardFilter( result );
-    result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                             result, stoptable );
+    result = new StopFilter( matchVersion, result, stoptable );
     result = new BrazilianStemFilter( result, excltable );
     return result;
   }
@@ -229,8 +228,7 @@ public final class BrazilianAnalyzer extends Analyzer {
       streams.source = new StandardTokenizer(matchVersion, reader);
       streams.result = new LowerCaseFilter(matchVersion, streams.source);
       streams.result = new StandardFilter(streams.result);
-      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                                      streams.result, stoptable);
+      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
       streams.result = new BrazilianStemFilter(streams.result, excltable);
       setPreviousTokenStream(streams);
     } else {
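Every analyzer constructor in this patch follows the same pattern, so one hypothetical usage sketch covers them all (illustrative only; Version.LUCENE_31 and the stopwords are assumptions, not from the patch):

    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.br.BrazilianAnalyzer;
    import org.apache.lucene.util.Version;

    class AnalyzerSetup {
      public static void main(String[] args) {
        // makeStopSet now also takes the Version, so the resulting set applies
        // the same Unicode 4.0 rules as the analyzer that consumes it.
        BrazilianAnalyzer analyzer = new BrazilianAnalyzer(Version.LUCENE_31,
            StopFilter.makeStopSet(Version.LUCENE_31, "um", "uma")); // toy stopwords
      }
    }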
@@ -68,7 +68,7 @@ public class CJKAnalyzer extends Analyzer {

   private static class DefaultSetHolder {
     static final Set<?> DEFAULT_STOP_SET = CharArraySet
-        .unmodifiableSet(new CharArraySet(Arrays.asList(STOP_WORDS),
+        .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS),
             false));
   }
   /**
@@ -95,7 +95,7 @@ public class CJKAnalyzer extends Analyzer {
    * a stopword set
    */
   public CJKAnalyzer(Version matchVersion, Set<?> stopwords){
-    stopTable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
+    stopTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
     this.matchVersion = matchVersion;
   }
@@ -106,7 +106,7 @@ public class CJKAnalyzer extends Analyzer {
    * @deprecated use {@link #CJKAnalyzer(Version, Set)} instead
    */
   public CJKAnalyzer(Version matchVersion, String... stopWords) {
-    stopTable = StopFilter.makeStopSet(stopWords);
+    stopTable = StopFilter.makeStopSet(matchVersion, stopWords);
     this.matchVersion = matchVersion;
   }
@@ -122,8 +122,7 @@ public class CJKAnalyzer extends Analyzer {
    */
   @Override
   public final TokenStream tokenStream(String fieldName, Reader reader) {
-    return new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                          new CJKTokenizer(reader), stopTable);
+    return new StopFilter(matchVersion, new CJKTokenizer(reader), stopTable);
   }

   private class SavedStreams {
@@ -147,8 +146,7 @@ public class CJKAnalyzer extends Analyzer {
     if (streams == null) {
       streams = new SavedStreams();
       streams.source = new CJKTokenizer(reader);
-      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                                      streams.source, stopTable);
+      streams.result = new StopFilter(matchVersion, streams.source, stopTable);
       setPreviousTokenStream(streams);
     } else {
       streams.source.reset(reader);
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;

 /**
  * A {@link TokenFilter} with a stop word table.
@@ -63,7 +64,7 @@ public final class ChineseFilter extends TokenFilter {
     public ChineseFilter(TokenStream in) {
         super(in);

-        stopTable = new CharArraySet(Arrays.asList(STOP_WORDS), false);
+        stopTable = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS), false);
         termAtt = addAttribute(TermAttribute.class);
     }
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.compound;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.Set;

@@ -34,9 +33,18 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;

 /**
- * Base class for decomposition token filters.
+ * Base class for decomposition token filters. <a name="version"/>
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * CompoundWordTokenFilterBase:
+ * <ul>
+ * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ * supplementary characters in strings and char arrays provided as compound word
+ * dictionaries.
+ * </ul>
  */
 public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   /**
@@ -55,7 +63,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;

   protected final CharArraySet dictionary;
-  protected final LinkedList tokens;
+  protected final LinkedList<Token> tokens;
   protected final int minWordSize;
   protected final int minSubwordSize;
   protected final int maxSubwordSize;
@@ -69,31 +77,72 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   private PayloadAttribute payloadAtt;

   private final Token wrapper = new Token();

+  /**
+   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], int, int, int, boolean)} instead
+   */
   protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    this(input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
+    this(Version.LUCENE_30, input, makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
   }

+  /**
+   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], boolean)} instead
+   */
   protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
-    this(input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+    this(Version.LUCENE_30, input, makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
   }

-  protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, boolean onlyLongestMatch) {
-    this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+  /**
+   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, boolean)} instead
+   */
+  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
+    this(Version.LUCENE_30, input, dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
   }

+  /**
+   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[])} instead
+   */
   protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
-    this(input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+    this(Version.LUCENE_30, input, makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
   }

-  protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary) {
-    this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+  /**
+   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set)} instead
+   */
+  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary) {
+    this(Version.LUCENE_30, input, dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
   }

+  /**
+   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, int, int, int, boolean)} instead
+   */
+  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+    this(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+  }

+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+    this(matchVersion, input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
+  }

+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
+    this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+  }

-  protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary, boolean onlyLongestMatch) {
+    this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+  }

+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
+    this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+  }

+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary) {
+    this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+  }

+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(input);

-    this.tokens=new LinkedList();
+    this.tokens=new LinkedList<Token>();
     this.minWordSize=minWordSize;
     this.minSubwordSize=minSubwordSize;
     this.maxSubwordSize=maxSubwordSize;
@@ -102,7 +151,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
     if (dictionary instanceof CharArraySet) {
       this.dictionary = (CharArraySet) dictionary;
     } else {
-      this.dictionary = new CharArraySet(dictionary.size(), false);
+      this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
       addAllLowerCase(this.dictionary, dictionary);
     }

@@ -121,9 +170,13 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
    * @param dictionary
    * @return {@link Set} of lowercased terms
    */
-  public static final Set makeDictionary(final String[] dictionary) {
+  public static final Set<?> makeDictionary(final String[] dictionary) {
+    return makeDictionary(Version.LUCENE_30, dictionary);
+  }
+
+  public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
     // is the below really case insensitive?
-    CharArraySet dict = new CharArraySet(dictionary.length, false);
+    CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
     addAllLowerCase(dict, Arrays.asList(dictionary));
     return dict;
   }
@@ -140,11 +193,11 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   @Override
   public final boolean incrementToken() throws IOException {
     if (tokens.size() > 0) {
-      setToken((Token)tokens.removeFirst());
+      setToken(tokens.removeFirst());
       return true;
     }

-    if (input.incrementToken() == false)
+    if (!input.incrementToken())
       return false;

     wrapper.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
@@ -158,18 +211,16 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
     decompose(wrapper);

     if (tokens.size() > 0) {
-      setToken((Token)tokens.removeFirst());
+      setToken(tokens.removeFirst());
       return true;
     } else {
       return false;
     }
   }

-  protected static final void addAllLowerCase(Set target, Collection col) {
-    Iterator iter=col.iterator();
-
-    while (iter.hasNext()) {
-      target.add(((String)iter.next()).toLowerCase());
+  protected static final void addAllLowerCase(Set<Object> target, Collection<String> col) {
+    for (String string : col) {
+      target.add(string.toLowerCase());
     }
   }
@@ -23,6 +23,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter; // for javadocs
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Version;

 /**
  * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
@@ -33,7 +34,9 @@ import org.apache.lucene.analysis.TokenStream;
  * </p>
  */
 public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
+
   /**
    * Creates a new {@link DictionaryCompoundWordTokenFilter}
+   *
    * @param input the {@link TokenStream} to process
    * @param dictionary the word dictionary to match against
@@ -41,33 +44,39 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
    * @param minSubwordSize only subwords longer than this get to the output stream
    * @param maxSubwordSize only subwords shorter than this get to the output stream
    * @param onlyLongestMatch Add only the longest matching subword to the stream
+   * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[], int, int, int, boolean)} instead
    */
   public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary,
       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+    super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
   }

   /**
    *
    * Creates a new {@link DictionaryCompoundWordTokenFilter}
    *
    * @param input the {@link TokenStream} to process
    * @param dictionary the word dictionary to match against
+   * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[])} instead
    */
   public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary) {
-    super(input, dictionary);
+    super(Version.LUCENE_30, input, dictionary);
   }

   /**
    *
    * Creates a new {@link DictionaryCompoundWordTokenFilter}
    *
    * @param input the {@link TokenStream} to process
    * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
-   *        lower case strings. 
+   *        lower case strings.
+   * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set)} instead
    */
   public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary) {
-    super(input, dictionary);
+    super(Version.LUCENE_30, input, dictionary);
   }

   /**
    *
    * Creates a new {@link DictionaryCompoundWordTokenFilter}
    *
    * @param input the {@link TokenStream} to process
    * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
    *        lower case strings.
@@ -75,10 +84,104 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
    * @param minSubwordSize only subwords longer than this get to the output stream
    * @param maxSubwordSize only subwords shorter than this get to the output stream
    * @param onlyLongestMatch Add only the longest matching subword to the stream
+   * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set, int, int, int, boolean)} instead
    */
   public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary,
       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+    super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+  }
+
+  /**
+   * Creates a new {@link DictionaryCompoundWordTokenFilter}
+   *
+   * @param matchVersion
+   *          Lucene version to enable correct Unicode 4.0 behavior in the
+   *          dictionaries if Version > 3.0. See <a
+   *          href="CompoundWordTokenFilterBase#version"
+   *          >CompoundWordTokenFilterBase</a> for details.
+   * @param input
+   *          the {@link TokenStream} to process
+   * @param dictionary
+   *          the word dictionary to match against
+   * @param minWordSize
+   *          only words longer than this get processed
+   * @param minSubwordSize
+   *          only subwords longer than this get to the output stream
+   * @param maxSubwordSize
+   *          only subwords shorter than this get to the output stream
+   * @param onlyLongestMatch
+   *          Add only the longest matching subword to the stream
+   */
+  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary,
+      int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+    super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+  }
+
+  /**
+   * Creates a new {@link DictionaryCompoundWordTokenFilter}
+   *
+   * @param matchVersion
+   *          Lucene version to enable correct Unicode 4.0 behavior in the
+   *          dictionaries if Version > 3.0. See <a
+   *          href="CompoundWordTokenFilterBase#version"
+   *          >CompoundWordTokenFilterBase</a> for details.
+   *
+   * @param input
+   *          the {@link TokenStream} to process
+   * @param dictionary
+   *          the word dictionary to match against
+   */
+  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) {
+    super(matchVersion, input, dictionary);
+  }
+
+  /**
+   * Creates a new {@link DictionaryCompoundWordTokenFilter}
+   *
+   * @param matchVersion
+   *          Lucene version to enable correct Unicode 4.0 behavior in the
+   *          dictionaries if Version > 3.0. See <a
+   *          href="CompoundWordTokenFilterBase#version"
+   *          >CompoundWordTokenFilterBase</a> for details.
+   * @param input
+   *          the {@link TokenStream} to process
+   * @param dictionary
+   *          the word dictionary to match against. If this is a
+   *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
+   *          must have set ignoreCase=false and only contain lower case
+   *          strings.
+   */
+  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary) {
+    super(matchVersion, input, dictionary);
+  }
+
+  /**
+   * Creates a new {@link DictionaryCompoundWordTokenFilter}
+   *
+   * @param matchVersion
+   *          Lucene version to enable correct Unicode 4.0 behavior in the
+   *          dictionaries if Version > 3.0. See <a
+   *          href="CompoundWordTokenFilterBase#version"
+   *          >CompoundWordTokenFilterBase</a> for details.
+   * @param input
+   *          the {@link TokenStream} to process
+   * @param dictionary
+   *          the word dictionary to match against. If this is a
+   *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
+   *          must have set ignoreCase=false and only contain lower case
+   *          strings.
+   * @param minWordSize
+   *          only words longer than this get processed
+   * @param minSubwordSize
+   *          only subwords longer than this get to the output stream
+   * @param maxSubwordSize
+   *          only subwords shorter than this get to the output stream
+   * @param onlyLongestMatch
+   *          Add only the longest matching subword to the stream
+   */
+  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary,
+      int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+    super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
   }

   @Override
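A usage sketch for the new Version-aware constructor documented above (illustrative, not from the patch; the two-entry dictionary is made up):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
    import org.apache.lucene.util.Version;

    class CompoundDemo {
      TokenStream decompose(String text) {
        // The Version argument enables code-point-correct lowercasing of the
        // dictionary entries for Version > 3.0 (see the class javadoc above).
        return new DictionaryCompoundWordTokenFilter(Version.LUCENE_31,
            new WhitespaceTokenizer(new StringReader(text)),
            new String[] { "fuss", "ball" }); // toy dictionary, illustrative only
      }
    }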
@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.TokenFilter; // for javadocs
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+import org.apache.lucene.util.Version;
 import org.xml.sax.InputSource;

 /**
@@ -41,20 +42,31 @@ import org.xml.sax.InputSource;
 public class HyphenationCompoundWordTokenFilter extends
     CompoundWordTokenFilterBase {
   private HyphenationTree hyphenator;

   /**
-   *
-   * @param input the {@link TokenStream} to process
-   * @param hyphenator the hyphenation pattern tree to use for hyphenation
-   * @param dictionary the word dictionary to match against
-   * @param minWordSize only words longer than this get processed
-   * @param minSubwordSize only subwords longer than this get to the output
-   *        stream
-   * @param maxSubwordSize only subwords shorter than this get to the output
-   *        stream
-   * @param onlyLongestMatch Add only the longest matching subword to the stream
+   * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
+   *
+   * @param matchVersion
+   *          Lucene version to enable correct Unicode 4.0 behavior in the
+   *          dictionaries if Version > 3.0. See <a
+   *          href="CompoundWordTokenFilterBase#version"
+   *          >CompoundWordTokenFilterBase</a> for details.
+   * @param input
+   *          the {@link TokenStream} to process
+   * @param hyphenator
+   *          the hyphenation pattern tree to use for hyphenation
+   * @param dictionary
+   *          the word dictionary to match against
+   * @param minWordSize
+   *          only words longer than this get processed
+   * @param minSubwordSize
+   *          only subwords longer than this get to the output stream
+   * @param maxSubwordSize
+   *          only subwords shorter than this get to the output stream
+   * @param onlyLongestMatch
+   *          Add only the longest matching subword to the stream
    */
-  public HyphenationCompoundWordTokenFilter(TokenStream input,
+  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
       HyphenationTree hyphenator, String[] dictionary, int minWordSize,
       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     this(input, hyphenator, makeDictionary(dictionary), minWordSize,
@@ -62,32 +74,138 @@ public class HyphenationCompoundWordTokenFilter extends
   }

   /**
-   *
-   * @param input the {@link TokenStream} to process
-   * @param hyphenator the hyphenation pattern tree to use for hyphenation
-   * @param dictionary the word dictionary to match against
+   * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
+   *
+   * @param matchVersion
+   *          Lucene version to enable correct Unicode 4.0 behavior in the
+   *          dictionaries if Version > 3.0. See <a
+   *          href="CompoundWordTokenFilterBase#version"
+   *          >CompoundWordTokenFilterBase</a> for details.
+   * @param input
+   *          the {@link TokenStream} to process
+   * @param hyphenator
+   *          the hyphenation pattern tree to use for hyphenation
+   * @param dictionary
+   *          the word dictionary to match against
    */
-  public HyphenationCompoundWordTokenFilter(TokenStream input,
+  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
       HyphenationTree hyphenator, String[] dictionary) {
     this(input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
   }

   /**
    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
    *
-   * @param input the {@link TokenStream} to process
-   * @param hyphenator the hyphenation pattern tree to use for hyphenation
-   * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
-   *        lower case strings.
+   * @param matchVersion
+   *          Lucene version to enable correct Unicode 4.0 behavior in the
+   *          dictionaries if Version > 3.0. See <a
+   *          href="CompoundWordTokenFilterBase#version"
+   *          >CompoundWordTokenFilterBase</a> for details.
+   * @param input
+   *          the {@link TokenStream} to process
+   * @param hyphenator
+   *          the hyphenation pattern tree to use for hyphenation
+   * @param dictionary
+   *          the word dictionary to match against. If this is a
+   *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
+   *          must have set ignoreCase=false and only contain lower case
+   *          strings.
    */
-  public HyphenationCompoundWordTokenFilter(TokenStream input,
+  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
       HyphenationTree hyphenator, Set dictionary) {
     this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
   }

+  /**
+   * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
+   *
+   * @param matchVersion
+   *          Lucene version to enable correct Unicode 4.0 behavior in the
+   *          dictionaries if Version > 3.0. See <a
+   *          href="CompoundWordTokenFilterBase#version"
+   *          >CompoundWordTokenFilterBase</a> for details.
+   * @param input
+   *          the {@link TokenStream} to process
+   * @param hyphenator
+   *          the hyphenation pattern tree to use for hyphenation
+   * @param dictionary
+   *          the word dictionary to match against. If this is a
+   *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
+   *          must have set ignoreCase=false and only contain lower case
+   *          strings.
+   * @param minWordSize
+   *          only words longer than this get processed
+   * @param minSubwordSize
+   *          only subwords longer than this get to the output stream
+   * @param maxSubwordSize
+   *          only subwords shorter than this get to the output stream
+   * @param onlyLongestMatch
+   *          Add only the longest matching subword to the stream
+   */
+  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+      HyphenationTree hyphenator, Set dictionary, int minWordSize,
+      int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+    super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
+        onlyLongestMatch);
+
+    this.hyphenator = hyphenator;
+  }
+
+  /**
+   * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
+   *
+   * @param input the {@link TokenStream} to process
+   * @param hyphenator the hyphenation pattern tree to use for hyphenation
+   * @param dictionary the word dictionary to match against
+   * @param minWordSize only words longer than this get processed
+   * @param minSubwordSize only subwords longer than this get to the output
+   *        stream
+   * @param maxSubwordSize only subwords shorter than this get to the output
+   *        stream
+   * @param onlyLongestMatch Add only the longest matching subword to the stream
+   * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[], int, int, int, boolean)} instead.
+   */
+  public HyphenationCompoundWordTokenFilter(TokenStream input,
+      HyphenationTree hyphenator, String[] dictionary, int minWordSize,
+      int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+    this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), minWordSize,
+        minSubwordSize, maxSubwordSize, onlyLongestMatch);
+  }
+
+  /**
+   * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
+   *
+   * @param input the {@link TokenStream} to process
+   * @param hyphenator the hyphenation pattern tree to use for hyphenation
+   * @param dictionary the word dictionary to match against
+   * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[])} instead.
+   */
+  public HyphenationCompoundWordTokenFilter(TokenStream input,
+      HyphenationTree hyphenator, String[] dictionary) {
+    this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
+        DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
+  }
+
+  /**
+   * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
+   *
+   * @param input the {@link TokenStream} to process
+   * @param hyphenator the hyphenation pattern tree to use for hyphenation
+   * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
+   *        lower case strings.
+   * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set)} instead.
+   */
+  public HyphenationCompoundWordTokenFilter(TokenStream input,
+      HyphenationTree hyphenator, Set dictionary) {
+    this(Version.LUCENE_30, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
+        DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
+  }
+
   /**
    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
    *
    * @param input the {@link TokenStream} to process
    * @param hyphenator the hyphenation pattern tree to use for hyphenation
    * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
@@ -98,11 +216,12 @@ public class HyphenationCompoundWordTokenFilter extends
    * @param maxSubwordSize only subwords shorter than this get to the output
    *        stream
    * @param onlyLongestMatch Add only the longest matching subword to the stream
+   * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)} instead.
    */
   public HyphenationCompoundWordTokenFilter(TokenStream input,
       HyphenationTree hyphenator, Set dictionary, int minWordSize,
       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
+    super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
         onlyLongestMatch);

     this.hyphenator = hyphenator;
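A similar sketch for the hyphenation-based variant (illustrative; the hyphenator is assumed to be built elsewhere from a hyphenation pattern file, which this patch does not touch, and the dictionary is again made up):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
    import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
    import org.apache.lucene.util.Version;

    class HyphenationDemo {
      // 'hyphenator' is assumed to be loaded elsewhere; obtaining it is
      // outside the scope of this commit.
      TokenStream decompose(HyphenationTree hyphenator, String text) {
        return new HyphenationCompoundWordTokenFilter(Version.LUCENE_31,
            new WhitespaceTokenizer(new StringReader(text)),
            hyphenator, new String[] { "fuss", "ball" }); // toy dictionary
      }
    }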
@@ -92,7 +92,7 @@ public final class CzechAnalyzer extends Analyzer {

   private static class DefaultSetHolder {
     private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
-        Arrays.asList(CZECH_STOP_WORDS), false));
+        Version.LUCENE_CURRENT, Arrays.asList(CZECH_STOP_WORDS), false));
   }

   /**
@@ -121,7 +121,7 @@ public final class CzechAnalyzer extends Analyzer {
    */
   public CzechAnalyzer(Version matchVersion, Set<?> stopwords) {
     this.matchVersion = matchVersion;
-    this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
+    this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
   }

@@ -134,7 +134,7 @@ public final class CzechAnalyzer extends Analyzer {
    * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead
    */
   public CzechAnalyzer(Version matchVersion, String... stopwords) {
-    this(matchVersion, StopFilter.makeStopSet( stopwords ));
+    this(matchVersion, StopFilter.makeStopSet( matchVersion, stopwords ));
   }

   /**
@@ -206,8 +206,7 @@ public final class CzechAnalyzer extends Analyzer {
     TokenStream result = new StandardTokenizer( matchVersion, reader );
     result = new StandardFilter( result );
     result = new LowerCaseFilter( matchVersion, result );
-    result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                             result, stoptable );
+    result = new StopFilter( matchVersion, result, stoptable );
     if (matchVersion.onOrAfter(Version.LUCENE_31))
       result = new CzechStemFilter(result);
     return result;
@@ -236,8 +235,7 @@ public final class CzechAnalyzer extends Analyzer {
       streams.source = new StandardTokenizer(matchVersion, reader);
       streams.result = new StandardFilter(streams.source);
       streams.result = new LowerCaseFilter(matchVersion, streams.result);
-      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                                      streams.result, stoptable);
+      streams.result = new StopFilter( matchVersion, streams.result, stoptable);
       if (matchVersion.onOrAfter(Version.LUCENE_31))
         streams.result = new CzechStemFilter(streams.result);
       setPreviousTokenStream(streams);
@@ -83,7 +83,7 @@ public class GermanAnalyzer extends Analyzer {

   private static class DefaultSetHolder {
     private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
-        Arrays.asList(GERMAN_STOP_WORDS), false));
+        Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
   }

   /**
@@ -131,8 +131,8 @@ public class GermanAnalyzer extends Analyzer {
    * a stemming exclusion set
    */
   public GermanAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
-    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
-    exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
+    exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
     setOverridesTokenStreamMethod(GermanAnalyzer.class);
     this.matchVersion = matchVersion;
   }
@@ -142,7 +142,7 @@ public class GermanAnalyzer extends Analyzer {
    * @deprecated use {@link #GermanAnalyzer(Version, Set)}
    */
   public GermanAnalyzer(Version matchVersion, String... stopwords) {
-    this(matchVersion, StopFilter.makeStopSet(stopwords));
+    this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
   }

   /**
@@ -167,7 +167,7 @@ public class GermanAnalyzer extends Analyzer {
    * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
    */
   public void setStemExclusionTable(String[] exclusionlist) {
-    exclusionSet = StopFilter.makeStopSet(exclusionlist);
+    exclusionSet = StopFilter.makeStopSet(matchVersion, exclusionlist);
     setPreviousTokenStream(null); // force a new stemmer to be created
   }

@@ -175,8 +175,8 @@ public class GermanAnalyzer extends Analyzer {
    * Builds an exclusionlist from a {@link Map}
    * @deprecated use {@link #GermanAnalyzer(Version, Set, Set)} instead
    */
-  public void setStemExclusionTable(Map exclusionlist) {
-    exclusionSet = new HashSet(exclusionlist.keySet());
+  public void setStemExclusionTable(Map<?,?> exclusionlist) {
+    exclusionSet = new HashSet<Object>(exclusionlist.keySet());
     setPreviousTokenStream(null); // force a new stemmer to be created
   }

@@ -201,8 +201,7 @@ public class GermanAnalyzer extends Analyzer {
     TokenStream result = new StandardTokenizer(matchVersion, reader);
     result = new StandardFilter(result);
     result = new LowerCaseFilter(matchVersion, result);
-    result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                            result, stopSet);
+    result = new StopFilter( matchVersion, result, stopSet);
     result = new GermanStemFilter(result, exclusionSet);
     return result;
   }
@@ -235,8 +234,7 @@ public class GermanAnalyzer extends Analyzer {
       streams.source = new StandardTokenizer(matchVersion, reader);
       streams.result = new StandardFilter(streams.source);
       streams.result = new LowerCaseFilter(matchVersion, streams.result);
-      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                                      streams.result, stopSet);
+      streams.result = new StopFilter( matchVersion, streams.result, stopSet);
       streams.result = new GermanStemFilter(streams.result, exclusionSet);
       setPreviousTokenStream(streams);
     } else {
@@ -70,7 +70,7 @@ public final class GreekAnalyzer extends Analyzer

   private static class DefaultSetHolder {
     private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
-        Arrays.asList(GREEK_STOP_WORDS), false));
+        Version.LUCENE_CURRENT, Arrays.asList(GREEK_STOP_WORDS), false));
   }

   /**
@@ -93,7 +93,7 @@ public final class GreekAnalyzer extends Analyzer
    * a stopword set
    */
   public GreekAnalyzer(Version matchVersion, Set<?> stopwords) {
-    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
+    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
     this.matchVersion = matchVersion;
   }

@@ -104,7 +104,7 @@ public final class GreekAnalyzer extends Analyzer
    */
   public GreekAnalyzer(Version matchVersion, String... stopwords)
   {
-    this(matchVersion, StopFilter.makeStopSet(stopwords));
+    this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
   }

   /**
@@ -127,8 +127,7 @@ public final class GreekAnalyzer extends Analyzer
   {
     TokenStream result = new StandardTokenizer(matchVersion, reader);
     result = new GreekLowerCaseFilter(result);
-    result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                            result, stopSet);
+    result = new StopFilter(matchVersion, result, stopSet);
     return result;
   }

@@ -152,8 +151,7 @@ public final class GreekAnalyzer extends Analyzer
       streams = new SavedStreams();
       streams.source = new StandardTokenizer(matchVersion, reader);
       streams.result = new GreekLowerCaseFilter(streams.source);
-      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                                      streams.result, stopSet);
+      streams.result = new StopFilter(matchVersion, streams.result, stopSet);
       setPreviousTokenStream(streams);
     } else {
       streams.source.reset(reader);
@@ -126,7 +126,7 @@ public final class PersianAnalyzer extends Analyzer {
    * a stopword set
    */
   public PersianAnalyzer(Version matchVersion, Set<?> stopwords){
-    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
+    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
     this.matchVersion = matchVersion;
   }

@@ -135,7 +135,7 @@ public final class PersianAnalyzer extends Analyzer {
    * @deprecated use {@link #PersianAnalyzer(Version, Set)} instead
    */
   public PersianAnalyzer(Version matchVersion, String... stopwords) {
-    this(matchVersion, StopFilter.makeStopSet(stopwords));
+    this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
   }

   /**
@@ -175,8 +175,7 @@ public final class PersianAnalyzer extends Analyzer {
      * the order here is important: the stopword list is normalized with the
      * above!
      */
-    result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                            result, stoptable);
+    result = new StopFilter(matchVersion, result, stoptable);
     return result;
   }

@@ -209,8 +208,7 @@ public final class PersianAnalyzer extends Analyzer {
      * the order here is important: the stopword list is normalized with the
      * above!
      */
-    streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                                    streams.result, stoptable);
+    streams.result = new StopFilter(matchVersion, streams.result, stoptable);
     setPreviousTokenStream(streams);
   } else {
     streams.source.reset(reader);
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;

 /**
  * Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be
@@ -35,44 +36,77 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  * @see <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>
  */
 public final class ElisionFilter extends TokenFilter {
-  private CharArraySet articles = null;
-  private TermAttribute termAtt;
+  private CharArraySet articles = CharArraySet.EMPTY_SET;
+  private final TermAttribute termAtt;
+  private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
+      new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(
+          "l", "m", "t", "qu", "n", "s", "j"), true));

-  private static char[] apostrophes = {'\'', '’'};
+  private static char[] apostrophes = {'\'', '\u2019'};

   /**
    * Set the stopword articles
+   * @param matchVersion the lucene backwards compatibility version
    * @param articles a set of articles
+   * @deprecated use {@link #ElisionFilter(Version, TokenStream, Set)} instead
    */
+  public void setArticles(Version matchVersion, Set<?> articles) {
+    this.articles = CharArraySet.unmodifiableSet(
+        CharArraySet.copy(matchVersion, articles));
+  }
+
+  /**
+   * Set the stopword articles
+   * @param articles a set of articles
+   * @deprecated use {@link #setArticles(Version, Set)} instead
+   */
   public void setArticles(Set<?> articles) {
-    if (articles instanceof CharArraySet)
-      this.articles = (CharArraySet) articles;
-    else
-      this.articles = new CharArraySet(articles, true);
+    setArticles(Version.LUCENE_CURRENT, articles);
   }
+
+  /**
+   * Constructs an elision filter with standard stop words
+   */
+  protected ElisionFilter(Version matchVersion, TokenStream input) {
+    this(matchVersion, input, DEFAULT_ARTICLES);
+  }

   /**
    * Constructs an elision filter with standard stop words
+   * @deprecated use {@link #ElisionFilter(Version, TokenStream)} instead
    */
   protected ElisionFilter(TokenStream input) {
-    super(input);
-    this.articles = new CharArraySet(Arrays.asList(
-        "l", "m", "t", "qu", "n", "s", "j"), true);
-    termAtt = addAttribute(TermAttribute.class);
+    this(Version.LUCENE_30, input);
   }

   /**
    * Constructs an elision filter with a Set of stop words
+   * @deprecated use {@link #ElisionFilter(Version, TokenStream, Set)} instead
+   */
+  public ElisionFilter(TokenStream input, Set<?> articles) {
+    this(Version.LUCENE_30, input, articles);
+  }
+
+  /**
+   * Constructs an elision filter with a Set of stop words
+   * @param matchVersion the lucene backwards compatibility version
+   * @param input the source {@link TokenStream}
+   * @param articles a set of stopword articles
    */
-  public ElisionFilter(TokenStream input, Set<?> articles) {
+  public ElisionFilter(Version matchVersion, TokenStream input, Set<?> articles) {
     super(input);
-    setArticles(articles);
+    this.articles = CharArraySet.unmodifiableSet(
+        new CharArraySet(matchVersion, articles, true));
     termAtt = addAttribute(TermAttribute.class);
   }

   /**
    * Constructs an elision filter with an array of stop words
+   * @deprecated use {@link #ElisionFilter(Version, TokenStream, Set)} instead
    */
   public ElisionFilter(TokenStream input, String[] articles) {
-    super(input);
-    this.articles = new CharArraySet(Arrays.asList(articles), true);
-    termAtt = addAttribute(TermAttribute.class);
+    this(Version.LUCENE_CURRENT, input,
+        new CharArraySet(Version.LUCENE_CURRENT,
+            Arrays.asList(articles), true));
   }

   /**
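A usage sketch of the new Version-aware ElisionFilter constructor (illustrative, not from the patch; the article list mirrors DEFAULT_ARTICLES above):

    import java.io.StringReader;
    import java.util.Arrays;
    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.fr.ElisionFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.util.Version;

    class ElisionDemo {
      TokenStream strip(String text) {
        // "l'avion" -> "avion": the article set is now built Version-aware.
        CharArraySet articles = new CharArraySet(Version.LUCENE_31,
            Arrays.asList("l", "m", "t", "qu", "n", "s", "j"), true);
        return new ElisionFilter(Version.LUCENE_31,
            new StandardTokenizer(Version.LUCENE_31, new StringReader(text)),
            articles);
      }
    }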
@@ -34,6 +34,7 @@ import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.HashSet;
+import java.util.Collections;
 import java.util.Map;
 import java.util.Set;

@@ -98,7 +99,7 @@ public final class FrenchAnalyzer extends Analyzer {
    * Contains words that should be indexed but not stemmed.
    */
   //TODO make this final in 3.0
-  private Set<?> excltable = new HashSet();
+  private Set<?> excltable = Collections.<Object>emptySet();

   private final Version matchVersion;

@@ -112,7 +113,7 @@ public final class FrenchAnalyzer extends Analyzer {

   private static class DefaultSetHolder {
     static final Set<?> DEFAULT_STOP_SET = CharArraySet
-        .unmodifiableSet(new CharArraySet(Arrays.asList(FRENCH_STOP_WORDS),
+        .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
             false));
   }

@@ -148,9 +149,10 @@ public final class FrenchAnalyzer extends Analyzer {
   public FrenchAnalyzer(Version matchVersion, Set<?> stopwords,
       Set<?> stemExclutionSet) {
     this.matchVersion = matchVersion;
-    this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
+    this.stoptable = CharArraySet.unmodifiableSet(CharArraySet
+        .copy(matchVersion, stopwords));
     this.excltable = CharArraySet.unmodifiableSet(CharArraySet
-        .copy(stemExclutionSet));
+        .copy(matchVersion, stemExclutionSet));
   }

@@ -159,7 +161,7 @@ public final class FrenchAnalyzer extends Analyzer {
    * @deprecated use {@link #FrenchAnalyzer(Version, Set)} instead
    */
   public FrenchAnalyzer(Version matchVersion, String... stopwords) {
-    this(matchVersion, StopFilter.makeStopSet(stopwords));
+    this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
   }

   /**
@@ -176,7 +178,7 @@ public final class FrenchAnalyzer extends Analyzer {
    * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
    */
   public void setStemExclusionTable(String... exclusionlist) {
-    excltable = StopFilter.makeStopSet(exclusionlist);
+    excltable = StopFilter.makeStopSet(matchVersion, exclusionlist);
     setPreviousTokenStream(null); // force a new stemmer to be created
   }

@@ -184,8 +186,8 @@ public final class FrenchAnalyzer extends Analyzer {
    * Builds an exclusionlist from a Map.
    * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
    */
-  public void setStemExclusionTable(Map exclusionlist) {
-    excltable = new HashSet(exclusionlist.keySet());
+  public void setStemExclusionTable(Map<?,?> exclusionlist) {
+    excltable = new HashSet<Object>(exclusionlist.keySet());
     setPreviousTokenStream(null); // force a new stemmer to be created
   }

@@ -195,7 +197,7 @@ public final class FrenchAnalyzer extends Analyzer {
    * @deprecated use {@link #FrenchAnalyzer(Version, Set, Set)} instead
    */
   public void setStemExclusionTable(File exclusionlist) throws IOException {
-    excltable = new HashSet(WordlistLoader.getWordSet(exclusionlist));
+    excltable = new HashSet<Object>(WordlistLoader.getWordSet(exclusionlist));
     setPreviousTokenStream(null); // force a new stemmer to be created
   }

@@ -211,8 +213,7 @@ public final class FrenchAnalyzer extends Analyzer {
   public final TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream result = new StandardTokenizer(matchVersion, reader);
     result = new StandardFilter(result);
-    result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                            result, stoptable);
+    result = new StopFilter(matchVersion, result, stoptable);
     result = new FrenchStemFilter(result, excltable);
     // Convert to lowercase after stemming!
     result = new LowerCaseFilter(matchVersion, result);
@@ -240,8 +241,7 @@ public final class FrenchAnalyzer extends Analyzer {
       streams = new SavedStreams();
       streams.source = new StandardTokenizer(matchVersion, reader);
       streams.result = new StandardFilter(streams.source);
-      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
-                                      streams.result, stoptable);
+      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
       streams.result = new FrenchStemFilter(streams.result, excltable);
       // Convert to lowercase after stemming!
       streams.result = new LowerCaseFilter(matchVersion, streams.result);
@@ -73,7 +73,8 @@ public class PatternAnalyzer extends Analyzer {
   public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

   private static final CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
-    CharArraySet.unmodifiableSet(new CharArraySet(Arrays.asList(
+    CharArraySet.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
+      Arrays.asList(
       "a", "about", "above", "across", "adj", "after", "afterwards",
       "again", "against", "albeit", "all", "almost", "alone", "along",
       "already", "also", "although", "always", "among", "amongst", "an",
@@ -153,7 +154,7 @@ public class PatternAnalyzer extends Analyzer {
    * if non-null, ignores all tokens that are contained in the
    * given stop set (after previously having applied toLowerCase()
    * if applicable). For example, created via
-   * {@link StopFilter#makeStopSet(String[])}and/or
+   * {@link StopFilter#makeStopSet(Version, String[])}and/or
    * {@link org.apache.lucene.analysis.WordlistLoader}as in
    * <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")</code>
    * or <a href="http://www.unine.ch/info/clef/">other stop words
@@ -199,7 +200,7 @@ public class PatternAnalyzer extends Analyzer {
     }
     else {
       stream = new PatternTokenizer(text, pattern, toLowerCase);
-      if (stopWords != null) stream = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), stream, stopWords);
+      if (stopWords != null) stream = new StopFilter(matchVersion, stream, stopWords);
     }

     return stream;
@@ -387,12 +388,12 @@ public class PatternAnalyzer extends Analyzer {
     private int pos;
     private final boolean isLetter;
     private final boolean toLowerCase;
-    private final Set stopWords;
+    private final Set<?> stopWords;
     private static final Locale locale = Locale.getDefault();
     private TermAttribute termAtt = addAttribute(TermAttribute.class);
     private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

-    public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
+    public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
       this.str = str;
       this.isLetter = isLetter;
       this.toLowerCase = toLowerCase;
@@ -80,8 +80,8 @@ public class DutchAnalyzer extends Analyzer {

  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET = CharArraySet
        .unmodifiableSet(new CharArraySet(Arrays.asList(DUTCH_STOP_WORDS),
            false));
        .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
            Arrays.asList(DUTCH_STOP_WORDS), false));
  }

@@ -116,8 +116,8 @@ public class DutchAnalyzer extends Analyzer {
  }

  public DutchAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionTable){
    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
    excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionTable));
    stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
    excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
    this.matchVersion = matchVersion;
    setOverridesTokenStreamMethod(DutchAnalyzer.class);
  }

@@ -130,7 +130,7 @@ public class DutchAnalyzer extends Analyzer {
   * @deprecated use {@link #DutchAnalyzer(Version, Set)} instead
   */
  public DutchAnalyzer(Version matchVersion, String... stopwords) {
    this(matchVersion, StopFilter.makeStopSet(stopwords));
    this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
  }

  /**

@@ -168,7 +168,7 @@ public class DutchAnalyzer extends Analyzer {
   * @deprecated use {@link #DutchAnalyzer(Version, Set, Set)} instead
   */
  public void setStemExclusionTable(String... exclusionlist) {
    excltable = StopFilter.makeStopSet(exclusionlist);
    excltable = StopFilter.makeStopSet(matchVersion, exclusionlist);
    setPreviousTokenStream(null); // force a new stemmer to be created
  }

@@ -222,8 +222,7 @@ public class DutchAnalyzer extends Analyzer {
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(matchVersion, reader);
    result = new StandardFilter(result);
    result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                            result, stoptable);
    result = new StopFilter(matchVersion, result, stoptable);
    result = new DutchStemFilter(result, excltable, stemdict);
    return result;
  }

@@ -256,8 +255,7 @@ public class DutchAnalyzer extends Analyzer {
      streams = new SavedStreams();
      streams.source = new StandardTokenizer(matchVersion, reader);
      streams.result = new StandardFilter(streams.source);
      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                      streams.result, stoptable);
      streams.result = new StopFilter(matchVersion, streams.result, stoptable);
      streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
      setPreviousTokenStream(streams);
    } else {
@@ -179,8 +179,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
    }
    HashSet<String> stopWords = stopWordsPerField.get(fieldName);
    if (stopWords != null) {
      result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                              result, stopWords);
      result = new StopFilter(matchVersion, result, stopWords);
    }
    return result;
  }

@@ -223,8 +222,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
      /* if there are any stopwords for the field, save the stopfilter */
      HashSet<String> stopWords = stopWordsPerField.get(fieldName);
      if (stopWords != null)
        streams.withStopFilter = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                                streams.wrapped, stopWords);
        streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
      else
        streams.withStopFilter = streams.wrapped;

@@ -245,8 +243,7 @@ public class QueryAutoStopWordAnalyzer extends Analyzer {
      streams.wrapped = result;
      HashSet<String> stopWords = stopWordsPerField.get(fieldName);
      if (stopWords != null)
        streams.withStopFilter = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                                streams.wrapped, stopWords);
        streams.withStopFilter = new StopFilter(matchVersion, streams.wrapped, stopWords);
      else
        streams.withStopFilter = streams.wrapped;
    }
@@ -59,8 +59,8 @@ public final class RussianAnalyzer extends Analyzer

  private static class DefaultSetHolder {
    static final Set<?> DEFAULT_STOP_SET = CharArraySet
        .unmodifiableSet(new CharArraySet(Arrays.asList(RUSSIAN_STOP_WORDS),
            false));
        .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
            Arrays.asList(RUSSIAN_STOP_WORDS), false));
  }

  /**

@@ -79,7 +79,7 @@ public final class RussianAnalyzer extends Analyzer
   * @deprecated use {@link #RussianAnalyzer(Version, Set)} instead
   */
  public RussianAnalyzer(Version matchVersion, String... stopwords) {
    this(matchVersion, StopFilter.makeStopSet(stopwords));
    this(matchVersion, StopFilter.makeStopSet(matchVersion, stopwords));
  }

  /**

@@ -91,7 +91,7 @@ public final class RussianAnalyzer extends Analyzer
   * a stopword set
   */
  public RussianAnalyzer(Version matchVersion, Set<?> stopwords){
    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
    stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
    this.matchVersion = matchVersion;
  }

@@ -119,8 +119,7 @@ public final class RussianAnalyzer extends Analyzer
  {
    TokenStream result = new RussianLetterTokenizer(reader);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                            result, stopSet);
    result = new StopFilter(matchVersion, result, stopSet);
    result = new RussianStemFilter(result);
    return result;
  }

@@ -147,8 +146,7 @@ public final class RussianAnalyzer extends Analyzer
      streams = new SavedStreams();
      streams.source = new RussianLetterTokenizer(reader);
      streams.result = new LowerCaseFilter(matchVersion, streams.source);
      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                      streams.result, stopSet);
      streams.result = new StopFilter(matchVersion, streams.result, stopSet);
      streams.result = new RussianStemFilter(streams.result);
      setPreviousTokenStream(streams);
    } else {
@@ -48,8 +48,7 @@ public class ThaiAnalyzer extends Analyzer {
    TokenStream ts = new StandardTokenizer(matchVersion, reader);
    ts = new StandardFilter(ts);
    ts = new ThaiWordFilter(ts);
    ts = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                        ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    ts = new StopFilter(matchVersion, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return ts;
  }

@@ -73,8 +72,7 @@ public class ThaiAnalyzer extends Analyzer {
      streams.source = new StandardTokenizer(matchVersion, reader);
      streams.result = new StandardFilter(streams.source);
      streams.result = new ThaiWordFilter(streams.result);
      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                      streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
      streams.result = new StopFilter(matchVersion, streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
      setPreviousTokenStream(streams);
    } else {
      streams.source.reset(reader);
@@ -42,7 +42,7 @@ public class TestElision extends BaseTokenStreamTestCase {
    Set articles = new HashSet();
    articles.add("l");
    articles.add("M");
    TokenFilter filter = new ElisionFilter(tokenizer, articles);
    TokenFilter filter = new ElisionFilter(Version.LUCENE_CURRENT, tokenizer, articles);
    List tas = filtre(filter);
    assertEquals("embrouille", tas.get(4));
    assertEquals("O'brian", tas.get(6));
@@ -153,8 +153,7 @@ public class SmartChineseAnalyzer extends Analyzer {
    // The porter stemming is too strict, this is not a bug, this is a feature:)
    result = new PorterStemFilter(result);
    if (!stopWords.isEmpty()) {
      result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                              result, stopWords, false);
      result = new StopFilter(matchVersion, result, stopWords, false);
    }
    return result;
  }

@@ -175,8 +174,7 @@ public class SmartChineseAnalyzer extends Analyzer {
        streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream);
        streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream);
        if (!stopWords.isEmpty()) {
          streams.filteredTokenStream = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                                       streams.filteredTokenStream, stopWords, false);
          streams.filteredTokenStream = new StopFilter(matchVersion, streams.filteredTokenStream, stopWords, false);
        }
      } else {
        streams.tokenStream.reset(reader);
@@ -1076,7 +1076,7 @@ public class TestQPHelper extends LocalizedTestCase {
  public void testStopwords() throws Exception {
    StandardQueryParser qp = new StandardQueryParser();
    qp.setAnalyzer(
        new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet("the", "foo" )));
        new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "foo" )));

    Query result = qp.parse("a:the OR a:foo", "a");
    assertNotNull("result is null and it shouldn't be", result);

@@ -1099,7 +1099,7 @@ public class TestQPHelper extends LocalizedTestCase {
  public void testPositionIncrement() throws Exception {
    StandardQueryParser qp = new StandardQueryParser();
    qp.setAnalyzer(
        new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet("the", "in", "are", "this" )));
        new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "in", "are", "this" )));

    qp.setEnablePositionIncrements(true);
@@ -1056,7 +1056,7 @@ public class TestQueryParserWrapper extends LocalizedTestCase {
  }

  public void testStopwords() throws Exception {
    QueryParserWrapper qp = new QueryParserWrapper("a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet("the", "foo")));
    QueryParserWrapper qp = new QueryParserWrapper("a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "foo")));
    Query result = qp.parse("a:the OR a:foo");
    assertNotNull("result is null and it shouldn't be", result);
    assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);

@@ -1075,7 +1075,7 @@ public class TestQueryParserWrapper extends LocalizedTestCase {
  }

  public void testPositionIncrement() throws Exception {
    QueryParserWrapper qp = new QueryParserWrapper("a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "in", "are", "this")));
    qp.setEnablePositionIncrements(true);
    String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
    // 0 2 5 7 8
@@ -50,7 +50,7 @@ public class SnowballAnalyzer extends Analyzer {
  /** Builds the named analyzer with the given stop words. */
  public SnowballAnalyzer(Version matchVersion, String name, String[] stopWords) {
    this(matchVersion, name);
    stopSet = StopFilter.makeStopSet(stopWords);
    stopSet = StopFilter.makeStopSet(matchVersion, stopWords);
  }

  /** Constructs a {@link StandardTokenizer} filtered by a {@link

@@ -62,7 +62,7 @@ public class SnowballAnalyzer extends Analyzer {
    result = new StandardFilter(result);
    result = new LowerCaseFilter(matchVersion, result);
    if (stopSet != null)
      result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
      result = new StopFilter(matchVersion,
                              result, stopSet);
    result = new SnowballFilter(result, name);
    return result;

@@ -93,7 +93,7 @@ public class SnowballAnalyzer extends Analyzer {
    streams.result = new StandardFilter(streams.source);
    streams.result = new LowerCaseFilter(matchVersion, streams.result);
    if (stopSet != null)
      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
      streams.result = new StopFilter(matchVersion,
                                      streams.result, stopSet);
    streams.result = new SnowballFilter(streams.result, name);
    setPreviousTokenStream(streams);
@@ -6,6 +6,9 @@ import java.util.Collections;
import java.util.Iterator;
import java.util.Set;

import org.apache.lucene.util.CharacterUtils;
import org.apache.lucene.util.Version;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with

@@ -32,45 +35,113 @@ import java.util.Set;
 * etc. It is designed to be quick to test if a char[]
 * is in the set without the necessity of converting it
 * to a String first.
 * <p>You must specify the required {@link Version}
 * compatibility when creating {@link CharArraySet}:
 * <ul>
 *   <li> As of 3.1, supplementary characters are
 *        properly lowercased.</li>
 * </ul>
 * Before 3.1 supplementary characters could not be
 * lowercased correctly due to the lack of Unicode 4
 * support in JDK 1.4. To use instances of
 * {@link CharArraySet} with the behavior before Lucene
 * 3.1 pass a {@link Version} < 3.1 to the constructors.
 * <p>
 * <em>Please note:</em> This class implements {@link java.util.Set Set} but
 * does not behave like it should in all cases. The generic type is
 * {@code Set<Object>}, because you can add any object to it
 * that has a string representation. The add methods will use
 * {@link Object#toString} and store the result using a {@code char[]}
 * buffer. The same behaviour have the {@code contains()} methods.
 * buffer. The {@code contains()} methods behave the same way.
 * The {@link #iterator()} returns an {@code Iterator<String>}.
 * For type safety also {@link #stringIterator()} is provided.
 */
public class CharArraySet extends AbstractSet<Object> {
  private final static int INIT_SIZE = 8;
  private char[][] entries;
  private int count;
  private final boolean ignoreCase;
  public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet(new CharArraySet(0, false));
  public static final CharArraySet EMPTY_SET = CharArraySet.unmodifiableSet(
      new CharArraySet(Version.LUCENE_CURRENT, 0, false));

  private final CharacterUtils charUtils;
  private final Version matchVersion;

  /** Create set with enough capacity to hold startSize
   *  terms */
  public CharArraySet(int startSize, boolean ignoreCase) {
  /**
   * Create set with enough capacity to hold startSize terms
   *
   * @param matchVersion
   *          compatibility match version; see the <a href="#version">Version
   *          note</a> above for details.
   * @param startSize
   *          the initial capacity
   * @param ignoreCase
   *          <code>false</code> if and only if the set should be case sensitive
   *          otherwise <code>true</code>.
   */
  public CharArraySet(Version matchVersion, int startSize, boolean ignoreCase) {
    this.ignoreCase = ignoreCase;
    int size = INIT_SIZE;
    while(startSize + (startSize>>2) > size)
      size <<= 1;
    entries = new char[size][];
    this.charUtils = CharacterUtils.getInstance(matchVersion);
    this.matchVersion = matchVersion;
  }

  /** Create set from a Collection of char[] or String */
  /**
   * Creates a set from a Collection of objects.
   *
   * @param matchVersion
   *          compatibility match version; see the <a href="#version">Version
   *          note</a> above for details.
   * @param c
   *          a collection whose elements are to be placed into the set
   * @param ignoreCase
   *          <code>false</code> if and only if the set should be case sensitive
   *          otherwise <code>true</code>.
   */
  public CharArraySet(Version matchVersion, Collection<? extends Object> c, boolean ignoreCase) {
    this(matchVersion, c.size(), ignoreCase);
    addAll(c);
  }

  /**
   * Creates a set with enough capacity to hold startSize terms
   *
   * @param startSize
   *          the initial capacity
   * @param ignoreCase
   *          <code>false</code> if and only if the set should be case sensitive
   *          otherwise <code>true</code>.
   * @deprecated use {@link #CharArraySet(Version, int, boolean)} instead
   */
  public CharArraySet(int startSize, boolean ignoreCase) {
    this(Version.LUCENE_30, startSize, ignoreCase);
  }

  /**
   * Creates a set from a Collection of objects.
   *
   * @param c
   *          a collection whose elements are to be placed into the set
   * @param ignoreCase
   *          <code>false</code> if and only if the set should be case sensitive
   *          otherwise <code>true</code>.
   * @deprecated use {@link #CharArraySet(Version, Collection, boolean)} instead
   */
  public CharArraySet(Collection<? extends Object> c, boolean ignoreCase) {
    this(c.size(), ignoreCase);
    this(Version.LUCENE_30, c.size(), ignoreCase);
    addAll(c);
  }

  /** Create set from entries */
  private CharArraySet(char[][] entries, boolean ignoreCase, int count){
  private CharArraySet(Version matchVersion, char[][] entries, boolean ignoreCase, int count){
    this.entries = entries;
    this.ignoreCase = ignoreCase;
    this.count = count;
    this.charUtils = CharacterUtils.getInstance(matchVersion);
    this.matchVersion = matchVersion;
  }

  /** true if the <code>len</code> chars of <code>text</code> starting at <code>off</code>

@@ -131,8 +202,11 @@ public class CharArraySet extends AbstractSet<Object> {
   */
  public boolean add(char[] text) {
    if (ignoreCase)
      for(int i=0;i<text.length;i++)
        text[i] = Character.toLowerCase(text[i]);
      for(int i=0;i<text.length;){
        i += Character.toChars(
            Character.toLowerCase(
                charUtils.codePointAt(text, i)), text, i);
      }
    int slot = getSlot(text, 0, text.length);
    if (entries[slot] != null) return false;
    entries[slot] = text;
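The rewritten add(char[]) above is the heart of the change: instead of lowercasing one UTF-16 unit at a time, it reads a whole code point, lowercases it, writes it back with Character.toChars, and advances by the number of units written. A standalone sketch of that loop using only java.lang.Character (the in-place write assumes, as the committed code does, that the lowercase form occupies the same number of chars):

// --- illustrative sketch, not part of the commit ---
class CodePointLowercaseSketch {
  // Lowercase a char[] in place, surrogate-pair aware, mirroring the
  // loop in CharArraySet.add(char[]) above.
  static char[] toLowerCase(char[] text) {
    for (int i = 0; i < text.length;) {
      // codePointAt joins a high+low surrogate pair into one code point
      int cp = Character.codePointAt(text, i);
      // toChars writes 1 or 2 chars back and returns how many it wrote,
      // so supplementary characters advance the index by 2
      i += Character.toChars(Character.toLowerCase(cp), text, i);
    }
    return text;
  }

  public static void main(String[] args) {
    // U+1041C DESERET CAPITAL LETTER GAY lowercases to U+10444
    char[] s = "A\ud801\udc1cB".toCharArray();
    System.out.println(new String(toLowerCase(s))); // "a" + U+10444 + "b"
  }
}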
@@ -148,10 +222,13 @@ public class CharArraySet extends AbstractSet<Object> {
  private boolean equals(char[] text1, int off, int len, char[] text2) {
    if (len != text2.length)
      return false;
    final int limit = off+len;
    if (ignoreCase) {
      for(int i=0;i<len;i++) {
        if (Character.toLowerCase(text1[off+i]) != text2[i])
      for(int i=0;i<len;) {
        final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
        if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
          return false;
        i += Character.charCount(codePointAt);
      }
    } else {
      for(int i=0;i<len;i++) {

@@ -167,9 +244,11 @@ public class CharArraySet extends AbstractSet<Object> {
    if (len != text2.length)
      return false;
    if (ignoreCase) {
      for(int i=0;i<len;i++) {
        if (Character.toLowerCase(text1.charAt(i)) != text2[i])
      for(int i=0;i<len;) {
        final int codePointAt = charUtils.codePointAt(text1, i);
        if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
          return false;
        i += Character.charCount(codePointAt);
      }
    } else {
      for(int i=0;i<len;i++) {

@@ -179,6 +258,8 @@ public class CharArraySet extends AbstractSet<Object> {
    }
    return true;
  }

  private void rehash() {
    final int newSize = 2*entries.length;

@@ -198,8 +279,10 @@ public class CharArraySet extends AbstractSet<Object> {
    int code = 0;
    final int stop = offset + len;
    if (ignoreCase) {
      for (int i=offset; i<stop; i++) {
        code = code*31 + Character.toLowerCase(text[i]);
      for (int i=offset; i<stop;) {
        final int codePointAt = charUtils.codePointAt(text, i, stop);
        code = code*31 + Character.toLowerCase(codePointAt);
        i += Character.charCount(codePointAt);
      }
    } else {
      for (int i=offset; i<stop; i++) {

@@ -213,8 +296,10 @@ public class CharArraySet extends AbstractSet<Object> {
    int code = 0;
    int len = text.length();
    if (ignoreCase) {
      for (int i=0; i<len; i++) {
        code = code*31 + Character.toLowerCase(text.charAt(i));
      for (int i=0; i<len;) {
        int codePointAt = charUtils.codePointAt(text, i);
        code = code*31 + Character.toLowerCase(codePointAt);
        i += Character.charCount(codePointAt);
      }
    } else {
      for (int i=0; i<len; i++) {

@@ -274,7 +359,7 @@ public class CharArraySet extends AbstractSet<Object> {
     * Instead of delegating calls to the given set copy the low-level values to
     * the unmodifiable Subclass
     */
    return new UnmodifiableCharArraySet(set.entries, set.ignoreCase, set.count);
    return new UnmodifiableCharArraySet(set.matchVersion, set.entries, set.ignoreCase, set.count);
  }

  /**

@@ -286,15 +371,33 @@ public class CharArraySet extends AbstractSet<Object> {
   * @return a copy of the given set as a {@link CharArraySet}. If the given set
   *         is a {@link CharArraySet} the ignoreCase property will be
   *         preserved.
   * @deprecated use {@link #copy(Version, Set)} instead
   */
  public static CharArraySet copy(Set<?> set) {
    return copy(Version.LUCENE_30, set);
  }

  /**
   * Returns a copy of the given set as a {@link CharArraySet}. If the given set
   * is a {@link CharArraySet} the ignoreCase property will be preserved.
   *
   * @param matchVersion
   *          compatibility match version; see the <a href="#version">Version
   *          note</a> above for details.
   * @param set
   *          a set to copy
   * @return a copy of the given set as a {@link CharArraySet}. If the given set
   *         is a {@link CharArraySet} the ignoreCase property will be
   *         preserved.
   */
  public static CharArraySet copy(Version matchVersion, Set<?> set) {
    if (set == null)
      throw new NullPointerException("Given set is null");
    if(set == EMPTY_SET)
      return EMPTY_SET;
    final boolean ignoreCase = set instanceof CharArraySet ? ((CharArraySet) set).ignoreCase
        : false;
    return new CharArraySet(set, ignoreCase);
    return new CharArraySet(matchVersion, set, ignoreCase);
  }

@@ -356,9 +459,9 @@ public class CharArraySet extends AbstractSet<Object> {
   */
  private static final class UnmodifiableCharArraySet extends CharArraySet {

    private UnmodifiableCharArraySet(char[][] entries, boolean ignoreCase,
    private UnmodifiableCharArraySet(Version matchVersion, char[][] entries, boolean ignoreCase,
        int count) {
      super(entries, ignoreCase, count);
      super(matchVersion, entries, ignoreCase, count);
    }

    @Override
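Putting the new constructors and copy() together, the Version argument becomes observable behavior. An illustrative sketch (not part of the commit), using the same Deseret test characters as the tests near the end of this commit:

// --- illustrative sketch, not part of the commit ---
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;

class CharArraySetVersionSketch {
  public static void main(String[] args) {
    // "Abc" + U+1041C (DESERET CAPITAL LETTER GAY), in a case-insensitive set
    String upper = "Abc\ud801\udc1c";
    String lower = "abc\ud801\udc44"; // fully lowercased form, U+10444

    CharArraySet newSet = new CharArraySet(Version.LUCENE_31,
        Arrays.asList(upper), true);
    System.out.println(newSet.contains(lower)); // true: code-point lowercasing

    CharArraySet oldSet = new CharArraySet(Version.LUCENE_30,
        Arrays.asList(upper), true);
    System.out.println(oldSet.contains(lower)); // false: chars lowercased one by one
  }
}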
@@ -32,13 +32,15 @@ import org.apache.lucene.util.Version;
 * <p>You must specify the required {@link Version}
 * compatibility when creating StopAnalyzer:
 * <ul>
 *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
 *        supplementary characters in stopwords
 *   <li> As of 2.9, position increments are preserved
 * </ul>
 */
public final class StopAnalyzer extends Analyzer {
  private final Set<?> stopWords;
  private final boolean enablePositionIncrements;
  private final Version matchVersion;

  /** An unmodifiable set containing some common English words that are not usually useful
      for searching.*/

@@ -52,7 +54,8 @@ public final class StopAnalyzer extends Analyzer {
      "that", "the", "their", "then", "there", "these",
      "they", "this", "to", "was", "will", "with"
    );
    final CharArraySet stopSet = new CharArraySet(stopWords.size(), false);
    final CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT,
        stopWords.size(), false);
    stopSet.addAll(stopWords);
    ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
  }

@@ -63,7 +66,7 @@ public final class StopAnalyzer extends Analyzer {
   */
  public StopAnalyzer(Version matchVersion) {
    stopWords = ENGLISH_STOP_WORDS_SET;
    enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
    this.matchVersion = matchVersion;
  }

  /** Builds an analyzer with the stop words from the given set.

@@ -71,7 +74,7 @@ public final class StopAnalyzer extends Analyzer {
   * @param stopWords Set of stop words */
  public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
    this.stopWords = stopWords;
    enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
    this.matchVersion = matchVersion;
  }

  /** Builds an analyzer with the stop words from the given file.

@@ -80,7 +83,7 @@ public final class StopAnalyzer extends Analyzer {
   * @param stopwordsFile File to load stop words from */
  public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
    stopWords = WordlistLoader.getWordSet(stopwordsFile);
    this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
    this.matchVersion = matchVersion;
  }

  /** Builds an analyzer with the stop words from the given reader.

@@ -89,13 +92,14 @@ public final class StopAnalyzer extends Analyzer {
   * @param stopwords Reader to load stop words from */
  public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
    stopWords = WordlistLoader.getWordSet(stopwords);
    this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
    this.matchVersion = matchVersion;
  }

  /** Filters LowerCaseTokenizer with StopFilter. */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords);
    return new StopFilter(matchVersion,
        new LowerCaseTokenizer(reader), stopWords);
  }

  /** Filters LowerCaseTokenizer with StopFilter. */

@@ -109,7 +113,8 @@ public final class StopAnalyzer extends Analyzer {
    if (streams == null) {
      streams = new SavedStreams();
      streams.source = new LowerCaseTokenizer(reader);
      streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords);
      streams.result = new StopFilter(matchVersion,
          streams.source, stopWords);
      setPreviousTokenStream(streams);
    } else
      streams.source.reset(reader);
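With the cached enablePositionIncrements flag gone, StopAnalyzer simply hands its matchVersion to each StopFilter it creates. A small usage sketch (not part of the commit; field name and input text are arbitrary):

// --- illustrative sketch, not part of the commit ---
import java.io.StringReader;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

class StopAnalyzerSketch {
  public static void main(String[] args) throws Exception {
    StopAnalyzer analyzer = new StopAnalyzer(Version.LUCENE_CURRENT,
        StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "foo"));
    TokenStream ts = analyzer.tokenStream("f", new StringReader("the quick foo fox"));
    TermAttribute term = ts.getAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term()); // prints: quick, fox
    }
  }
}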
@@ -29,8 +29,16 @@ import org.apache.lucene.util.Version;

/**
 * Removes stop words from a token stream.
 *
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StopFilter:
 * <ul>
 *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
 *        supplementary characters in stopwords and position
 *        increments are preserved
 * </ul>
 */
public final class StopFilter extends TokenFilter {

  private final CharArraySet stopWords;

@@ -54,16 +62,46 @@ public final class StopFilter extends TokenFilter {
   * @param input Input TokenStream
   * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
   * @param ignoreCase if true, all words are lower cased first
   * @deprecated use {@link #StopFilter(Version, TokenStream, Set, boolean)} instead
   */
  public StopFilter(boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase)
  {
    this(Version.LUCENE_30, enablePositionIncrements, input, stopWords, ignoreCase);
  }

  /**
   * Construct a token stream filtering the given input. If
   * <code>stopWords</code> is an instance of {@link CharArraySet} (true if
   * <code>makeStopSet()</code> was used to construct the set) it will be
   * directly used and <code>ignoreCase</code> will be ignored since
   * <code>CharArraySet</code> directly controls case sensitivity.
   * <p/>
   * If <code>stopWords</code> is not an instance of {@link CharArraySet}, a new
   * CharArraySet will be constructed and <code>ignoreCase</code> will be used
   * to specify the case sensitivity of that set.
   *
   * @param matchVersion
   *          Lucene version to enable correct Unicode 4.0 behavior in the stop
   *          set if Version > 3.0. See <a href="#version">above</a> for details.
   * @param input
   *          Input TokenStream
   * @param stopWords
   *          A Set of Strings or char[] or any other toString()-able set
   *          representing the stopwords
   * @param ignoreCase
   *          if true, all words are lower cased first
   */
  public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
  {
    this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_29), input, stopWords, ignoreCase);
  }

  /*
   * convenience ctor to enable deprecated ctors to set posInc explicitly
   */
  private StopFilter(Version matchVersion, boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase){
    super(input);
    if (stopWords instanceof CharArraySet) {
      this.stopWords = (CharArraySet)stopWords;
    } else {
      this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
      this.stopWords.addAll(stopWords);
    }
    this.stopWords = CharArraySet.unmodifiableSet(new CharArraySet(matchVersion, stopWords, ignoreCase));
    this.enablePositionIncrements = enablePositionIncrements;
    termAtt = addAttribute(TermAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);

@@ -76,10 +114,29 @@ public final class StopFilter extends TokenFilter {
   * @param enablePositionIncrements true if token positions should record the removed stop words
   * @param in Input stream
   * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
   * @see #makeStopSet(java.lang.String[])
   * @see #makeStopSet(Version, java.lang.String[])
   * @deprecated use {@link #StopFilter(Version, TokenStream, Set)} instead
   */
  public StopFilter(boolean enablePositionIncrements, TokenStream in, Set<?> stopWords) {
    this(enablePositionIncrements, in, stopWords, false);
    this(Version.LUCENE_CURRENT, enablePositionIncrements, in, stopWords, false);
  }

  /**
   * Constructs a filter which removes words from the input TokenStream that are
   * named in the Set.
   *
   * @param matchVersion
   *          Lucene version to enable correct Unicode 4.0 behavior in the stop
   *          set if Version > 3.0. See <a href="#version">above</a> for details.
   * @param in
   *          Input stream
   * @param stopWords
   *          A Set of Strings or char[] or any other toString()-able set
   *          representing the stopwords
   * @see #makeStopSet(Version, java.lang.String[])
   */
  public StopFilter(Version matchVersion, TokenStream in, Set<?> stopWords) {
    this(matchVersion, in, stopWords, false);
  }

  /**

@@ -88,12 +145,27 @@ public final class StopFilter extends TokenFilter {
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   *
   * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   * @deprecated use {@link #makeStopSet(Version, String...)} instead
   */
  public static final Set<Object> makeStopSet(String... stopWords) {
    return makeStopSet(stopWords, false);
    return makeStopSet(Version.LUCENE_30, stopWords, false);
  }

  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords An array of stopwords
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   */
  public static final Set<Object> makeStopSet(Version matchVersion, String... stopWords) {
    return makeStopSet(matchVersion, stopWords, false);
  }

  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.

@@ -101,32 +173,72 @@ public final class StopFilter extends TokenFilter {
   * an Analyzer is constructed.
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @return A Set ({@link CharArraySet}) containing the words
   * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   * @deprecated use {@link #makeStopSet(Version, List)} instead
   */
  public static final Set<Object> makeStopSet(List<?> stopWords) {
    return makeStopSet(stopWords, false);
    return makeStopSet(Version.LUCENE_30, stopWords, false);
  }

  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @return A Set ({@link CharArraySet}) containing the words
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   */
  public static final Set<Object> makeStopSet(Version matchVersion, List<?> stopWords) {
    return makeStopSet(matchVersion, stopWords, false);
  }

  /**
   * Creates a stopword set from the given stopword array.
   * @param stopWords An array of stopwords
   * @param ignoreCase If true, all words are lower cased first.
   * @return a Set containing the words
   * @deprecated use {@link #makeStopSet(Version, String[], boolean)} instead
   */
  public static final Set<Object> makeStopSet(String[] stopWords, boolean ignoreCase) {
    return makeStopSet(Version.LUCENE_30, stopWords, ignoreCase);
  }

  /**
   * Creates a stopword set from the given stopword array.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords An array of stopwords
   * @param ignoreCase If true, all words are lower cased first.
   * @return a Set containing the words
   */
  public static final Set<Object> makeStopSet(String[] stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
  public static final Set<Object> makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
    stopSet.addAll(Arrays.asList(stopWords));
    return stopSet;
  }

  /**
   * Creates a stopword set from the given stopword list.
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @param ignoreCase if true, all words are lower cased first
   * @return A Set ({@link CharArraySet}) containing the words
   * @deprecated use {@link #makeStopSet(Version, List, boolean)} instead
   */
  public static final Set<Object> makeStopSet(List<?> stopWords, boolean ignoreCase){
    return makeStopSet(Version.LUCENE_30, stopWords, ignoreCase);
  }

  /**
   * Creates a stopword set from the given stopword list.
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @param ignoreCase if true, all words are lower cased first
   * @return A Set ({@link CharArraySet}) containing the words
   */
  public static final Set<Object> makeStopSet(List<?> stopWords, boolean ignoreCase){
    CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase);
  public static final Set<Object> makeStopSet(Version matchVersion, List<?> stopWords, boolean ignoreCase){
    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
    stopSet.addAll(stopWords);
    return stopSet;
  }

@@ -157,13 +269,14 @@ public final class StopFilter extends TokenFilter {
   * StopFilter use this method when creating the
   * StopFilter. Prior to 2.9, this returns false. On 2.9
   * or later, it returns true.
   * @deprecated use {@link #StopFilter(Version, TokenStream, Set)} instead
   */
  public static boolean getEnablePositionIncrementsVersionDefault(Version matchVersion) {
    return matchVersion.onOrAfter(Version.LUCENE_29);
  }

  /**
   * @see #setEnablePositionIncrements(boolean).
   * @see #setEnablePositionIncrements(boolean)
   */
  public boolean getEnablePositionIncrements() {
    return enablePositionIncrements;
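One behavioral consequence of the new private constructor above is worth calling out: the stopword set is now always copied into an unmodifiable CharArraySet, so mutating the caller's set after construction no longer affects the filter. A sketch of that (not part of the commit):

// --- illustrative sketch, not part of the commit ---
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

class StopFilterCopySketch {
  public static void main(String[] args) throws Exception {
    Set<String> stops = new HashSet<String>();
    stops.add("is");
    TokenStream ts = new StopFilter(Version.LUCENE_CURRENT,
        new WhitespaceTokenizer(new StringReader("now is the time")), stops, true);
    stops.add("now"); // too late: the ctor already copied the set
    TermAttribute term = ts.getAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term()); // prints: now, the, time
    }
  }
}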
@@ -34,6 +34,8 @@ import java.util.Set;
 * <p>You must specify the required {@link Version}
 * compatibility when creating StandardAnalyzer:
 * <ul>
 *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
 *        supplementary characters in stopwords
 *   <li> As of 2.9, StopFilter preserves position
 *        increments
 *   <li> As of 2.4, Tokens incorrectly identified as acronyms

@@ -47,7 +49,7 @@ public class StandardAnalyzer extends Analyzer {
   * Specifies whether deprecated acronyms should be replaced with HOST type.
   * See {@linkplain https://issues.apache.org/jira/browse/LUCENE-1068}
   */
  private final boolean replaceInvalidAcronym,enableStopPositionIncrements;
  private final boolean replaceInvalidAcronym;

  /** An unmodifiable set containing some common English words that are usually not
      useful for searching. */

@@ -70,7 +72,6 @@ public class StandardAnalyzer extends Analyzer {
  public StandardAnalyzer(Version matchVersion, Set<?> stopWords) {
    stopSet = stopWords;
    setOverridesTokenStreamMethod(StandardAnalyzer.class);
    enableStopPositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
    replaceInvalidAcronym = matchVersion.onOrAfter(Version.LUCENE_24);
    this.matchVersion = matchVersion;
  }

@@ -101,7 +102,7 @@ public class StandardAnalyzer extends Analyzer {
    tokenStream.setMaxTokenLength(maxTokenLength);
    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(enableStopPositionIncrements, result, stopSet);
    result = new StopFilter(matchVersion, result, stopSet);
    return result;
  }

@@ -148,8 +149,7 @@ public class StandardAnalyzer extends Analyzer {
      streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
      streams.filteredTokenStream = new LowerCaseFilter(matchVersion,
          streams.filteredTokenStream);
      streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements,
          streams.filteredTokenStream, stopSet);
      streams.filteredTokenStream = new StopFilter(matchVersion, streams.filteredTokenStream, stopSet);
    } else {
      streams.tokenStream.reset(reader);
    }
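After this change StandardAnalyzer keeps no per-feature flag for stopword handling; the single matchVersion field drives acronym treatment (2.4+), stopword position increments (2.9+), and Unicode 4.0 stopword matching (3.1+). A sketch showing the preserved position increment (not part of the commit; field name and text are arbitrary):

// --- illustrative sketch, not part of the commit ---
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

class StandardAnalyzerVersionSketch {
  public static void main(String[] args) throws Exception {
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    TokenStream ts = analyzer.tokenStream("f", new StringReader("the quick fox"));
    TermAttribute term = ts.getAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncr = ts.getAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      // "quick" arrives with increment 2 because versions >= 2.9 preserve
      // the position of the removed stopword "the"
      System.out.println(term.term() + " +" + posIncr.getPositionIncrement());
    }
  }
}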
@@ -35,7 +35,7 @@ public abstract class CharacterUtils {
   * @return a {@link CharacterUtils} implementation according to the given
   *         {@link Version} instance.
   */
  public static CharacterUtils getInstance(Version matchVersion) {
  public static CharacterUtils getInstance(final Version matchVersion) {
    return matchVersion.onOrAfter(Version.LUCENE_31) ? JAVA_5 : JAVA_4;
  }

@@ -58,7 +58,7 @@ public abstract class CharacterUtils {
   *           - if the value offset is negative or not less than the length of
   *           the char array.
   */
  public abstract int codePointAt(char[] chars, int offset);
  public abstract int codePointAt(final char[] chars, final int offset);

  /**
   * Returns the code point at the given index of the {@link CharSequence}.

@@ -79,21 +79,52 @@ public abstract class CharacterUtils {
   *           - if the value offset is negative or not less than the length of
   *           the character sequence.
   */
  public abstract int codePointAt(CharSequence seq, int offset);
  public abstract int codePointAt(final CharSequence seq, final int offset);

  /**
   * Returns the code point at the given index of the char array where only elements
   * with index less than the limit are used.
   * Depending on the {@link Version} passed to
   * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
   * of {@link Character#codePointAt(char[], int)} as it would have been
   * available on a Java 1.4 JVM or on a later virtual machine version.
   *
   * @param chars
   *          a character array
   * @param offset
   *          the offset to the char values in the chars array to be converted
   * @param limit the index after the last element that should be used to calculate
   *        the codepoint.
   *
   * @return the Unicode code point at the given index
   * @throws NullPointerException
   *           - if the array is null.
   * @throws IndexOutOfBoundsException
   *           - if the value offset is negative or not less than the length of
   *           the char array.
   */
  public abstract int codePointAt(final char[] chars, final int offset, final int limit);

  private static final class Java5CharacterUtils extends CharacterUtils {
    Java5CharacterUtils() {
    };

    @Override
    public final int codePointAt(char[] chars, int offset) {
    public final int codePointAt(final char[] chars, final int offset) {
      return Character.codePointAt(chars, offset);
    }

    @Override
    public int codePointAt(CharSequence seq, int offset) {
    public int codePointAt(final CharSequence seq, final int offset) {
      return Character.codePointAt(seq, offset);
    }

    @Override
    public int codePointAt(final char[] chars, final int offset, final int limit) {
      return Character.codePointAt(chars, offset, limit);
    }

  }

  private static final class Java4CharacterUtils extends CharacterUtils {

@@ -101,14 +132,22 @@ public abstract class CharacterUtils {
    };

    @Override
    public final int codePointAt(char[] chars, int offset) {
    public final int codePointAt(final char[] chars, final int offset) {
      return chars[offset];
    }

    @Override
    public int codePointAt(CharSequence seq, int offset) {
    public int codePointAt(final CharSequence seq, final int offset) {
      return seq.charAt(offset);
    }

    @Override
    public int codePointAt(final char[] chars, final int offset, final int limit) {
      if(offset >= limit)
        throw new IndexOutOfBoundsException("offset must be less than limit");
      return chars[offset];
    }

  }

}
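CharacterUtils is the switch that makes everything above version-dependent: LUCENE_31 and later get real code-point semantics, earlier versions keep the old per-char behavior. An illustrative sketch of the observable difference (not part of the commit):

// --- illustrative sketch, not part of the commit ---
import org.apache.lucene.util.CharacterUtils;
import org.apache.lucene.util.Version;

class CharacterUtilsSketch {
  public static void main(String[] args) {
    char[] deseret = "\ud801\udc1c".toCharArray(); // U+1041C as a surrogate pair

    // Java-5 semantics (>= LUCENE_31): the pair is read as one code point
    CharacterUtils java5 = CharacterUtils.getInstance(Version.LUCENE_31);
    System.out.println(Integer.toHexString(java5.codePointAt(deseret, 0))); // 1041c

    // Java-4 semantics (< LUCENE_31): each UTF-16 unit stands alone
    CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30);
    System.out.println(Integer.toHexString(java4.codePointAt(deseret, 0))); // d801
  }
}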
@ -20,6 +20,7 @@ package org.apache.lucene.analysis;
|
|||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
public class TestCharArraySet extends LuceneTestCase {
|
||||
|
||||
|
@ -33,7 +34,7 @@ public class TestCharArraySet extends LuceneTestCase {
|
|||
|
||||
|
||||
public void testRehash() throws Exception {
|
||||
CharArraySet cas = new CharArraySet(0, true);
|
||||
CharArraySet cas = new CharArraySet(Version.LUCENE_CURRENT, 0, true);
|
||||
for(int i=0;i<TEST_STOP_WORDS.length;i++)
|
||||
cas.add(TEST_STOP_WORDS[i]);
|
||||
assertEquals(TEST_STOP_WORDS.length, cas.size());
|
||||
|
@ -44,7 +45,7 @@ public class TestCharArraySet extends LuceneTestCase {
|
|||
public void testNonZeroOffset() {
|
||||
String[] words={"Hello","World","this","is","a","test"};
|
||||
char[] findme="xthisy".toCharArray();
|
||||
CharArraySet set=new CharArraySet(10,true);
|
||||
CharArraySet set=new CharArraySet(Version.LUCENE_CURRENT, 10,true);
|
||||
set.addAll(Arrays.asList(words));
|
||||
assertTrue(set.contains(findme, 1, 4));
|
||||
assertTrue(set.contains(new String(findme,1,4)));
|
||||
|
@ -56,7 +57,7 @@ public class TestCharArraySet extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testObjectContains() {
|
||||
CharArraySet set = new CharArraySet(10, true);
|
||||
CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 10, true);
|
||||
Integer val = Integer.valueOf(1);
|
||||
set.add(val);
|
||||
assertTrue(set.contains(val));
|
||||
|
@ -68,7 +69,7 @@ public class TestCharArraySet extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testClear(){
|
||||
CharArraySet set=new CharArraySet(10,true);
|
||||
CharArraySet set=new CharArraySet(Version.LUCENE_CURRENT, 10,true);
|
||||
set.addAll(Arrays.asList(TEST_STOP_WORDS));
|
||||
assertEquals("Not all words added", TEST_STOP_WORDS.length, set.size());
|
||||
try{
|
||||
|
@ -81,7 +82,7 @@ public class TestCharArraySet extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testModifyOnUnmodifiable(){
|
||||
CharArraySet set=new CharArraySet(10,true);
|
||||
CharArraySet set=new CharArraySet(Version.LUCENE_CURRENT, 10,true);
|
||||
set.addAll(Arrays.asList(TEST_STOP_WORDS));
|
||||
final int size = set.size();
|
||||
set = CharArraySet.unmodifiableSet(set);
|
||||
|
@ -162,7 +163,7 @@ public class TestCharArraySet extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testUnmodifiableSet(){
|
||||
CharArraySet set=new CharArraySet(10,true);
|
||||
CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 10,true);
|
||||
set.addAll(Arrays.asList(TEST_STOP_WORDS));
|
||||
final int size = set.size();
|
||||
set = CharArraySet.unmodifiableSet(set);
|
||||
|
@ -175,4 +176,129 @@ public class TestCharArraySet extends LuceneTestCase {
|
|||
// expected
|
||||
}
|
||||
}
|
||||
|
||||
public void testSupplementaryChars() {
|
||||
String missing = "Term %s is missing in the set";
|
||||
String falsePos = "Term %s is in the set but shouldn't";
|
||||
// for reference see
|
||||
// http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
|
||||
String[] upperArr = new String[] {"Abc\ud801\udc1c",
|
||||
"\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
|
||||
String[] lowerArr = new String[] {"abc\ud801\udc44",
|
||||
"\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
|
||||
CharArraySet set = new CharArraySet(Version.LUCENE_31, Arrays.asList(TEST_STOP_WORDS), true);
|
||||
for (String upper : upperArr) {
|
||||
set.add(upper);
|
||||
}
|
||||
for (int i = 0; i < upperArr.length; i++) {
|
||||
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
|
||||
assertTrue(String.format(missing, lowerArr[i]), set.contains(lowerArr[i]));
|
||||
}
|
||||
set = new CharArraySet(Version.LUCENE_31, Arrays.asList(TEST_STOP_WORDS), false);
|
||||
for (String upper : upperArr) {
|
||||
set.add(upper);
|
||||
}
|
||||
for (int i = 0; i < upperArr.length; i++) {
|
||||
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
|
||||
assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
|
||||
}
|
||||
}
|
||||
|
||||
public void testSingleHighSurrogate() {
|
||||
String missing = "Term %s is missing in the set";
|
||||
String falsePos = "Term %s is in the set but shouldn't";
|
||||
String[] upperArr = new String[] { "ABC\uD800", "ABC\uD800EfG",
|
||||
"\uD800EfG", "\uD800\ud801\udc1cB" };
|
||||
|
||||
String[] lowerArr = new String[] { "abc\uD800", "abc\uD800efg",
|
||||
"\uD800efg", "\uD800\ud801\udc44b" };
|
||||
CharArraySet set = new CharArraySet(Version.LUCENE_31, Arrays
|
||||
.asList(TEST_STOP_WORDS), true);
|
||||
for (String upper : upperArr) {
|
||||
set.add(upper);
|
||||
}
|
||||
for (int i = 0; i < upperArr.length; i++) {
|
||||
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
|
||||
assertTrue(String.format(missing, lowerArr[i]), set.contains(lowerArr[i]));
|
||||
}
|
||||
set = new CharArraySet(Version.LUCENE_31, Arrays.asList(TEST_STOP_WORDS),
|
||||
false);
|
||||
for (String upper : upperArr) {
|
||||
set.add(upper);
|
||||
}
|
||||
for (int i = 0; i < upperArr.length; i++) {
|
||||
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
|
||||
assertFalse(String.format(falsePos, upperArr[i]), set
|
||||
.contains(lowerArr[i]));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated remove this test when lucene 3.0 "broken unicode 4" support is
|
||||
* no longer needed.
|
||||
*/
|
||||
public void testSupplementaryCharsBWCompat() {
|
||||
String missing = "Term %s is missing in the set";
|
||||
String falsePos = "Term %s is in the set but shouldn't";
|
||||
// for reference see
|
||||
// http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
|
||||
String[] upperArr = new String[] {"Abc\ud801\udc1c",
|
||||
"\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
|
||||
String[] lowerArr = new String[] {"abc\ud801\udc44",
|
||||
"\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
|
||||
CharArraySet set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS), true);
|
||||
for (String upper : upperArr) {
|
||||
set.add(upper);
|
||||
}
|
||||
for (int i = 0; i < upperArr.length; i++) {
|
||||
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
|
||||
assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
|
||||
}
|
||||
set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS), false);
|
||||
for (String upper : upperArr) {
|
||||
set.add(upper);
|
||||
}
|
||||
for (int i = 0; i < upperArr.length; i++) {
|
||||
assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
|
||||
assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
|
||||
}
|
||||
}

  /**
   * @deprecated remove this test when lucene 3.0 "broken unicode 4" support is
   *             no longer needed.
   */
  public void testSingleHighSurrogateBWCompat() {
    String missing = "Term %s is missing in the set";
    String falsePos = "Term %s is in the set but shouldn't";
    String[] upperArr = new String[] { "ABC\uD800", "ABC\uD800EfG",
        "\uD800EfG", "\uD800\ud801\udc1cB" };

    String[] lowerArr = new String[] { "abc\uD800", "abc\uD800efg",
        "\uD800efg", "\uD800\ud801\udc44b" };
    CharArraySet set = new CharArraySet(Version.LUCENE_30, Arrays
        .asList(TEST_STOP_WORDS), true);
    for (String upper : upperArr) {
      set.add(upper);
    }
    for (int i = 0; i < upperArr.length; i++) {
      assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
      // pre-3.1 lowercasing is applied per UTF-16 char: BMP letters fold fine
      // and lone surrogates pass through unchanged, so only the last entry,
      // whose case difference sits in a supplementary character, must miss
      if (i == lowerArr.length - 1)
        assertFalse(String.format(falsePos, lowerArr[i]), set
            .contains(lowerArr[i]));
      else
        assertTrue(String.format(missing, lowerArr[i]), set
            .contains(lowerArr[i]));
    }
    set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS),
        false);
    for (String upper : upperArr) {
      set.add(upper);
    }
    for (int i = 0; i < upperArr.length; i++) {
      assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
      assertFalse(String.format(falsePos, lowerArr[i]), set
          .contains(lowerArr[i]));
    }
  }
}
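
Taken together, these tests pin down one contract: a case-insensitive CharArraySet built with Version.LUCENE_31 or later lowercases by code point, so supplementary characters match across case, while an older Version lowercases per UTF-16 char and such terms do not match. A minimal sketch of that contract, using only the constructors and Version constants exercised above (the class name here is made up for illustration):

import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;

public class CharArraySetVersionDemo {
  public static void main(String[] args) {
    // DESERET CAPITAL LETTER GAY (U+1041C), a supplementary character,
    // and its lowercase form (U+10444), both written as surrogate pairs
    String upper = "Abc\ud801\udc1c";
    String lower = "abc\ud801\udc44";

    // Version >= 3.1: case-insensitive matching lowercases by code point
    CharArraySet unicode4 = new CharArraySet(Version.LUCENE_31,
        Arrays.asList(upper), true);
    System.out.println(unicode4.contains(lower)); // expected: true

    // Version < 3.1: per-char lowercasing misses the surrogate pair
    CharArraySet legacy = new CharArraySet(Version.LUCENE_30,
        Arrays.asList(upper), true);
    System.out.println(legacy.contains(lower)); // expected: false
  }
}
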
@@ -19,6 +19,7 @@ package org.apache.lucene.analysis;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.English;
+import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -37,7 +38,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
   public void testExactCase() throws IOException {
     StringReader reader = new StringReader("Now is The Time");
     Set<String> stopWords = new HashSet(Arrays.asList("is", "the", "Time"));
-    TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopWords, false);
+    TokenStream stream = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopWords, false);
     final TermAttribute termAtt = stream.getAttribute(TermAttribute.class);
     assertTrue(stream.incrementToken());
     assertEquals("Now", termAtt.term());
@@ -49,7 +50,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
   public void testIgnoreCase() throws IOException {
     StringReader reader = new StringReader("Now is The Time");
     Set<String> stopWords = new HashSet(Arrays.asList( "is", "the", "Time" ));
-    TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopWords, true);
+    TokenStream stream = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopWords, true);
     final TermAttribute termAtt = stream.getAttribute(TermAttribute.class);
     assertTrue(stream.incrementToken());
     assertEquals("Now", termAtt.term());
@@ -59,8 +60,8 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
   public void testStopFilt() throws IOException {
     StringReader reader = new StringReader("Now is The Time");
     String[] stopWords = new String[] { "is", "the", "Time" };
-    Set stopSet = StopFilter.makeStopSet(stopWords);
-    TokenStream stream = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet);
+    Set stopSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords);
+    TokenStream stream = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopSet);
     final TermAttribute termAtt = stream.getAttribute(TermAttribute.class);
     assertTrue(stream.incrementToken());
     assertEquals("Now", termAtt.term());
@@ -83,14 +84,14 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
     log(sb.toString());
     String stopWords[] = (String[]) a.toArray(new String[0]);
     for (int i=0; i<a.size(); i++) log("Stop: "+stopWords[i]);
-    Set stopSet = StopFilter.makeStopSet(stopWords);
+    Set stopSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords);
     // with increments
     StringReader reader = new StringReader(sb.toString());
-    StopFilter stpf = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet);
+    StopFilter stpf = new StopFilter(Version.LUCENE_24, new WhitespaceTokenizer(reader), stopSet);
     doTestStopPositons(stpf,true);
     // without increments
     reader = new StringReader(sb.toString());
-    stpf = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet);
+    stpf = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopSet);
     doTestStopPositons(stpf,false);
     // with increments, concatenating two stop filters
     ArrayList a0 = new ArrayList();
@@ -106,12 +107,12 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
     for (int i=0; i<a0.size(); i++) log("Stop0: "+stopWords0[i]);
     String stopWords1[] = (String[]) a1.toArray(new String[0]);
     for (int i=0; i<a1.size(); i++) log("Stop1: "+stopWords1[i]);
-    Set stopSet0 = StopFilter.makeStopSet(stopWords0);
-    Set stopSet1 = StopFilter.makeStopSet(stopWords1);
+    Set stopSet0 = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords0);
+    Set stopSet1 = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords1);
     reader = new StringReader(sb.toString());
-    StopFilter stpf0 = new StopFilter(false, new WhitespaceTokenizer(reader), stopSet0); // first part of the set
+    StopFilter stpf0 = new StopFilter(Version.LUCENE_CURRENT, new WhitespaceTokenizer(reader), stopSet0); // first part of the set
     stpf0.setEnablePositionIncrements(true);
-    StopFilter stpf01 = new StopFilter(false, stpf0, stopSet1); // two stop filters concatenated!
+    StopFilter stpf01 = new StopFilter(Version.LUCENE_CURRENT, stpf0, stopSet1); // two stop filters concatenated!
     doTestStopPositons(stpf01,true);
   }
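
Every hunk in TestStopFilter is the same migration: the boolean enablePositionIncrements constructor argument and the version-less StopFilter.makeStopSet overload give way to a leading Version parameter, from which the filter derives its defaults (position increments are preserved for 2.9 and later, per the StopFilter javadoc). A minimal sketch of the migrated call pattern, assuming only the constructors exercised by these tests (the wrapper class and method names are illustrative):

import java.io.StringReader;
import java.util.Set;

import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

public class StopFilterMigrationDemo {
  public static TokenStream stopFiltered(String text) {
    // before: StopFilter.makeStopSet(stopWords) and new StopFilter(false, ...)
    // after:  the leading Version supplies the filter's defaults
    Set<?> stopSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, "is", "the");
    return new StopFilter(Version.LUCENE_CURRENT,
        new WhitespaceTokenizer(new StringReader(text)), stopSet);
  }
}
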
@@ -956,7 +956,7 @@ public class TestQueryParser extends LocalizedTestCase {
   }
 
   public void testStopwords() throws Exception {
-    QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet("the", "foo")));
+    QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "foo")));
     Query result = qp.parse("a:the OR a:foo");
     assertNotNull("result is null and it shouldn't be", result);
     assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
@@ -972,7 +972,7 @@ public class TestQueryParser extends LocalizedTestCase {
   }
 
   public void testPositionIncrement() throws Exception {
-    QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet("the", "in", "are", "this")));
+    QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "a", new StopAnalyzer(Version.LUCENE_CURRENT, StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "in", "are", "this")));
     qp.setEnablePositionIncrements(true);
     String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
     // 0 2 5 7 8
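
The query-parser tests only need the same makeStopSet change threaded through StopAnalyzer. A sketch of the updated construction, using the constructors shown in the two hunks above (the wrapper class is illustrative):

import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

public class StopwordQueryDemo {
  public static Query parseWithStopwords(String query) throws ParseException {
    // every makeStopSet call now carries the Version up front
    QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "a",
        new StopAnalyzer(Version.LUCENE_CURRENT,
            StopFilter.makeStopSet(Version.LUCENE_CURRENT, "the", "foo")));
    return qp.parse(query); // stopwords are dropped from the parsed query
  }
}
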
@@ -22,7 +22,6 @@ import java.io.IOException;
 import java.io.StringReader;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.Iterator;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.StopFilter;
@@ -232,7 +231,8 @@ public class TestPositionIncrement extends LuceneTestCase {
     @Override
     public TokenStream tokenStream(String fieldName, Reader reader) {
       TokenStream ts = a.tokenStream(fieldName,reader);
-      return new StopFilter(enablePositionIncrements, ts, new CharArraySet(Collections.singleton("stop"), true));
+      return new StopFilter(enablePositionIncrements?Version.LUCENE_CURRENT:Version.LUCENE_24, ts,
+          new CharArraySet(Version.LUCENE_CURRENT, Collections.singleton("stop"), true));
     }
   }
 
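
This hunk is the one non-mechanical edit in the file: with the boolean flag gone, the test maps the old flag onto a Version whose StopFilter default matches it, LUCENE_CURRENT to keep position increments and LUCENE_24 to drop them. That mapping, written out as a helper (hypothetical name; it assumes the 2.9 default noted above):

import org.apache.lucene.util.Version;

final class StopFilterVersions {
  // Hypothetical helper: translate the removed boolean flag into a Version
  // whose StopFilter default matches it (increments preserved from 2.9 on).
  static Version forPositionIncrements(boolean enablePositionIncrements) {
    return enablePositionIncrements ? Version.LUCENE_CURRENT : Version.LUCENE_24;
  }
}
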
@@ -275,12 +275,12 @@ public class TestPositionIncrement extends LuceneTestCase {
     Spans pspans = snq.getSpans(is.getIndexReader());
     while (pspans.next()) {
       //System.out.println(pspans.doc() + " - " + pspans.start() + " - "+ pspans.end());
-      Collection payloads = pspans.getPayload();
+      Collection<byte[]> payloads = pspans.getPayload();
       sawZero |= pspans.start() == 0;
-      for (Iterator it = payloads.iterator(); it.hasNext();) {
+      for (@SuppressWarnings("unused") byte[] bytes : payloads) {
         count++;
-        it.next();
-        //System.out.println(new String((byte[]) it.next()));
+        //System.out.println(new String(bytes));
+
       }
     }
     assertEquals(5, count);
@@ -302,10 +302,10 @@ public class TestPositionIncrement extends LuceneTestCase {
 
     sawZero = false;
     PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader());
-    Collection pls = psu.getPayloadsForQuery(snq);
+    Collection<byte[]> pls = psu.getPayloadsForQuery(snq);
     count = pls.size();
-    for (Iterator it = pls.iterator(); it.hasNext();) {
-      String s = new String((byte[]) it.next());
+    for (byte[] bytes : pls) {
+      String s = new String(bytes);
       //System.out.println(s);
       sawZero |= s.equals("pos: 0");
     }
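
Both payload hunks are the same Java 5 cleanup: a raw Collection walked with an Iterator and a (byte[]) cast becomes a typed Collection<byte[]> with a for-each loop, which is why the java.util.Iterator import is removed above. The pattern in isolation (standalone sketch; the payload contents mimic the test's "pos: N" strings):

import java.util.Arrays;
import java.util.Collection;

public class PayloadLoopDemo {
  public static void main(String[] args) {
    Collection<byte[]> payloads = Arrays.asList(
        "pos: 0".getBytes(), "pos: 5".getBytes());
    boolean sawZero = false;
    for (byte[] bytes : payloads) {   // no Iterator, no (byte[]) cast needed
      sawZero |= new String(bytes).equals("pos: 0");
    }
    System.out.println(sawZero);      // expected: true
  }
}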