mirror of https://github.com/apache/lucene.git
LUCENE-4963: Deprecate broken TokenFilter options.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1479148 13f79535-47bb-0310-9956-ffa450edef68
parent ec7317a8d6
commit 8a7f2b6cc4
@@ -59,6 +59,16 @@ Changes in backwards compatibility policy
  completely refactored to allow for a better implementation of TimSort.
  (Adrien Grand, Uwe Schindler, Dawid Weiss)

* LUCENE-4963: Some TokenFilter options that generate broken TokenStreams have
  been deprecated: updateOffsets=true on TrimFilter and
  enablePositionIncrements=false on all classes that inherit from
  FilteringTokenFilter: JapanesePartOfSpeechStopFilter, KeepWordFilter,
  LengthFilter, StopFilter and TypeTokenFilter. (Adrien Grand)

* LUCENE-4963: In order not to take position increments into account in
  suggesters, you now need to call setPreservePositionIncrements(false) instead
  of configuring the token filters to not increment positions. (Adrien Grand)

Bug Fixes

* LUCENE-4935: CustomScoreQuery wrongly applied its query boost twice
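The two LUCENE-4963 entries above amount to the following migration. The sketch below is illustrative only (it is not part of the commit, and the class and variable names are invented); it assumes the 4.4-style constructors shown in the diffs that follow and the TokenStreamToAutomaton setter added later in this commit.

// Illustrative migration sketch (not part of this commit); names are made up.
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class PositionIncrementMigrationSketch {
  public static void main(String[] args) throws Exception {
    CharArraySet stopWords = StopFilter.makeStopSet(Version.LUCENE_44, "is", "the");
    WhitespaceTokenizer tokenizer =
        new WhitespaceTokenizer(Version.LUCENE_44, new StringReader("blue is the sky"));

    // 4.4 style: the stop filter always leaves position-increment gaps where tokens were removed.
    TokenStream stream = new StopFilter(Version.LUCENE_44, tokenizer, stopWords);

    // Consumers that do not want those gaps (e.g. suggesters) now opt out themselves
    // instead of constructing the filter with enablePositionIncrements=false:
    TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();
    tsta.setPreservePositionIncrements(false);
    // tsta.toAutomaton(stream) would then ignore the gaps left by the stop filter.
  }
}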
@@ -57,7 +57,7 @@ public final class StopFilter extends FilteringTokenFilter {
   * @see #makeStopSet(Version, java.lang.String...)
   */
  public StopFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) {
    super(true, in);
    super(matchVersion, in);
    this.stopWords = stopWords;
  }
@@ -51,7 +51,7 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
    stopWordFiles = get(args, "words");
    format = get(args, "format");
    ignoreCase = getBoolean(args, "ignoreCase", false);
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
@@ -17,12 +17,12 @@ package org.apache.lucene.analysis.core;
 * limitations under the License.
 */

import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;

import java.io.IOException;
import java.util.Set;
import org.apache.lucene.util.Version;

/**
 * Removes tokens whose types appear in a set of blocked types from a token stream.
@@ -33,14 +33,41 @@ public final class TypeTokenFilter extends FilteringTokenFilter {
  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
  private final boolean useWhiteList;

  public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
    super(enablePositionIncrements, input);
  /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
  @Deprecated
  public TypeTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
    super(version, enablePositionIncrements, input);
    this.stopTypes = stopTypes;
    this.useWhiteList = useWhiteList;
  }

  public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) {
    this(enablePositionIncrements, input, stopTypes, false);
  /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
  @Deprecated
  public TypeTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) {
    this(version, enablePositionIncrements, input, stopTypes, false);
  }

  /**
   * Create a new {@link TypeTokenFilter}.
   * @param version the Lucene match version
   * @param input the {@link TokenStream} to consume
   * @param stopTypes the types to filter
   * @param useWhiteList if true, then tokens whose type is in stopTypes will
   *                     be kept, otherwise they will be filtered out
   */
  public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
    super(version, input);
    this.stopTypes = stopTypes;
    this.useWhiteList = useWhiteList;
  }

  /**
   * Create a new {@link TypeTokenFilter} that filters tokens out
   * (useWhiteList=false).
   * @see #TypeTokenFilter(Version, TokenStream, Set, boolean)
   */
  public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes) {
    this(version, input, stopTypes, false);
  }

  /**
@@ -35,7 +35,7 @@ import java.util.Set;
 * <analyzer>
 *   <tokenizer class="solr.StandardTokenizerFactory"/>
 *   <filter class="solr.TypeTokenFilterFactory" types="stoptypes.txt"
 *           enablePositionIncrements="true" useWhitelist="false"/>
 *           useWhitelist="false"/>
 * </analyzer>
 * </fieldType></pre>
 */
@@ -49,7 +49,7 @@ public class TypeTokenFilterFactory extends TokenFilterFactory implements Resour
  public TypeTokenFilterFactory(Map<String,String> args) {
    super(args);
    stopTypesFiles = require(args, "types");
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
    useWhitelist = getBoolean(args, "useWhitelist", false);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -78,6 +78,8 @@ public class TypeTokenFilterFactory extends TokenFilterFactory implements Resour
  @Override
  public TokenStream create(TokenStream input) {
    return new TypeTokenFilter(enablePositionIncrements, input, stopTypes, useWhitelist);
    @SuppressWarnings("deprecation")
    final TokenStream filter = new TypeTokenFilter(luceneMatchVersion, enablePositionIncrements, input, stopTypes, useWhitelist);
    return filter;
  }
}
@@ -138,7 +138,9 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
    if (!matchVersion.onOrAfter(Version.LUCENE_44)) {
      s.setEnablePositionIncrements(false);
    }
    result = s;
    result = new ElisionFilter(result, DEFAULT_ARTICLES);
    result = new IrishLowerCaseFilter(result);
@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

/**
 * A TokenFilter that only keeps tokens with text contained in the
@@ -32,10 +33,23 @@ public final class KeepWordFilter extends FilteringTokenFilter {
  private final CharArraySet words;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /** The words set passed to this constructor will be directly used by this filter
   * and should not be modified, */
  public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
    super(enablePositionIncrements, in);
  /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
  @Deprecated
  public KeepWordFilter(Version version, boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
    super(version, enablePositionIncrements, in);
    this.words = words;
  }

  /**
   * Create a new {@link KeepWordFilter}.
   * <p><b>NOTE</b>: The words set passed to this constructor will be directly
   * used by this filter and should not be modified.
   * @param version the Lucene match version
   * @param in the {@link TokenStream} to consume
   * @param words the words to keep
   */
  public KeepWordFilter(Version version, TokenStream in, CharArraySet words) {
    super(version, in);
    this.words = words;
  }
@@ -32,7 +32,7 @@ import java.io.IOException;
 * <fieldType name="text_keepword" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *     <filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false" enablePositionIncrements="false"/>
 *     <filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false"/>
 *   </analyzer>
 * </fieldType></pre>
 */
@@ -48,7 +48,7 @@ public class KeepWordFilterFactory extends TokenFilterFactory implements Resourc
    assureMatchVersion();
    wordFiles = get(args, "words");
    ignoreCase = getBoolean(args, "ignoreCase", false);
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
@@ -76,6 +76,12 @@ public class KeepWordFilterFactory extends TokenFilterFactory implements Resourc
  @Override
  public TokenStream create(TokenStream input) {
    // if the set is null, it means it was empty
    return words == null ? input : new KeepWordFilter(enablePositionIncrements, input, words);
    if (words == null) {
      return input;
    } else {
      @SuppressWarnings("deprecation")
      final TokenStream filter = new KeepWordFilter(luceneMatchVersion, enablePositionIncrements, input, words);
      return filter;
    }
  }
}
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

/**
 * Removes words that are too long or too short from the stream.
@@ -34,12 +35,25 @@ public final class LengthFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
  @Deprecated
  public LengthFilter(Version version, boolean enablePositionIncrements, TokenStream in, int min, int max) {
    super(version, enablePositionIncrements, in);
    this.min = min;
    this.max = max;
  }

  /**
   * Build a filter that removes words that are too long or too
   * short from the text.
   * Create a new {@link LengthFilter}. This will filter out tokens whose
   * {@link CharTermAttribute} is either too short ({@link CharTermAttribute#length()}
   * < min) or too long ({@link CharTermAttribute#length()} > max).
   * @param version the Lucene match version
   * @param in the {@link TokenStream} to consume
   * @param min the minimum length
   * @param max the maximum length
   */
  public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
    super(enablePositionIncrements, in);
  public LengthFilter(Version version, TokenStream in, int min, int max) {
    super(version, in);
    this.min = min;
    this.max = max;
  }
@@ -17,18 +17,18 @@ package org.apache.lucene.analysis.miscellaneous;
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

import java.util.Map;

/**
 * Factory for {@link LengthFilter}.
 * <pre class="prettyprint">
 * <fieldType name="text_lngth" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *     <filter class="solr.LengthFilterFactory" min="0" max="1" enablePositionIncrements="false"/>
 *     <filter class="solr.LengthFilterFactory" min="0" max="1" />
 *   </analyzer>
 * </fieldType></pre>
 */
@@ -44,7 +44,7 @@ public class LengthFilterFactory extends TokenFilterFactory {
    super(args);
    min = requireInt(args, MIN_KEY);
    max = requireInt(args, MAX_KEY);
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
@@ -52,6 +52,8 @@ public class LengthFilterFactory extends TokenFilterFactory {
  @Override
  public LengthFilter create(TokenStream input) {
    return new LengthFilter(enablePositionIncrements, input,min,max);
    @SuppressWarnings("deprecation")
    final LengthFilter filter = new LengthFilter(luceneMatchVersion, enablePositionIncrements, input,min,max);
    return filter;
  }
}
@@ -21,11 +21,14 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

import java.io.IOException;

/**
 * Trims leading and trailing whitespace from Tokens in the stream.
 * <p>As of Lucene 4.4, this filter does not support updateOffsets=true anymore
 * as it can lead to broken token streams.
 */
public final class TrimFilter extends TokenFilter {
@@ -33,12 +36,27 @@ public final class TrimFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  public TrimFilter(TokenStream in, boolean updateOffsets) {
  /**
   * Create a new {@link TrimFilter}.
   * @param version the Lucene match version
   * @param in the stream to consume
   * @param updateOffsets whether to update offsets
   * @deprecated Offset updates are not supported anymore as of Lucene 4.4.
   */
  @Deprecated
  public TrimFilter(Version version, TokenStream in, boolean updateOffsets) {
    super(in);
    if (updateOffsets && version.onOrAfter(Version.LUCENE_44)) {
      throw new IllegalArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
    }
    this.updateOffsets = updateOffsets;
  }

  /** Create a new {@link TrimFilter} on top of <code>in</code>. */
  public TrimFilter(Version version, TokenStream in) {
    this(version, in, false);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false;
@@ -55,11 +73,10 @@ public final class TrimFilter extends TokenFilter {
    int endOff = 0;

    // eat the first characters
    //QUESTION: Should we use Character.isWhitespace() instead?
    for (start = 0; start < len && termBuffer[start] <= ' '; start++) {
    for (start = 0; start < len && Character.isWhitespace(termBuffer[start]); start++) {
    }
    // eat the end characters
    for (end = len; end >= start && termBuffer[end - 1] <= ' '; end--) {
    for (end = len; end >= start && Character.isWhitespace(termBuffer[end - 1]); end--) {
      endOff++;
    }
    if (start > 0 || end < len) {
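To illustrate the guard added to the deprecated TrimFilter constructor above, here is a hypothetical usage sketch (not part of this commit; "tokenizer" stands for any TokenStream): offset updates are rejected for 4.4+ match versions, while the new two-argument constructor never touches offsets.

// Hypothetical usage sketch based on the constructors shown above.
TokenStream trimmed = new TrimFilter(Version.LUCENE_44, tokenizer);        // ok, offsets left untouched
TokenStream legacy  = new TrimFilter(Version.LUCENE_43, tokenizer, true);  // ok, pre-4.4 behavior
TokenStream broken  = new TrimFilter(Version.LUCENE_44, tokenizer, true);  // throws IllegalArgumentException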
@@ -29,7 +29,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
 * <fieldType name="text_trm" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.NGramTokenizerFactory"/>
 *     <filter class="solr.TrimFilterFactory" updateOffsets="false"/>
 *     <filter class="solr.TrimFilterFactory" />
 *   </analyzer>
 * </fieldType></pre>
 *
@@ -50,6 +50,8 @@ public class TrimFilterFactory extends TokenFilterFactory {
  @Override
  public TrimFilter create(TokenStream input) {
    return new TrimFilter(input, updateOffsets);
    @SuppressWarnings("deprecation")
    final TrimFilter filter = new TrimFilter(luceneMatchVersion, input, updateOffsets);
    return filter;
  }
}
@@ -73,7 +73,7 @@ public final class NGramTokenFilter extends TokenFilter {
   * @param maxGram the largest n-gram to generate
   */
  public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
    super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE));
    super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
    this.version = version;
    if (minGram < 1) {
      throw new IllegalArgumentException("minGram must be greater than zero");
@@ -22,24 +22,54 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

/**
 * Abstract base class for TokenFilters that may remove tokens.
 * You have to implement {@link #accept} and return a boolean if the current
 * token should be preserved. {@link #incrementToken} uses this method
 * to decide if a token should be passed to the caller.
 * <p><a name="version" />As of Lucene 4.4, an {@link IllegalArgumentException}
 * is thrown when trying to disable position increments when filtering terms.
 */
public abstract class FilteringTokenFilter extends TokenFilter {

  private static void checkPositionIncrement(Version version, boolean enablePositionIncrements) {
    if (!enablePositionIncrements && version.onOrAfter(Version.LUCENE_44)) {
      throw new IllegalArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams");
    }
  }

  protected final Version version;
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
  private boolean first = true; // only used when not preserving gaps
  private boolean first = true;

  public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
    super(input);
  /**
   * Create a new {@link FilteringTokenFilter}.
   * @param version the Lucene match <a href="#version">version</a>
   * @param enablePositionIncrements whether to increment position increments when filtering out terms
   * @param input the input to consume
   * @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4
   */
  @Deprecated
  public FilteringTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input){
    this(version, input);
    checkPositionIncrement(version, enablePositionIncrements);
    this.enablePositionIncrements = enablePositionIncrements;
  }

  /**
   * Create a new {@link FilteringTokenFilter}.
   * @param version the Lucene match version
   * @param in the {@link TokenStream} to consume
   */
  public FilteringTokenFilter(Version version, TokenStream in) {
    super(in);
    this.version = version;
    this.enablePositionIncrements = true;
  }

  /** Override this method and return if the current input token should be returned by {@link #incrementToken}. */
  protected abstract boolean accept() throws IOException;
@@ -102,8 +132,11 @@ public abstract class FilteringTokenFilter extends TokenFilter {
   * <p> <b>NOTE</b>: be sure to also
   * set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if
   * you use QueryParser to create queries.
   * @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4
   */
  @Deprecated
  public void setEnablePositionIncrements(boolean enable) {
    checkPositionIncrement(version, enable);
    this.enablePositionIncrements = enable;
  }
}
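As a concrete illustration of the new FilteringTokenFilter contract (a hypothetical example, not part of this commit), a subclass now passes only the match version to the base class and lets it restore the position increments of whatever it drops:

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.util.Version;

/** Drops zero-length tokens; skipped positions are added back by FilteringTokenFilter. */
public final class DropEmptyTokenFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public DropEmptyTokenFilter(Version version, TokenStream in) {
    super(version, in); // no enablePositionIncrements flag anymore
  }

  @Override
  protected boolean accept() throws IOException {
    return termAtt.length() > 0;
  }
}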
@@ -161,8 +161,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
      // startOffset thats > its endOffset
      // (see LUCENE-3738 for a list of other offenders here)
      // broken!
      Lucene43NGramTokenizer.class,
      // broken!
      EdgeNGramTokenizer.class,
      // broken!
      EdgeNGramTokenFilter.class,
@@ -182,55 +180,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
  private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<Constructor<?>, Predicate<Object[]>>();
  static {
    try {
      brokenOffsetsConstructors.put(
          TrimFilter.class.getConstructor(TokenStream.class, boolean.class),
          new Predicate<Object[]>() {
            @Override
            public boolean apply(Object[] args) {
              assert args.length == 2;
              return (Boolean) args[1]; // args are broken if updateOffsets is true
            }
          });
      brokenOffsetsConstructors.put(
          TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class, boolean.class),
          new Predicate<Object[]>() {
            @Override
            public boolean apply(Object[] args) {
              assert args.length == 4;
              // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
              return !(Boolean) args[0];
            }
          });
      brokenOffsetsConstructors.put(
          TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class),
          new Predicate<Object[]>() {
            @Override
            public boolean apply(Object[] args) {
              assert args.length == 3;
              // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
              return !(Boolean) args[0];
            }
          });
      brokenOffsetsConstructors.put(
          LengthFilter.class.getConstructor(boolean.class, TokenStream.class, int.class, int.class),
          new Predicate<Object[]>() {
            @Override
            public boolean apply(Object[] args) {
              assert args.length == 4;
              // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
              return !(Boolean) args[0];
            }
          });
      brokenOffsetsConstructors.put(
          KeepWordFilter.class.getConstructor(boolean.class, TokenStream.class, CharArraySet.class),
          new Predicate<Object[]>() {
            @Override
            public boolean apply(Object[] args) {
              assert args.length == 3;
              // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
              return !(Boolean) args[0];
            }
          });
    for (Class<?> c : Arrays.<Class<?>>asList(
        ReversePathHierarchyTokenizer.class,
        PathHierarchyTokenizer.class,
@@ -75,7 +75,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
    doTestStopPositons(stpf,true);
    // without increments
    reader = new StringReader(sb.toString());
    stpf = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
    stpf = new StopFilter(Version.LUCENE_43, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
    doTestStopPositons(stpf,false);
    // with increments, concatenating two stop filters
    ArrayList<String> a0 = new ArrayList<String>();
@@ -166,7 +166,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filter = new MockSynonymFilter(tokenizer);
        StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        StopFilter stopfilter = new StopFilter(Version.LUCENE_43, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        stopfilter.setEnablePositionIncrements(false);
        return new TokenStreamComponents(tokenizer, stopfilter);
      }
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.English;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.StringReader;
@@ -36,7 +37,7 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
  public void testTypeFilter() throws IOException {
    StringReader reader = new StringReader("121 is palindrome, while 123 is not");
    Set<String> stopTypes = asSet("<NUM>");
    TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes);
    TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes);
    assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"});
  }
@@ -59,12 +60,12 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
    // with increments
    StringReader reader = new StringReader(sb.toString());
    TypeTokenFilter typeTokenFilter = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
    TypeTokenFilter typeTokenFilter = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
    testPositons(typeTokenFilter);

    // without increments
    reader = new StringReader(sb.toString());
    typeTokenFilter = new TypeTokenFilter(false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
    typeTokenFilter = new TypeTokenFilter(Version.LUCENE_43, false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
    testPositons(typeTokenFilter);

  }
@@ -87,7 +88,7 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
  public void testTypeFilterWhitelist() throws IOException {
    StringReader reader = new StringReader("121 is palindrome, while 123 is not");
    Set<String> stopTypes = Collections.singleton("<NUM>");
    TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes, true);
    TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes, true);
    assertTokenStreamContents(stream, new String[]{"121", "123"});
  }
@@ -50,7 +50,7 @@ public class TestTypeTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
  public void testCreationWithBlackList() throws Exception {
    TokenFilterFactory factory = tokenFilterFactory("Type",
        "types", "stoptypes-1.txt, stoptypes-2.txt",
        "enablePositionIncrements", "false");
        "enablePositionIncrements", "true");
    NumericTokenStream input = new NumericTokenStream();
    input.setIntValue(123);
    factory.create(input);
@@ -59,7 +59,7 @@ public class TestTypeTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
  public void testCreationWithWhiteList() throws Exception {
    TokenFilterFactory factory = tokenFilterFactory("Type",
        "types", "stoptypes-1.txt, stoptypes-2.txt",
        "enablePositionIncrements", "false",
        "enablePositionIncrements", "true",
        "useWhitelist", "true");
    NumericTokenStream input = new NumericTokenStream();
    input.setIntValue(123);
@@ -61,7 +61,7 @@ public class TestIrishAnalyzer extends BaseTokenStreamTestCase {
    Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "n-athair",
        new String[] { "athair" },
        new int[] { 1 });
        new int[] { 2 });
  }

  /** blast some random strings through the analyzer */
@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

/** Test {@link KeepWordFilter} */
public class TestKeepWordFilter extends BaseTokenStreamTestCase {
@@ -42,22 +43,22 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
    // Test Stopwords
    TokenStream stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
    stream = new KeepWordFilter(TEST_VERSION_CURRENT, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });

    // Now force case
    stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
    stream = new KeepWordFilter(TEST_VERSION_CURRENT, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
    assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });

    // Test Stopwords
    stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
    stream = new KeepWordFilter(Version.LUCENE_43, false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });

    // Now force case
    stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
    stream = new KeepWordFilter(Version.LUCENE_43, false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
    assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
  }
@@ -72,7 +73,7 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenStream stream = new KeepWordFilter(true, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true));
        TokenStream stream = new KeepWordFilter(TEST_VERSION_CURRENT, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true));
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
@@ -19,6 +19,7 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.Reader;
@@ -29,7 +30,7 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
  public void testFilterNoPosIncr() throws Exception {
    TokenStream stream = new MockTokenizer(
        new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
    LengthFilter filter = new LengthFilter(false, stream, 2, 6);
    LengthFilter filter = new LengthFilter(Version.LUCENE_43, false, stream, 2, 6);
    assertTokenStreamContents(filter,
        new String[]{"short", "ab", "foo"},
        new int[]{1, 1, 1}
@@ -39,7 +40,7 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
  public void testFilterWithPosIncr() throws Exception {
    TokenStream stream = new MockTokenizer(
        new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
    LengthFilter filter = new LengthFilter(true, stream, 2, 6);
    LengthFilter filter = new LengthFilter(TEST_VERSION_CURRENT, stream, 2, 6);
    assertTokenStreamContents(filter,
        new String[]{"short", "ab", "foo"},
        new int[]{1, 4, 2}
@@ -51,7 +52,7 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new LengthFilter(true, tokenizer, 0, 5));
        return new TokenStreamComponents(tokenizer, new LengthFilter(TEST_VERSION_CURRENT, tokenizer, 0, 5));
      }
    };
    checkOneTermReuse(a, "", "");
@@ -22,6 +22,8 @@ import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.util.Version;

public class TestLengthFilterFactory extends BaseTokenStreamFactoryTestCase {
@@ -29,8 +31,10 @@ public class TestLengthFilterFactory extends BaseTokenStreamFactoryTestCase {
    Reader reader = new StringReader("foo foobar super-duper-trooper");
    TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    stream = tokenFilterFactory("Length",
        Version.LUCENE_43, new ClasspathResourceLoader(getClass()),
        "min", "4",
        "max", "10").create(stream);
        "max", "10",
        "enablePositionIncrements", "false").create(stream);
    assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 });
  }
@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.Version;

/**
 */
@@ -46,7 +47,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
        new Token(ccc, 0, ccc.length, 11, 15),
        new Token(whitespace, 0, whitespace.length, 16, 20),
        new Token(empty, 0, empty.length, 21, 21));
    ts = new TrimFilter(ts, false);
    ts = new TrimFilter(TEST_VERSION_CURRENT, ts, false);

    assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
@@ -59,7 +60,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
        new Token(b, 0, b.length, 0, 2),
        new Token(ccc, 0, ccc.length, 0, 3),
        new Token(whitespace, 0, whitespace.length, 0, 3));
    ts = new TrimFilter(ts, true);
    ts = new TrimFilter(Version.LUCENE_43, ts, true);

    assertTokenStreamContents(ts,
        new String[] { "a", "b", "c", "" },
@@ -120,7 +121,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
        return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, false));
        return new TokenStreamComponents(tokenizer, new TrimFilter(Version.LUCENE_43, tokenizer, true));
      }
    };
    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
@@ -130,7 +131,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
        return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, true));
        return new TokenStreamComponents(tokenizer, new TrimFilter(TEST_VERSION_CURRENT, tokenizer, false));
      }
    };
    checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
@@ -141,7 +142,9 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, random().nextBoolean()));
        final boolean updateOffsets = random().nextBoolean();
        final Version version = updateOffsets ? Version.LUCENE_43 : TEST_VERSION_CURRENT;
        return new TokenStreamComponents(tokenizer, new TrimFilter(version, tokenizer, updateOffsets));
      }
    };
    checkOneTermReuse(a, "", "");
@@ -306,7 +306,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
            tokenizer, StandardAnalyzer.STOP_WORDS_SET);
        filter.setEnablePositionIncrements(true);
        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
      }
    };
@@ -89,7 +89,7 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new JapaneseTokenizer(reader, userDict, true, mode);
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
    stream = new JapanesePartOfSpeechStopFilter(matchVersion, stream, stoptags);
    stream = new CJKWidthFilter(stream);
    stream = new StopFilter(matchVersion, stream, stopwords);
    stream = new JapaneseKatakanaStemFilter(stream);
@@ -22,6 +22,7 @@ import java.util.Set;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

/**
 * Removes tokens that match a set of part-of-speech tags.
@@ -30,8 +31,21 @@ public final class JapanesePartOfSpeechStopFilter extends FilteringTokenFilter {
  private final Set<String> stopTags;
  private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);

  public JapanesePartOfSpeechStopFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
    super(enablePositionIncrements, input);
  /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
  @Deprecated
  public JapanesePartOfSpeechStopFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
    super(version, enablePositionIncrements, input);
    this.stopTags = stopTags;
  }

  /**
   * Create a new {@link JapanesePartOfSpeechStopFilter}.
   * @param version the Lucene match version
   * @param input the {@link TokenStream} to consume
   * @param stopTags the part-of-speech tags that should be removed
   */
  public JapanesePartOfSpeechStopFilter(Version version, TokenStream input, Set<String> stopTags) {
    super(version, input);
    this.stopTags = stopTags;
  }
@@ -50,7 +50,7 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
  public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) {
    super(args);
    stopTagFiles = get(args, "tags");
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
@@ -72,6 +72,12 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
  @Override
  public TokenStream create(TokenStream stream) {
    // if stoptags is null, it means the file is empty
    return stopTags == null ? stream : new JapanesePartOfSpeechStopFilter(enablePositionIncrements, stream, stopTags);
    if (stopTags != null) {
      @SuppressWarnings("deprecation")
      final TokenStream filter = new JapanesePartOfSpeechStopFilter(luceneMatchVersion, enablePositionIncrements, stream, stopTags);
      return filter;
    } else {
      return stream;
    }
  }
}
@@ -17,12 +17,8 @@ package org.apache.lucene.analysis;
 * limitations under the License.
 */

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
@@ -43,8 +39,16 @@ import org.apache.lucene.util.automaton.Transition;
 * @lucene.experimental */
public class TokenStreamToAutomaton {

  private boolean preservePositionIncrements;

  /** Sole constructor. */
  public TokenStreamToAutomaton() {
    this.preservePositionIncrements = true;
  }

  /** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */
  public void setPreservePositionIncrements(boolean enablePositionIncrements) {
    this.preservePositionIncrements = enablePositionIncrements;
  }

  private static class Position implements RollingBuffer.Resettable {
@@ -108,6 +112,9 @@ public class TokenStreamToAutomaton {
    int maxOffset = 0;
    while (in.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      if (!preservePositionIncrements && posInc > 1) {
        posInc = 1;
      }
      assert pos > -1 || posInc > 0;

      if (posInc > 0) {
@@ -282,18 +282,18 @@ and proximity searches (though sentence identification is not provided by Lucene
<p>
If the selected analyzer filters the stop words "is" and "the", then for a document
containing the string "blue is the sky", only the tokens "blue", "sky" are indexed,
with position("sky") = 1 + position("blue"). Now, a phrase query "blue is the sky"
with position("sky") = 3 + position("blue"). Now, a phrase query "blue is the sky"
would find that document, because the same analyzer filters the same stop words from
that query. But also the phrase query "blue sky" would find that document.
that query. But the phrase query "blue sky" would not find that document because the
position increment between "blue" and "sky" is only 1.
</p>
<p>
If this behavior does not fit the application needs, a modified analyzer can
be used, that would increment further the positions of tokens following a
removed stop word, using
{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
This can be done with something like the following (note, however, that
StopFilter natively includes this capability by subclassing
FilteringTokenFilter}:
If this behavior does not fit the application needs, the query parser needs to be
configured to not take position increments into account when generating phrase queries.
</p>
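For example (an illustrative sketch, not an excerpt from the documentation above; the field name and analyzer are placeholders), the classic query parser exposes this as its own position-increment switch:

// Hypothetical usage sketch.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

public class PhraseQuerySketch {
  static Query blueSky(Analyzer analyzer) throws Exception {
    QueryParser parser = new QueryParser(Version.LUCENE_44, "body", analyzer);
    // Ignore the gaps left by removed stop words when building phrase queries,
    // so "blue sky" can still match a document containing "blue is the sky".
    parser.setEnablePositionIncrements(false);
    return parser.parse("\"blue sky\"");
  }
}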
<p>
Note that a StopFilter MUST increment the position increment in order not to generate corrupt
tokenstream graphs. Here is the logic used by StopFilter to increment positions when filtering out tokens:
</p>
<PRE class="prettyprint">
public TokenStream tokenStream(final String fieldName, Reader reader) {
@@ -308,7 +308,7 @@ and proximity searches (though sentence identification is not provided by Lucene
      boolean hasNext = ts.incrementToken();
      if (hasNext) {
        if (stopWords.contains(termAtt.toString())) {
          extraIncrement++; // filter this word
          extraIncrement += posIncrAtt.getPositionIncrement(); // filter this word
          continue;
        }
        if (extraIncrement>0) {
@@ -322,11 +322,6 @@ and proximity searches (though sentence identification is not provided by Lucene
    return res;
  }
</PRE>
<p>
Now, with this modified analyzer, the phrase query "blue sky" would find that document.
But note that this is yet not a perfect solution, because any phrase query "blue w1 w2 sky"
where both w1 and w2 are stop words would match that document.
</p>
<p>
A few more use cases for modifying position increments are:
</p>
@@ -338,6 +333,72 @@ and proximity searches (though sentence identification is not provided by Lucene
As a result, all synonyms of a token would be considered to appear in exactly the
same position as that token, and so they would be seen by phrase and proximity searches.</li>
</ol>

<h3>Token Position Length</h3>
<p>
By default, all tokens created by Analyzers and Tokenizers have a
{@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute#getPositionLength() position length} of one.
This means that the token occupies a single position. This attribute is not indexed
and thus not taken into account for positional queries, but is used by e.g. suggesters.
</p>
<p>
The main use case for position lengths is multi-word synonyms. With single-word
synonyms, setting the position increment to 0 is enough to denote the fact that two
words are synonyms, for example:
</p>
<table>
<tr><td>Term</td><td>red</td><td>magenta</td></tr>
<tr><td>Position increment</td><td>1</td><td>0</td></tr>
</table>
<p>
Given that position(magenta) = 0 + position(red), they are at the same position, so anything
working with analyzers will return the exact same result if you replace "magenta" with "red"
in the input. However, multi-word synonyms are more tricky. Let's say that you want to build
a TokenStream where "IBM" is a synonym of "International Business Machines". Position increments
are not enough anymore:
</p>
<table>
<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
</table>
<p>
The problem with this token stream is that "IBM" is at the same position as "International"
although it is a synonym with "International Business Machines" as a whole. Setting
the position increment of "Business" and "Machines" to 0 wouldn't help as it would mean
that "International" is a synonym of "Business". The only way to solve this issue is to
make "IBM" span across 3 positions; this is where position lengths come to the rescue.
</p>
<table>
<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
<tr><td>Position length</td><td>3</td><td>1</td><td>1</td><td>1</td></tr>
</table>
<p>
This new attribute makes it clear that "IBM" and "International Business Machines" start and end
at the same positions.
</p>
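As a rough sketch of how a filter would produce the table above (this is not an excerpt from any Lucene filter; the enclosing filter class is omitted), the injected single-token synonym combines the usual position increment with a position length that spans the multi-word form:

// Fields of a hypothetical synonym-injecting TokenFilter (enclosing class not shown):
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);

// In incrementToken(), when emitting the injected synonym "IBM":
termAtt.setEmpty().append("IBM");
posIncAtt.setPositionIncrement(1); // advances to a new position, as in the table above
posLenAtt.setPositionLength(3);    // spans the positions of "International Business Machines"

// On the next call, when emitting the original token "International":
termAtt.setEmpty().append("International");
posIncAtt.setPositionIncrement(0); // same start position as "IBM"
posLenAtt.setPositionLength(1);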
<a name="corrupt" />
<h3>How to not write corrupt token streams</h3>
<p>
There are a few rules to observe when writing custom Tokenizers and TokenFilters:
</p>
<ul>
<li>The first position increment must be > 0.</li>
<li>Positions must not go backward.</li>
<li>Tokens that have the same start position must have the same start offset.</li>
<li>Tokens that have the same end position (taking into account the position length) must have the same end offset.</li>
</ul>
<p>
Although these rules might seem easy to follow, problems can quickly happen when chaining
badly implemented filters that play with positions and offsets, such as synonym or n-gram
filters. Here are good practices for writing correct filters:
</p>
<ul>
<li>Token filters should not modify offsets. If you feel that your filter would need to modify offsets, then it should probably be implemented as a tokenizer.</li>
<li>Token filters should not insert positions. If a filter needs to add tokens, then they should all have a position increment of 0.</li>
<li>When they remove tokens, token filters should increment the position increment of the following token.</li>
<li>Token filters should preserve position lengths.</li>
</ul>

<h2>TokenStream API</h2>
<p>
"Flexible Indexing" summarizes the effort of making the Lucene indexer
@@ -382,6 +443,10 @@ and proximity searches (though sentence identification is not provided by Lucene
<td>{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute}</td>
<td>See above for detailed information about position increment.</td>
</tr>
<tr>
<td>{@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute}</td>
<td>The number of positions occupied by a token.</td>
</tr>
<tr>
<td>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}</td>
<td>The payload that a Token can optionally have.</td>
@@ -532,20 +597,26 @@ public final class LengthFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * Build a filter that removes words that are too long or too
   * short from the text.
   * Create a new LengthFilter. This will filter out tokens whose
   * CharTermAttribute is either too short
   * (< min) or too long (> max).
   * @param version the Lucene match version
   * @param in the TokenStream to consume
   * @param min the minimum length
   * @param max the maximum length
   */
  public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
    super(enablePositionIncrements, in);
  public LengthFilter(Version version, TokenStream in, int min, int max) {
    super(version, in);
    this.min = min;
    this.max = max;
  }

  {@literal @Override}
  public boolean accept() throws IOException {
  public boolean accept() {
    final int len = termAtt.length();
    return (len >= min && len <= max);
    return (len >= min && len <= max);
  }
}
</pre>
<p>
@@ -573,19 +644,20 @@ public final class LengthFilter extends FilteringTokenFilter {
public abstract class FilteringTokenFilter extends TokenFilter {

  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!

  public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
    super(input);
    this.enablePositionIncrements = enablePositionIncrements;
  /**
   * Create a new FilteringTokenFilter.
   * @param in the TokenStream to consume
   */
  public FilteringTokenFilter(Version version, TokenStream in) {
    super(in);
  }

  /** Override this method and return if the current input token should be returned by {@literal {@link #incrementToken}}. */
  /** Override this method and return if the current input token should be returned by incrementToken. */
  protected abstract boolean accept() throws IOException;

  {@literal @Override}
  public final boolean incrementToken() throws IOException {
    if (enablePositionIncrements) {
      int skippedPositions = 0;
      while (input.incrementToken()) {
        if (accept()) {
@@ -596,43 +668,15 @@ public abstract class FilteringTokenFilter extends TokenFilter {
        }
        skippedPositions += posIncrAtt.getPositionIncrement();
      }
    } else {
      while (input.incrementToken()) {
        if (accept()) {
          return true;
        }
      }
    }
    // reached EOS -- return false
    return false;
  }

  /**
   * {@literal @see #setEnablePositionIncrements(boolean)}
   */
  public boolean getEnablePositionIncrements() {
    return enablePositionIncrements;
  {@literal @Override}
  public void reset() throws IOException {
    super.reset();
  }

  /**
   * If <code>true</code>, this TokenFilter will preserve
   * positions of the incoming tokens (ie, accumulate and
   * set position increments of the removed tokens).
   * Generally, <code>true</code> is best as it does not
   * lose information (positions of the original tokens)
   * during indexing.
   *
   * <p> When set, when a token is stopped
   * (omitted), the position increment of the following
   * token is incremented.
   *
   * <p> <b>NOTE</b>: be sure to also
   * set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if
   * you use QueryParser to create queries.
   */
  public void setEnablePositionIncrements(boolean enable) {
    this.enablePositionIncrements = enable;
  }
}
</pre>
@@ -64,16 +64,10 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
  /** Test a configuration that behaves a lot like StopAnalyzer */
  public void testStop() throws Exception {
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
    assertAnalyzesTo(a, "the quick brown a fox",
        new String[] { "quick", "brown", "fox" },
        new int[] { 2, 1, 2 });

    // disable positions
    a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, false);
    assertAnalyzesTo(a, "the quick brown a fox",
        new String[] { "quick", "brown", "fox" },
        new int[] { 1, 1, 1 });
  }

  /** Test a configuration that behaves a lot like KeepWordFilter */
@@ -83,7 +77,7 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
        BasicOperations.complement(
            Automaton.union(
                Arrays.asList(BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar")))));
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords, true);
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
    assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
        new String[] { "foo", "bar", "bar", "foo" },
        new int[] { 2, 2, 1, 2 });
@@ -92,7 +86,7 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
  /** Test a configuration that behaves a lot like LengthFilter */
  public void testLength() throws Exception {
    CharacterRunAutomaton length5 = new CharacterRunAutomaton(new RegExp(".{5,}").toAutomaton());
-    Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, length5, true);
+    Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, length5);
    assertAnalyzesTo(a, "ok toolong fine notfine",
                     new String[] { "ok", "fine" },
                     new int[] { 1, 2 });
@@ -213,7 +213,7 @@ public class TestTermVectorsWriter extends LuceneTestCase {
  public void testEndOffsetPositionStopFilter() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
-        TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
+        TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
@@ -222,7 +222,7 @@ public class TestPhraseQuery extends LuceneTestCase {

  public void testPhraseQueryWithStopAnalyzer() throws Exception {
    Directory directory = newDirectory();
-    Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, false);
+    Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
        newIndexWriterConfig( Version.LUCENE_40, stopAnalyzer));
    Document doc = new Document();

@@ -241,16 +241,6 @@ public class TestPhraseQuery extends LuceneTestCase {
    assertEquals(1, hits.length);
    QueryUtils.check(random(), query,searcher);

-    // StopAnalyzer as of 2.4 does not leave "holes", so this matches.
-    query = new PhraseQuery();
-    query.add(new Term("field", "words"));
-    query.add(new Term("field", "here"));
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(1, hits.length);
-    QueryUtils.check(random(), query,searcher);
-
    reader.close();
    directory.close();
  }
@@ -37,7 +37,7 @@ public class TestSpanFirstQuery extends LuceneTestCase {

    // mimic StopAnalyzer
    CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
-    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true);
+    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);

    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
    Document doc = new Document();
@@ -60,7 +60,7 @@ public class TestSpansAdvanced extends LuceneTestCase {
    mDirectory = newDirectory();
    final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
        newIndexWriterConfig(TEST_VERSION_CURRENT,
-            new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))
+            new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))
            .setMergePolicy(newLogMergePolicy()).setSimilarity(new DefaultSimilarity()));
    addDocument(writer, "1", "I think it should work.");
    addDocument(writer, "2", "I think it should work.");
@@ -49,7 +49,7 @@ public class TestSpansAdvanced2 extends TestSpansAdvanced {
    // create test index
    final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(),
-            MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))
+            MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))
            .setOpenMode(OpenMode.APPEND).setMergePolicy(newLogMergePolicy())
            .setSimilarity(new DefaultSimilarity()));
    addDocument(writer, "A", "Should we, could we, would we?");
@ -247,7 +247,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
|||
*/
|
||||
private String highlightField(Query query, String fieldName, String text)
|
||||
throws IOException, InvalidTokenOffsetsException {
|
||||
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)
|
||||
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)
|
||||
.tokenStream(fieldName, new StringReader(text));
|
||||
// Assuming "<B>", "</B>" used to highlight
|
||||
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
|
||||
|
@ -1308,7 +1308,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
|||
}
|
||||
|
||||
public void testMaxSizeHighlight() throws Exception {
|
||||
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
||||
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
// we disable MockTokenizer checks because we will forcefully limit the
|
||||
// tokenstream and call end() before incrementToken() returns false.
|
||||
analyzer.setEnableChecks(false);
|
||||
|
@ -1343,7 +1343,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
|||
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("stoppedtoken"));
|
||||
// we disable MockTokenizer checks because we will forcefully limit the
|
||||
// tokenstream and call end() before incrementToken() returns false.
|
||||
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true);
|
||||
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
|
||||
analyzer.setEnableChecks(false);
|
||||
TermQuery query = new TermQuery(new Term("data", goodWord));
|
||||
|
||||
|
@ -1394,7 +1394,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
|||
Highlighter hg = getHighlighter(query, "text", fm);
|
||||
hg.setTextFragmenter(new NullFragmenter());
|
||||
hg.setMaxDocCharsToAnalyze(36);
|
||||
String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true), "text", text);
|
||||
String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords), "text", text);
|
||||
assertTrue(
|
||||
"Matched text should contain remainder of text after highlighted query ",
|
||||
match.endsWith("in it"));
|
||||
|
@ -1411,7 +1411,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
|||
numHighlights = 0;
|
||||
// test to show how rewritten query can still be used
|
||||
searcher = newSearcher(reader);
|
||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
|
||||
BooleanQuery query = new BooleanQuery();
|
||||
query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD);
|
||||
|
@ -1875,11 +1875,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
|||
super.setUp();
|
||||
|
||||
a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
|
||||
analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
||||
analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
dir = newDirectory();
|
||||
ramDir = newDirectory();
|
||||
IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(
|
||||
TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
|
||||
TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
|
||||
for (String text : texts) {
|
||||
addDoc(writer, text);
|
||||
}
|
||||
|
|
|
@ -89,7 +89,7 @@ public class HighlightCustomQueryTest extends LuceneTestCase {
|
|||
private String highlightField(Query query, String fieldName,
|
||||
String text) throws IOException, InvalidTokenOffsetsException {
|
||||
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE,
|
||||
true, MockTokenFilter.ENGLISH_STOPSET, true).tokenStream(fieldName,
|
||||
true, MockTokenFilter.ENGLISH_STOPSET).tokenStream(fieldName,
|
||||
new StringReader(text));
|
||||
// Assuming "<B>", "</B>" used to highlight
|
||||
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
|
||||
|
|
|
@ -247,7 +247,7 @@ public class FastVectorHighlighterTest extends LuceneTestCase {
|
|||
|
||||
public void testCommonTermsQueryHighlightTest() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
|
||||
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
|
||||
FieldType type = new FieldType(TextField.TYPE_STORED);
|
||||
type.setStoreTermVectorOffsets(true);
|
||||
type.setStoreTermVectorPositions(true);
|
||||
|
|
|
@ -259,7 +259,7 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
|
|||
private Analyzer randomAnalyzer() {
|
||||
switch(random().nextInt(4)) {
|
||||
case 0: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||
case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
||||
case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
case 2: return new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
|
|
|
@ -546,7 +546,7 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
|
|||
|
||||
public void testBoost() throws Exception {
|
||||
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
|
||||
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true);
|
||||
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
|
||||
|
||||
PrecedenceQueryParser qp = new PrecedenceQueryParser();
|
||||
qp.setAnalyzer(oneStopAnalyzer);
|
||||
|
@ -561,7 +561,7 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
|
|||
q = qp.parse("\"on\"^1.0", "field");
|
||||
assertNotNull(q);
|
||||
|
||||
q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)).parse("the^3",
|
||||
q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)).parse("the^3",
|
||||
"field");
|
||||
assertNotNull(q);
|
||||
}
|
||||
|
|
|
@ -946,7 +946,7 @@ public class TestQPHelper extends LuceneTestCase {
|
|||
|
||||
public void testBoost() throws Exception {
|
||||
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
|
||||
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true);
|
||||
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
|
||||
StandardQueryParser qp = new StandardQueryParser();
|
||||
qp.setAnalyzer(oneStopAnalyzer);
|
||||
|
||||
|
@ -962,7 +962,7 @@ public class TestQPHelper extends LuceneTestCase {
|
|||
assertNotNull(q);
|
||||
|
||||
StandardQueryParser qp2 = new StandardQueryParser();
|
||||
qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true));
|
||||
qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
|
||||
|
||||
q = qp2.parse("the^3", "field");
|
||||
// "the" is a stop word so the result is an empty query:
|
||||
|
@ -1179,7 +1179,7 @@ public class TestQPHelper extends LuceneTestCase {
|
|||
public void testStopwords() throws Exception {
|
||||
StandardQueryParser qp = new StandardQueryParser();
|
||||
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
|
||||
qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true));
|
||||
qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
|
||||
|
||||
Query result = qp.parse("a:the OR a:foo", "a");
|
||||
assertNotNull("result is null and it shouldn't be", result);
|
||||
|
@ -1203,7 +1203,7 @@ public class TestQPHelper extends LuceneTestCase {
|
|||
public void testPositionIncrement() throws Exception {
|
||||
StandardQueryParser qp = new StandardQueryParser();
|
||||
qp.setAnalyzer(
|
||||
new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true));
|
||||
new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
|
||||
|
||||
qp.setEnablePositionIncrements(true);
|
||||
|
||||
|
|
|
@ -852,7 +852,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
|||
public void testBoost()
|
||||
throws Exception {
|
||||
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
|
||||
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true);
|
||||
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
|
||||
CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);
|
||||
Query q = getQuery("on^1.0",qp);
|
||||
assertNotNull(q);
|
||||
|
@ -865,7 +865,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
|||
q = getQuery("\"on\"^1.0",qp);
|
||||
assertNotNull(q);
|
||||
|
||||
Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
||||
Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
CommonQueryParserConfiguration qp2 = getParserConfig(a2);
|
||||
q = getQuery("the^3", qp2);
|
||||
// "the" is a stop word so the result is an empty query:
|
||||
|
@ -1007,7 +1007,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
|||
|
||||
public void testStopwords() throws Exception {
|
||||
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
|
||||
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true));
|
||||
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
|
||||
Query result = getQuery("field:the OR field:foo",qp);
|
||||
assertNotNull("result is null and it shouldn't be", result);
|
||||
assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
|
||||
|
@ -1023,7 +1023,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testPositionIncrement() throws Exception {
|
||||
CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true));
|
||||
CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
|
||||
qp.setEnablePositionIncrements(true);
|
||||
String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
|
||||
// 0 2 5 7 8
|
||||
|
@ -1070,7 +1070,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
|||
// "match"
|
||||
public void testPositionIncrements() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
||||
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, a));
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("field", "the wizard of ozzy", Field.Store.NO));
|
||||
|
@ -1185,7 +1185,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testPhraseQueryToString() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
CommonQueryParserConfiguration qp = getParserConfig(analyzer);
|
||||
qp.setEnablePositionIncrements(true);
|
||||
PhraseQuery q = (PhraseQuery)getQuery("\"this hi this is a test is\"", qp);
|
||||
|
@ -1235,26 +1235,13 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
|||
CharacterRunAutomaton stopStopList =
|
||||
new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
|
||||
|
||||
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList, false));
|
||||
|
||||
PhraseQuery phraseQuery = new PhraseQuery();
|
||||
phraseQuery.add(new Term("field", "1"));
|
||||
phraseQuery.add(new Term("field", "2"));
|
||||
|
||||
assertEquals(phraseQuery, getQuery("\"1 2\"",qp));
|
||||
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
|
||||
|
||||
qp.setEnablePositionIncrements(true);
|
||||
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
|
||||
|
||||
qp.setEnablePositionIncrements(false);
|
||||
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
|
||||
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
|
||||
|
||||
qp = getParserConfig(
|
||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList, true));
|
||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
|
||||
qp.setEnablePositionIncrements(true);
|
||||
|
||||
phraseQuery = new PhraseQuery();
|
||||
PhraseQuery phraseQuery = new PhraseQuery();
|
||||
phraseQuery.add(new Term("field", "1"));
|
||||
phraseQuery.add(new Term("field", "2"), 2);
|
||||
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
|
||||
|
|
|
@ -58,7 +58,7 @@ public class TestParser extends LuceneTestCase {
|
|||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
// TODO: rewrite test (this needs to set QueryParser.enablePositionIncrements, too, for work with CURRENT):
|
||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
|
||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||
//initialize the parser
|
||||
builder = new CorePlusExtensionsParser("contents", analyzer);
|
||||
|
||||
|
|
|
@@ -75,9 +75,9 @@ import org.apache.lucene.util.fst.Util;
 * example, if you use an analyzer removing stop words,
 * then the partial text "ghost chr..." could see the
 * suggestion "The Ghost of Christmas Past". Note that
- * your {@code StopFilter} instance must NOT preserve
- * position increments for this example to work, so you should call
- * {@code setEnablePositionIncrements(false)} on it.
+ * position increments MUST NOT be preserved for this example
+ * to work, so you should call
+ * {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}.
 *
 * <p>
 * If SynonymFilter is used to map wifi and wireless network to
@@ -185,6 +185,9 @@ public class AnalyzingSuggester extends Lookup {

  private static final int PAYLOAD_SEP = '\u001f';

+  /** Whether position holes should appear in the automaton. */
+  private boolean preservePositionIncrements;
+
  /**
   * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
   * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |

@@ -241,6 +244,13 @@ public class AnalyzingSuggester extends Lookup {
      throw new IllegalArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
    }
    this.maxGraphExpansions = maxGraphExpansions;
+    preservePositionIncrements = true;
  }

+  /** Whether to take position holes (position increment > 1) into account when
+   *  building the automaton, <code>true</code> by default. */
+  public void setPreservePositionIncrements(boolean preservePositionIncrements) {
+    this.preservePositionIncrements = preservePositionIncrements;
+  }
+
  /** Returns byte size of the underlying FST. */
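A minimal usage sketch of the setter added above, closely modeled on the AnalyzingSuggesterTest change later in this commit; it reuses that test's TermFreq/TermFreqArrayIterator helpers and MockAnalyzer stop set, and assumes it runs inside a LuceneTestCase test method:

// The analyzer drops English stop words; the suggester, not the token filter,
// now decides whether the resulting position holes matter.
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer);
suggester.setPreservePositionIncrements(false);   // ignore holes left by removed stop words
suggester.build(new TermFreqArrayIterator(new TermFreq[] {
    new TermFreq("the ghost of christmas past", 50)
}));

// Because holes are ignored, a prefix that skips the leading stop word still matches.
List<LookupResult> results = suggester.lookup("ghost of chris", false, 1);
assertEquals("the ghost of christmas past", results.get(0).key.toString());
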
@@ -327,13 +337,16 @@ public class AnalyzingSuggester extends Lookup {
  }

  TokenStreamToAutomaton getTokenStreamToAutomaton() {
+    final TokenStreamToAutomaton tsta;
    if (preserveSep) {
-      return new EscapingTokenStreamToAutomaton();
+      tsta = new EscapingTokenStreamToAutomaton();
    } else {
      // When we're not preserving sep, we don't steal 0xff
      // byte, so we don't need to do any escaping:
-      return new TokenStreamToAutomaton();
+      tsta = new TokenStreamToAutomaton();
    }
+    tsta.setPreservePositionIncrements(preservePositionIncrements);
+    return tsta;
  }

  private static class AnalyzingComparator implements Comparator<BytesRef> {
@@ -164,8 +164,9 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
      new TermFreq("the ghost of christmas past", 50),
    };

-    Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
+    Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
    AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
+    suggester.setPreservePositionIncrements(false);
    suggester.build(new TermFreqArrayIterator(keys));

    List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);

@@ -187,7 +188,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
  }

  public void testEmpty() throws Exception {
-    Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
+    Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
    AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
    suggester.build(new TermFreqArrayIterator(new TermFreq[0]));
@@ -153,8 +153,9 @@ public class FuzzySuggesterTest extends LuceneTestCase {
      new TermFreq("the ghost of christmas past", 50),
    };

-    Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
+    Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
    FuzzySuggester suggester = new FuzzySuggester(standard);
+    suggester.setPreservePositionIncrements(false);
    suggester.build(new TermFreqArrayIterator(keys));

    List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis;
 * limitations under the License.
 */

-import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

@@ -46,7 +45,6 @@ public final class MockAnalyzer extends Analyzer {
  private final CharacterRunAutomaton runAutomaton;
  private final boolean lowerCase;
  private final CharacterRunAutomaton filter;
-  private final boolean enablePositionIncrements;
  private int positionIncrementGap;
  private final Random random;
  private Map<String,Integer> previousMappings = new HashMap<String,Integer>();

@@ -60,30 +58,28 @@ public final class MockAnalyzer extends Analyzer {
   * @param runAutomaton DFA describing how tokenization should happen (e.g. [a-zA-Z]+)
   * @param lowerCase true if the tokenizer should lowercase terms
   * @param filter DFA describing how terms should be filtered (set of stopwords, etc)
-   * @param enablePositionIncrements true if position increments should reflect filtered terms.
   */
-  public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
+  public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter) {
    super(new PerFieldReuseStrategy());
    // TODO: this should be solved in a different way; Random should not be shared (!).
    this.random = new Random(random.nextLong());
    this.runAutomaton = runAutomaton;
    this.lowerCase = lowerCase;
    this.filter = filter;
-    this.enablePositionIncrements = enablePositionIncrements;
  }

  /**
-   * Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean)
+   * Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton)
   * MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false}).
   */
  public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
-    this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true);
+    this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET);
  }

  /**
   * Create a Whitespace-lowercasing analyzer with no stopwords removal.
   * <p>
-   * Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean)
+   * Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton)
   * MockAnalyzer(random, MockTokenizer.WHITESPACE, true, MockTokenFilter.EMPTY_STOPSET, false}).
   */
  public MockAnalyzer(Random random) {

@@ -95,7 +91,6 @@ public final class MockAnalyzer extends Analyzer {
    MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength);
    tokenizer.setEnableChecks(enableChecks);
    MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
-    filt.setEnablePositionIncrements(enablePositionIncrements);
    return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
  }
@@ -55,7 +55,6 @@ public final class MockTokenFilter extends TokenFilter {
      makeString("with"))));

  private final CharacterRunAutomaton filter;
-  private boolean enablePositionIncrements = true;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

@@ -80,9 +79,7 @@ public final class MockTokenFilter extends TokenFilter {
    int skippedPositions = 0;
    while (input.incrementToken()) {
      if (!filter.run(termAtt.buffer(), 0, termAtt.length())) {
-        if (enablePositionIncrements) {
          posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
-        }
        return true;
      }
      skippedPositions += posIncrAtt.getPositionIncrement();

@@ -90,20 +87,4 @@ public final class MockTokenFilter extends TokenFilter {
    // reached EOS -- return false
    return false;
  }

-  /**
-   * @see #setEnablePositionIncrements(boolean)
-   */
-  public boolean getEnablePositionIncrements() {
-    return enablePositionIncrements;
-  }
-
-  /**
-   * If <code>true</code>, this Filter will preserve
-   * positions of the incoming tokens (ie, accumulate and
-   * set position increments of the removed stop tokens).
-   */
-  public void setEnablePositionIncrements(boolean enable) {
-    this.enablePositionIncrements = enable;
-  }
}
@@ -59,7 +59,7 @@ public abstract class SearchEquivalenceTestBase extends LuceneTestCase {
    directory = newDirectory();
    stopword = "" + randomChar();
    CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.makeString(stopword));
-    analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset, true);
+    analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset);
    RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer);
    Document doc = new Document();
    Field id = new StringField("id", "", Field.Store.NO);
@@ -87,8 +87,8 @@ public class DisMaxRequestHandlerTest extends SolrTestCaseJ4 {
            req("cool stuff")
            ,"//*[@numFound='3']"
            ,"//result/doc[1]/int[@name='id'][.='42']"
-            ,"//result/doc[2]/int[@name='id'][.='666']"
-            ,"//result/doc[3]/int[@name='id'][.='8675309']"
+            ,"//result/doc[2]/int[@name='id'][.='8675309']"
+            ,"//result/doc[3]/int[@name='id'][.='666']"
            );

    assertQ("multi qf",
@@ -323,16 +323,16 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
    tokenList = valueResult.get("org.apache.lucene.analysis.core.StopFilter");
    assertNotNull("Expecting the 'StopFilter' to be applied on the index for the 'text' field", tokenList);
    assertEquals("Expecting 4 tokens after stop word removal", 4, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, new int[]{2,2,2,1}, null, false));
-    assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 2, new int[]{3,3,3,2}, null, false));
-    assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, new int[]{4,4,4,3}, null, false));
-    assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 4, new int[]{6,6,6,4}, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2}, null, false));
+    assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3,3}, null, false));
+    assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4,4}, null, false));
+    assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6,6}, null, false));
    tokenList = valueResult.get("org.apache.lucene.analysis.en.PorterStemFilter");
    assertNotNull("Expecting the 'PorterStemFilter' to be applied on the index for the 'text' field", tokenList);
    assertEquals("Expecting 4 tokens", 4, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, new int[]{2,2,2,1,1}, null, false));
-    assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 2, new int[]{3,3,3,2,2}, null, true));
-    assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, new int[]{4,4,4,3,3}, null, false));
-    assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 4, new int[]{6,6,6,4,4}, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2,2}, null, false));
+    assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3,3,3}, null, true));
+    assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4,4,4}, null, false));
+    assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6,6,6}, null, false));
  }
}
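The position jumps in the updated assertions above (fox moving from position 1 to 2, dogs from 4 to 6) are the holes left by the removed stop words, which StopFilter now always reports. A small standalone sketch of that behaviour against the Version-taking StopFilter constructor changed by this commit; the Version constant, sample text, and stop set are assumptions for illustration:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class StopFilterHoleDemo {
  public static void main(String[] args) throws IOException {
    // "the" and "over" are stopped; the tokens that follow them carry the skipped
    // positions as larger increments (assumed Version constant for a 4.4-era build).
    TokenStream ts = new StopFilter(Version.LUCENE_44,
        new WhitespaceTokenizer(Version.LUCENE_44, new StringReader("the quick fox jumped over the lazy dogs")),
        StopFilter.makeStopSet(Version.LUCENE_44, "the", "over"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    int position = 0;
    while (ts.incrementToken()) {
      position += posIncr.getPositionIncrement();
      System.out.println(term + " @ position " + position); // e.g. quick @ 2, fox @ 3, ...
    }
    ts.end();
    ts.close();
  }
}
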
@ -178,25 +178,25 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
|
|||
tokenList = indexPart.get("org.apache.lucene.analysis.core.StopFilter");
|
||||
assertNotNull("Expcting StopFilter analysis breakdown", tokenList);
|
||||
assertEquals(tokenList.size(), 8);
|
||||
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 1, new int[]{2,2,2,1}, null, false));
|
||||
assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 2, new int[]{3,3,3,2}, null, false));
|
||||
assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 3, new int[]{4,4,4,3}, null, true));
|
||||
assertToken(tokenList.get(3), new TokenInfo("jumped", null, "<ALPHANUM>", 18, 24, 4, new int[]{5,5,5,4}, null, false));
|
||||
assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 5, new int[]{6,6,6,5}, null, false));
|
||||
assertToken(tokenList.get(5), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 6, new int[]{8,8,8,6}, null, false));
|
||||
assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 7, new int[]{9,9,9,7}, null, true));
|
||||
assertToken(tokenList.get(7), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 8, new int[]{10,10,10,8}, null, false));
|
||||
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2}, null, false));
|
||||
assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 3, new int[]{3,3,3,3}, null, false));
|
||||
assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 4, new int[]{4,4,4,4}, null, true));
|
||||
assertToken(tokenList.get(3), new TokenInfo("jumped", null, "<ALPHANUM>", 18, 24, 5, new int[]{5,5,5,5}, null, false));
|
||||
assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 6, new int[]{6,6,6,6}, null, false));
|
||||
assertToken(tokenList.get(5), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8,8}, null, false));
|
||||
assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9,9}, null, true));
|
||||
assertToken(tokenList.get(7), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10,10}, null, false));
|
||||
tokenList = indexPart.get("org.apache.lucene.analysis.en.PorterStemFilter");
|
||||
assertNotNull("Expcting PorterStemFilter analysis breakdown", tokenList);
|
||||
assertEquals(tokenList.size(), 8);
|
||||
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 1, new int[]{2,2,2,1,1}, null, false));
|
||||
assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 2, new int[]{3,3,3,2,2}, null, false));
|
||||
assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 3, new int[]{4,4,4,3,3}, null, true));
|
||||
assertToken(tokenList.get(3), new TokenInfo("jump", null, "<ALPHANUM>", 18, 24, 4, new int[]{5,5,5,4,4}, null, false));
|
||||
assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 5, new int[]{6,6,6,5,5}, null, false));
|
||||
assertToken(tokenList.get(5), new TokenInfo("lazi", null, "<ALPHANUM>", 34, 38, 6, new int[]{8,8,8,6,6}, null, false));
|
||||
assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 7, new int[]{9,9,9,7,7}, null, true));
|
||||
assertToken(tokenList.get(7), new TokenInfo("dog", null, "<ALPHANUM>", 45, 49, 8, new int[]{10,10,10,8,8}, null, false));
|
||||
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2,2}, null, false));
|
||||
assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 3, new int[]{3,3,3,3,3}, null, false));
|
||||
assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 4, new int[]{4,4,4,4,4}, null, true));
|
||||
assertToken(tokenList.get(3), new TokenInfo("jump", null, "<ALPHANUM>", 18, 24, 5, new int[]{5,5,5,5,5}, null, false));
|
||||
assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 6, new int[]{6,6,6,6,6}, null, false));
|
||||
assertToken(tokenList.get(5), new TokenInfo("lazi", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8,8,8}, null, false));
|
||||
assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9,9,9}, null, true));
|
||||
assertToken(tokenList.get(7), new TokenInfo("dog", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10,10,10}, null, false));
|
||||
|
||||
NamedList<List<NamedList>> queryPart = textType.get("query");
|
||||
assertNotNull("expecting a query token analysis for field type 'text'", queryPart);
|
||||
|
|
|
@ -201,12 +201,12 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
|
|||
public void testOptions() throws Exception {
|
||||
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
|
||||
, TermVectorParams.TF, "true", TermVectorParams.DF, "true", TermVectorParams.OFFSETS, "true", TermVectorParams.POSITIONS, "true", TermVectorParams.TF_IDF, "true")
|
||||
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}"
|
||||
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
|
||||
);
|
||||
|
||||
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
|
||||
, TermVectorParams.ALL, "true")
|
||||
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}"
|
||||
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
|
||||
);
|
||||
|
||||
// test each combination at random
|
||||
|
@ -214,7 +214,7 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
|
|||
list.addAll(Arrays.asList("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"));
|
||||
String[][] options = new String[][] { { TermVectorParams.TF, "'tf':1" },
|
||||
{ TermVectorParams.OFFSETS, "'offsets':{'start':20, 'end':27}" },
|
||||
{ TermVectorParams.POSITIONS, "'positions':{'position':1}" },
|
||||
{ TermVectorParams.POSITIONS, "'positions':{'position':5}" },
|
||||
{ TermVectorParams.DF, "'df':2" },
|
||||
{ TermVectorParams.TF_IDF, "'tf-idf':0.5" } };
|
||||
StringBuilder expected = new StringBuilder("/termVectors/0/test_posofftv/anoth=={");
|
||||
|
@ -249,7 +249,7 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
|
|||
,"f.test_basictv." + TermVectorParams.TF_IDF, "false"
|
||||
)
|
||||
,"/termVectors/0/test_basictv=={'anoth':{},'titl':{}}"
|
||||
,"/termVectors/0/test_postv/anoth=={'tf':1, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}"
|
||||
,"/termVectors/0/test_postv/anoth=={'tf':1, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
|
||||
,"/termVectors/0/test_offtv/anoth=={'tf':1, 'df':2, 'tf-idf':0.5}"
|
||||
,"/termVectors/warnings=={ 'noTermVectors':['test_notv'], 'noPositions':['test_basictv', 'test_offtv'], 'noOffsets':['test_basictv', 'test_postv']}"
|
||||
);
|
||||
|
|
|
@@ -53,7 +53,7 @@ public class TestSuggestSpellingConverter extends BaseTokenStreamTestCase {
        TokenStream filter = new PatternReplaceFilter(tokenizer,
            Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true);
        filter = new LowerCaseFilter(TEST_VERSION_CURRENT, filter);
-        filter = new TrimFilter(filter, false);
+        filter = new TrimFilter(TEST_VERSION_CURRENT, filter, false);
        return new TokenStreamComponents(tokenizer, filter);
      }
    });
@@ -202,13 +202,10 @@
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <!-- Case insensitive stop word removal.
-          add enablePositionIncrements=true in both the index and query
-          analyzers to leave a 'gap' for more accurate phrase queries.
        -->
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
-                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
@ -222,7 +219,6 @@
|
|||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
words="stopwords.txt"
|
||||
enablePositionIncrements="true"
|
||||
/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
|
|
|
@ -440,7 +440,7 @@
|
|||
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||
<!-- in this example, we will only use synonyms at query time
|
||||
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
-->
|
||||
|
@ -448,7 +448,7 @@
|
|||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
|
@ -466,13 +466,10 @@
|
|||
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
-->
|
||||
<!-- Case insensitive stop word removal.
|
||||
add enablePositionIncrements=true in both the index and query
|
||||
analyzers to leave a 'gap' for more accurate phrase queries.
|
||||
-->
|
||||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
words="lang/stopwords_en.txt"
|
||||
enablePositionIncrements="true"
|
||||
/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPossessiveFilterFactory"/>
|
||||
|
@ -488,7 +485,6 @@
|
|||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
words="lang/stopwords_en.txt"
|
||||
enablePositionIncrements="true"
|
||||
/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPossessiveFilterFactory"/>
|
||||
|
@ -516,13 +512,10 @@
|
|||
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
-->
|
||||
<!-- Case insensitive stop word removal.
|
||||
add enablePositionIncrements=true in both the index and query
|
||||
analyzers to leave a 'gap' for more accurate phrase queries.
|
||||
-->
|
||||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
words="lang/stopwords_en.txt"
|
||||
enablePositionIncrements="true"
|
||||
/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
|
@ -535,7 +528,6 @@
|
|||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
words="lang/stopwords_en.txt"
|
||||
enablePositionIncrements="true"
|
||||
/>
|
||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
|
@ -566,7 +558,7 @@
|
|||
<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
|
||||
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
|
||||
|
@ -574,7 +566,7 @@
|
|||
<analyzer type="query">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
@ -730,7 +722,7 @@
|
|||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<!-- for any non-arabic -->
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" />
|
||||
<!-- normalizes ﻯ to ﻱ, etc -->
|
||||
<filter class="solr.ArabicNormalizationFilterFactory"/>
|
||||
<filter class="solr.ArabicStemFilterFactory"/>
|
||||
|
@ -742,7 +734,7 @@
|
|||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" />
|
||||
<filter class="solr.BulgarianStemFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
@ -754,7 +746,7 @@
|
|||
<!-- removes l', etc -->
|
||||
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ca.txt"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" />
|
||||
<filter class="solr.SnowballPorterFilterFactory" language="Catalan"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
@ -776,7 +768,7 @@
|
|||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" />
|
||||
<filter class="solr.CzechStemFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
@ -786,7 +778,7 @@
|
|||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" />
|
||||
<filter class="solr.SnowballPorterFilterFactory" language="Danish"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
@ -796,7 +788,7 @@
|
|||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" />
|
||||
<filter class="solr.GermanNormalizationFilterFactory"/>
|
||||
<filter class="solr.GermanLightStemFilterFactory"/>
|
||||
<!-- less aggressive: <filter class="solr.GermanMinimalStemFilterFactory"/> -->
|
||||
|
@ -810,7 +802,7 @@
|
|||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<!-- greek specific lowercase for sigma -->
|
||||
<filter class="solr.GreekLowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" />
|
||||
<filter class="solr.GreekStemFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
@ -820,7 +812,7 @@
|
|||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" />
|
||||
<filter class="solr.SpanishLightStemFilterFactory"/>
|
||||
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Spanish"/> -->
|
||||
</analyzer>
|
||||
|
@ -831,7 +823,7 @@
|
|||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" />
|
||||
<filter class="solr.SnowballPorterFilterFactory" language="Basque"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
@ -845,7 +837,7 @@
|
|||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.ArabicNormalizationFilterFactory"/>
|
||||
<filter class="solr.PersianNormalizationFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
@ -854,7 +846,7 @@
|
|||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" />
|
||||
<filter class="solr.SnowballPorterFilterFactory" language="Finnish"/>
|
||||
<!-- less aggressive: <filter class="solr.FinnishLightStemFilterFactory"/> -->
|
||||
</analyzer>
|
||||
|
@ -867,7 +859,7 @@
|
|||
<!-- removes l', etc -->
|
||||
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" />
|
||||
<filter class="solr.FrenchLightStemFilterFactory"/>
|
||||
<!-- less aggressive: <filter class="solr.FrenchMinimalStemFilterFactory"/> -->
|
||||
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="French"/> -->
|
||||
|
@ -881,9 +873,9 @@
|
|||
<!-- removes d', etc -->
|
||||
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/>
|
||||
<!-- removes n-, etc. position increments is intentionally false! -->
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" enablePositionIncrements="false"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt"/>
|
||||
<filter class="solr.IrishLowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt"/>
|
||||
<filter class="solr.SnowballPorterFilterFactory" language="Irish"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
@ -893,7 +885,7 @@
|
|||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" />
|
||||
<filter class="solr.GalicianStemFilterFactory"/>
|
||||
<!-- less aggressive: <filter class="solr.GalicianMinimalStemFilterFactory"/> -->
|
||||
</analyzer>
|
||||
|
@ -908,7 +900,7 @@
|
|||
<filter class="solr.IndicNormalizationFilterFactory"/>
|
||||
<!-- normalizes variation in spelling -->
|
||||
<filter class="solr.HindiNormalizationFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" />
|
||||
<filter class="solr.HindiStemFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
@ -918,7 +910,7 @@
|
|||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" />
|
||||
<filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/>
|
||||
<!-- less aggressive: <filter class="solr.HungarianLightStemFilterFactory"/> -->
|
||||
</analyzer>
|
||||
|
@ -929,7 +921,7 @@
|
|||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" />
|
||||
<filter class="solr.SnowballPorterFilterFactory" language="Armenian"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
@ -939,7 +931,7 @@
|
|||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" enablePositionIncrements="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" />
|
||||
<!-- for a less aggressive approach (only inflectional suffixes), set stemDerivational to false -->
|
||||
<filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
|
||||
</analyzer>
|
||||
|
@@ -952,7 +944,7 @@
         <!-- removes l', etc -->
         <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_it.txt"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" enablePositionIncrements="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" />
         <filter class="solr.ItalianLightStemFilterFactory"/>
         <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> -->
       </analyzer>
@@ -999,11 +991,11 @@
         <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
         <filter class="solr.JapaneseBaseFormFilterFactory"/>
         <!-- Removes tokens with certain part-of-speech tags -->
-        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
+        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" />
         <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
         <filter class="solr.CJKWidthFilterFactory"/>
         <!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" />
         <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
         <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
         <!-- Lower-cases romaji characters -->
@@ -1016,7 +1008,7 @@
       <analyzer>
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" enablePositionIncrements="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" />
         <filter class="solr.LatvianStemFilterFactory"/>
       </analyzer>
     </fieldType>
@@ -1026,7 +1018,7 @@
       <analyzer>
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" enablePositionIncrements="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" />
         <filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/>
         <filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>
       </analyzer>
@@ -1037,7 +1029,7 @@
       <analyzer>
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" enablePositionIncrements="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" />
         <filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
         <!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> -->
         <!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> -->
@@ -1049,7 +1041,7 @@
       <analyzer>
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" enablePositionIncrements="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" />
         <filter class="solr.PortugueseLightStemFilterFactory"/>
         <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
         <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
@@ -1062,7 +1054,7 @@
       <analyzer>
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" enablePositionIncrements="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" />
         <filter class="solr.SnowballPorterFilterFactory" language="Romanian"/>
       </analyzer>
     </fieldType>
@@ -1072,7 +1064,7 @@
       <analyzer>
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" enablePositionIncrements="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" />
         <filter class="solr.SnowballPorterFilterFactory" language="Russian"/>
         <!-- less aggressive: <filter class="solr.RussianLightStemFilterFactory"/> -->
       </analyzer>
@@ -1083,7 +1075,7 @@
       <analyzer>
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" enablePositionIncrements="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" />
         <filter class="solr.SnowballPorterFilterFactory" language="Swedish"/>
         <!-- less aggressive: <filter class="solr.SwedishLightStemFilterFactory"/> -->
       </analyzer>
@@ -1095,7 +1087,7 @@
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.ThaiWordFilterFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" enablePositionIncrements="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" />
       </analyzer>
     </fieldType>

@@ -1104,7 +1096,7 @@
       <analyzer>
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.TurkishLowerCaseFilterFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" enablePositionIncrements="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" />
         <filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>
       </analyzer>
     </fieldType>