mirror of https://github.com/apache/lucene.git
LUCENE-4963: Deprecate broken TokenFilter options.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1479148 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ec7317a8d6
commit
8a7f2b6cc4
|
@ -59,6 +59,16 @@ Changes in backwards compatibility policy
|
||||||
completely refactored to allow for a better implementation of TimSort.
|
completely refactored to allow for a better implementation of TimSort.
|
||||||
(Adrien Grand, Uwe Schindler, Dawid Weiss)
|
(Adrien Grand, Uwe Schindler, Dawid Weiss)
|
||||||
|
|
||||||
|
* LUCENE-4963: Some TokenFilter options that generate broken TokenStreams have
|
||||||
|
been deprecated: updateOffsets=true on TrimFilter and
|
||||||
|
enablePositionIncrements=false on all classes that inherit from
|
||||||
|
FilteringTokenFilter: JapanesePartOfSpeechStopFilter, KeepWordFilter,
|
||||||
|
LengthFilter, StopFilter and TypeTokenFilter. (Adrien Grand)
|
||||||
|
|
||||||
|
* LUCENE-4963: In order not to take position increments into account in
|
||||||
|
suggesters, you now need to call setPreservePositionIncrements(false) instead
|
||||||
|
of configuring the token filters to not increment positions. (Adrien Grand)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
|
|
||||||
* LUCENE-4935: CustomScoreQuery wrongly applied its query boost twice
|
* LUCENE-4935: CustomScoreQuery wrongly applied its query boost twice
|
||||||
|
|
|
@ -57,7 +57,7 @@ public final class StopFilter extends FilteringTokenFilter {
|
||||||
* @see #makeStopSet(Version, java.lang.String...)
|
* @see #makeStopSet(Version, java.lang.String...)
|
||||||
*/
|
*/
|
||||||
public StopFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) {
|
public StopFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) {
|
||||||
super(true, in);
|
super(matchVersion, in);
|
||||||
this.stopWords = stopWords;
|
this.stopWords = stopWords;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -51,7 +51,7 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
|
||||||
stopWordFiles = get(args, "words");
|
stopWordFiles = get(args, "words");
|
||||||
format = get(args, "format");
|
format = get(args, "format");
|
||||||
ignoreCase = getBoolean(args, "ignoreCase", false);
|
ignoreCase = getBoolean(args, "ignoreCase", false);
|
||||||
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
|
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,12 +17,12 @@ package org.apache.lucene.analysis.core;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes tokens whose types appear in a set of blocked types from a token stream.
|
* Removes tokens whose types appear in a set of blocked types from a token stream.
|
||||||
|
@ -33,14 +33,41 @@ public final class TypeTokenFilter extends FilteringTokenFilter {
|
||||||
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
|
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
|
||||||
private final boolean useWhiteList;
|
private final boolean useWhiteList;
|
||||||
|
|
||||||
public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
|
/** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
|
||||||
super(enablePositionIncrements, input);
|
@Deprecated
|
||||||
|
public TypeTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
|
||||||
|
super(version, enablePositionIncrements, input);
|
||||||
this.stopTypes = stopTypes;
|
this.stopTypes = stopTypes;
|
||||||
this.useWhiteList = useWhiteList;
|
this.useWhiteList = useWhiteList;
|
||||||
}
|
}
|
||||||
|
|
||||||
public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) {
|
/** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
|
||||||
this(enablePositionIncrements, input, stopTypes, false);
|
@Deprecated
|
||||||
|
public TypeTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) {
|
||||||
|
this(version, enablePositionIncrements, input, stopTypes, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new {@link TypeTokenFilter}.
|
||||||
|
* @param version the Lucene match version
|
||||||
|
* @param input the {@link TokenStream} to consume
|
||||||
|
* @param stopTypes the types to filter
|
||||||
|
* @param useWhiteList if true, then tokens whose type is in stopTypes will
|
||||||
|
* be kept, otherwise they will be filtered out
|
||||||
|
*/
|
||||||
|
public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
|
||||||
|
super(version, input);
|
||||||
|
this.stopTypes = stopTypes;
|
||||||
|
this.useWhiteList = useWhiteList;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new {@link TypeTokenFilter} that filters tokens out
|
||||||
|
* (useWhiteList=false).
|
||||||
|
* @see #TypeTokenFilter(Version, TokenStream, Set, boolean)
|
||||||
|
*/
|
||||||
|
public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes) {
|
||||||
|
this(version, input, stopTypes, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -35,7 +35,7 @@ import java.util.Set;
|
||||||
* <analyzer>
|
* <analyzer>
|
||||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
* <filter class="solr.TypeTokenFilterFactory" types="stoptypes.txt"
|
* <filter class="solr.TypeTokenFilterFactory" types="stoptypes.txt"
|
||||||
* enablePositionIncrements="true" useWhitelist="false"/>
|
* useWhitelist="false"/>
|
||||||
* </analyzer>
|
* </analyzer>
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*/
|
*/
|
||||||
|
@ -49,7 +49,7 @@ public class TypeTokenFilterFactory extends TokenFilterFactory implements Resour
|
||||||
public TypeTokenFilterFactory(Map<String,String> args) {
|
public TypeTokenFilterFactory(Map<String,String> args) {
|
||||||
super(args);
|
super(args);
|
||||||
stopTypesFiles = require(args, "types");
|
stopTypesFiles = require(args, "types");
|
||||||
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
|
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
|
||||||
useWhitelist = getBoolean(args, "useWhitelist", false);
|
useWhitelist = getBoolean(args, "useWhitelist", false);
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
|
@ -78,6 +78,8 @@ public class TypeTokenFilterFactory extends TokenFilterFactory implements Resour
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream input) {
|
public TokenStream create(TokenStream input) {
|
||||||
return new TypeTokenFilter(enablePositionIncrements, input, stopTypes, useWhitelist);
|
@SuppressWarnings("deprecation")
|
||||||
|
final TokenStream filter = new TypeTokenFilter(luceneMatchVersion, enablePositionIncrements, input, stopTypes, useWhitelist);
|
||||||
|
return filter;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -138,7 +138,9 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
|
||||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||||
TokenStream result = new StandardFilter(matchVersion, source);
|
TokenStream result = new StandardFilter(matchVersion, source);
|
||||||
StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
|
StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
|
||||||
s.setEnablePositionIncrements(false);
|
if (!matchVersion.onOrAfter(Version.LUCENE_44)) {
|
||||||
|
s.setEnablePositionIncrements(false);
|
||||||
|
}
|
||||||
result = s;
|
result = s;
|
||||||
result = new ElisionFilter(result, DEFAULT_ARTICLES);
|
result = new ElisionFilter(result, DEFAULT_ARTICLES);
|
||||||
result = new IrishLowerCaseFilter(result);
|
result = new IrishLowerCaseFilter(result);
|
||||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A TokenFilter that only keeps tokens with text contained in the
|
* A TokenFilter that only keeps tokens with text contained in the
|
||||||
|
@ -32,10 +33,23 @@ public final class KeepWordFilter extends FilteringTokenFilter {
|
||||||
private final CharArraySet words;
|
private final CharArraySet words;
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
|
||||||
/** The words set passed to this constructor will be directly used by this filter
|
/** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
|
||||||
* and should not be modified, */
|
@Deprecated
|
||||||
public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
|
public KeepWordFilter(Version version, boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
|
||||||
super(enablePositionIncrements, in);
|
super(version, enablePositionIncrements, in);
|
||||||
|
this.words = words;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new {@link KeepWordFilter}.
|
||||||
|
* <p><b>NOTE</b>: The words set passed to this constructor will be directly
|
||||||
|
* used by this filter and should not be modified.
|
||||||
|
* @param version the Lucene match version
|
||||||
|
* @param in the {@link TokenStream} to consume
|
||||||
|
* @param words the words to keep
|
||||||
|
*/
|
||||||
|
public KeepWordFilter(Version version, TokenStream in, CharArraySet words) {
|
||||||
|
super(version, in);
|
||||||
this.words = words;
|
this.words = words;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -32,7 +32,7 @@ import java.io.IOException;
|
||||||
* <fieldType name="text_keepword" class="solr.TextField" positionIncrementGap="100">
|
* <fieldType name="text_keepword" class="solr.TextField" positionIncrementGap="100">
|
||||||
* <analyzer>
|
* <analyzer>
|
||||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
* <filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false" enablePositionIncrements="false"/>
|
* <filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false"/>
|
||||||
* </analyzer>
|
* </analyzer>
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*/
|
*/
|
||||||
|
@ -48,7 +48,7 @@ public class KeepWordFilterFactory extends TokenFilterFactory implements Resourc
|
||||||
assureMatchVersion();
|
assureMatchVersion();
|
||||||
wordFiles = get(args, "words");
|
wordFiles = get(args, "words");
|
||||||
ignoreCase = getBoolean(args, "ignoreCase", false);
|
ignoreCase = getBoolean(args, "ignoreCase", false);
|
||||||
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
|
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
@ -76,6 +76,12 @@ public class KeepWordFilterFactory extends TokenFilterFactory implements Resourc
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream input) {
|
public TokenStream create(TokenStream input) {
|
||||||
// if the set is null, it means it was empty
|
// if the set is null, it means it was empty
|
||||||
return words == null ? input : new KeepWordFilter(enablePositionIncrements, input, words);
|
if (words == null) {
|
||||||
|
return input;
|
||||||
|
} else {
|
||||||
|
@SuppressWarnings("deprecation")
|
||||||
|
final TokenStream filter = new KeepWordFilter(luceneMatchVersion, enablePositionIncrements, input, words);
|
||||||
|
return filter;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.miscellaneous;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes words that are too long or too short from the stream.
|
* Removes words that are too long or too short from the stream.
|
||||||
|
@ -34,16 +35,29 @@ public final class LengthFilter extends FilteringTokenFilter {
|
||||||
|
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
|
||||||
/**
|
/** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
|
||||||
* Build a filter that removes words that are too long or too
|
@Deprecated
|
||||||
* short from the text.
|
public LengthFilter(Version version, boolean enablePositionIncrements, TokenStream in, int min, int max) {
|
||||||
*/
|
super(version, enablePositionIncrements, in);
|
||||||
public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
|
|
||||||
super(enablePositionIncrements, in);
|
|
||||||
this.min = min;
|
this.min = min;
|
||||||
this.max = max;
|
this.max = max;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new {@link LengthFilter}. This will filter out tokens whose
|
||||||
|
* {@link CharTermAttribute} is either too short ({@link CharTermAttribute#length()}
|
||||||
|
* < min) or too long ({@link CharTermAttribute#length()} > max).
|
||||||
|
* @param version the Lucene match version
|
||||||
|
* @param in the {@link TokenStream} to consume
|
||||||
|
* @param min the minimum length
|
||||||
|
* @param max the maximum length
|
||||||
|
*/
|
||||||
|
public LengthFilter(Version version, TokenStream in, int min, int max) {
|
||||||
|
super(version, in);
|
||||||
|
this.min = min;
|
||||||
|
this.max = max;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean accept() {
|
public boolean accept() {
|
||||||
final int len = termAtt.length();
|
final int len = termAtt.length();
|
||||||
|
|
|
@ -17,18 +17,18 @@ package org.apache.lucene.analysis.miscellaneous;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Factory for {@link LengthFilter}.
|
* Factory for {@link LengthFilter}.
|
||||||
* <pre class="prettyprint">
|
* <pre class="prettyprint">
|
||||||
* <fieldType name="text_lngth" class="solr.TextField" positionIncrementGap="100">
|
* <fieldType name="text_lngth" class="solr.TextField" positionIncrementGap="100">
|
||||||
* <analyzer>
|
* <analyzer>
|
||||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
* <filter class="solr.LengthFilterFactory" min="0" max="1" enablePositionIncrements="false"/>
|
* <filter class="solr.LengthFilterFactory" min="0" max="1" />
|
||||||
* </analyzer>
|
* </analyzer>
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*/
|
*/
|
||||||
|
@ -44,7 +44,7 @@ public class LengthFilterFactory extends TokenFilterFactory {
|
||||||
super(args);
|
super(args);
|
||||||
min = requireInt(args, MIN_KEY);
|
min = requireInt(args, MIN_KEY);
|
||||||
max = requireInt(args, MAX_KEY);
|
max = requireInt(args, MAX_KEY);
|
||||||
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
|
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
@ -52,6 +52,8 @@ public class LengthFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public LengthFilter create(TokenStream input) {
|
public LengthFilter create(TokenStream input) {
|
||||||
return new LengthFilter(enablePositionIncrements, input,min,max);
|
@SuppressWarnings("deprecation")
|
||||||
|
final LengthFilter filter = new LengthFilter(luceneMatchVersion, enablePositionIncrements, input,min,max);
|
||||||
|
return filter;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,11 +21,14 @@ import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Trims leading and trailing whitespace from Tokens in the stream.
|
* Trims leading and trailing whitespace from Tokens in the stream.
|
||||||
|
* <p>As of Lucene 4.4, this filter does not support updateOffsets=true anymore
|
||||||
|
* as it can lead to broken token streams.
|
||||||
*/
|
*/
|
||||||
public final class TrimFilter extends TokenFilter {
|
public final class TrimFilter extends TokenFilter {
|
||||||
|
|
||||||
|
@ -33,12 +36,27 @@ public final class TrimFilter extends TokenFilter {
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
|
/**
|
||||||
public TrimFilter(TokenStream in, boolean updateOffsets) {
|
* Create a new {@link TrimFilter}.
|
||||||
|
* @param version the Lucene match version
|
||||||
|
* @param in the stream to consume
|
||||||
|
* @param updateOffsets whether to update offsets
|
||||||
|
* @deprecated Offset updates are not supported anymore as of Lucene 4.4.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public TrimFilter(Version version, TokenStream in, boolean updateOffsets) {
|
||||||
super(in);
|
super(in);
|
||||||
|
if (updateOffsets && version.onOrAfter(Version.LUCENE_44)) {
|
||||||
|
throw new IllegalArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
|
||||||
|
}
|
||||||
this.updateOffsets = updateOffsets;
|
this.updateOffsets = updateOffsets;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Create a new {@link TrimFilter} on top of <code>in</code>. */
|
||||||
|
public TrimFilter(Version version, TokenStream in) {
|
||||||
|
this(version, in, false);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean incrementToken() throws IOException {
|
public boolean incrementToken() throws IOException {
|
||||||
if (!input.incrementToken()) return false;
|
if (!input.incrementToken()) return false;
|
||||||
|
@ -55,11 +73,10 @@ public final class TrimFilter extends TokenFilter {
|
||||||
int endOff = 0;
|
int endOff = 0;
|
||||||
|
|
||||||
// eat the first characters
|
// eat the first characters
|
||||||
//QUESTION: Should we use Character.isWhitespace() instead?
|
for (start = 0; start < len && Character.isWhitespace(termBuffer[start]); start++) {
|
||||||
for (start = 0; start < len && termBuffer[start] <= ' '; start++) {
|
|
||||||
}
|
}
|
||||||
// eat the end characters
|
// eat the end characters
|
||||||
for (end = len; end >= start && termBuffer[end - 1] <= ' '; end--) {
|
for (end = len; end >= start && Character.isWhitespace(termBuffer[end - 1]); end--) {
|
||||||
endOff++;
|
endOff++;
|
||||||
}
|
}
|
||||||
if (start > 0 || end < len) {
|
if (start > 0 || end < len) {
|
||||||
|
|
|
@ -29,7 +29,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
* <fieldType name="text_trm" class="solr.TextField" positionIncrementGap="100">
|
* <fieldType name="text_trm" class="solr.TextField" positionIncrementGap="100">
|
||||||
* <analyzer>
|
* <analyzer>
|
||||||
* <tokenizer class="solr.NGramTokenizerFactory"/>
|
* <tokenizer class="solr.NGramTokenizerFactory"/>
|
||||||
* <filter class="solr.TrimFilterFactory" updateOffsets="false"/>
|
* <filter class="solr.TrimFilterFactory" />
|
||||||
* </analyzer>
|
* </analyzer>
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*
|
*
|
||||||
|
@ -50,6 +50,8 @@ public class TrimFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TrimFilter create(TokenStream input) {
|
public TrimFilter create(TokenStream input) {
|
||||||
return new TrimFilter(input, updateOffsets);
|
@SuppressWarnings("deprecation")
|
||||||
|
final TrimFilter filter = new TrimFilter(luceneMatchVersion, input, updateOffsets);
|
||||||
|
return filter;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -73,7 +73,7 @@ public final class NGramTokenFilter extends TokenFilter {
|
||||||
* @param maxGram the largest n-gram to generate
|
* @param maxGram the largest n-gram to generate
|
||||||
*/
|
*/
|
||||||
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
|
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
|
||||||
super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE));
|
super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
|
||||||
this.version = version;
|
this.version = version;
|
||||||
if (minGram < 1) {
|
if (minGram < 1) {
|
||||||
throw new IllegalArgumentException("minGram must be greater than zero");
|
throw new IllegalArgumentException("minGram must be greater than zero");
|
||||||
|
|
|
@ -22,24 +22,54 @@ import java.io.IOException;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Abstract base class for TokenFilters that may remove tokens.
|
* Abstract base class for TokenFilters that may remove tokens.
|
||||||
* You have to implement {@link #accept} and return a boolean if the current
|
* You have to implement {@link #accept} and return a boolean if the current
|
||||||
* token should be preserved. {@link #incrementToken} uses this method
|
* token should be preserved. {@link #incrementToken} uses this method
|
||||||
* to decide if a token should be passed to the caller.
|
* to decide if a token should be passed to the caller.
|
||||||
|
* <p><a name="version" />As of Lucene 4.4, an {@link IllegalArgumentException}
|
||||||
|
* is thrown when trying to disable position increments when filtering terms.
|
||||||
*/
|
*/
|
||||||
public abstract class FilteringTokenFilter extends TokenFilter {
|
public abstract class FilteringTokenFilter extends TokenFilter {
|
||||||
|
|
||||||
|
private static void checkPositionIncrement(Version version, boolean enablePositionIncrements) {
|
||||||
|
if (!enablePositionIncrements && version.onOrAfter(Version.LUCENE_44)) {
|
||||||
|
throw new IllegalArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected final Version version;
|
||||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
|
private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
|
||||||
private boolean first = true; // only used when not preserving gaps
|
private boolean first = true;
|
||||||
|
|
||||||
public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
|
/**
|
||||||
super(input);
|
* Create a new {@link FilteringTokenFilter}.
|
||||||
|
* @param version the Lucene match <a href="#version">version</a>
|
||||||
|
* @param enablePositionIncrements whether to increment position increments when filtering out terms
|
||||||
|
* @param input the input to consume
|
||||||
|
* @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public FilteringTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input){
|
||||||
|
this(version, input);
|
||||||
|
checkPositionIncrement(version, enablePositionIncrements);
|
||||||
this.enablePositionIncrements = enablePositionIncrements;
|
this.enablePositionIncrements = enablePositionIncrements;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new {@link FilteringTokenFilter}.
|
||||||
|
* @param version the Lucene match version
|
||||||
|
* @param in the {@link TokenStream} to consume
|
||||||
|
*/
|
||||||
|
public FilteringTokenFilter(Version version, TokenStream in) {
|
||||||
|
super(in);
|
||||||
|
this.version = version;
|
||||||
|
this.enablePositionIncrements = true;
|
||||||
|
}
|
||||||
|
|
||||||
/** Override this method and return if the current input token should be returned by {@link #incrementToken}. */
|
/** Override this method and return if the current input token should be returned by {@link #incrementToken}. */
|
||||||
protected abstract boolean accept() throws IOException;
|
protected abstract boolean accept() throws IOException;
|
||||||
|
|
||||||
|
@ -102,8 +132,11 @@ public abstract class FilteringTokenFilter extends TokenFilter {
|
||||||
* <p> <b>NOTE</b>: be sure to also
|
* <p> <b>NOTE</b>: be sure to also
|
||||||
* set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if
|
* set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if
|
||||||
* you use QueryParser to create queries.
|
* you use QueryParser to create queries.
|
||||||
|
* @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public void setEnablePositionIncrements(boolean enable) {
|
public void setEnablePositionIncrements(boolean enable) {
|
||||||
|
checkPositionIncrement(version, enable);
|
||||||
this.enablePositionIncrements = enable;
|
this.enablePositionIncrements = enable;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -161,8 +161,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
||||||
// startOffset thats > its endOffset
|
// startOffset thats > its endOffset
|
||||||
// (see LUCENE-3738 for a list of other offenders here)
|
// (see LUCENE-3738 for a list of other offenders here)
|
||||||
// broken!
|
// broken!
|
||||||
Lucene43NGramTokenizer.class,
|
|
||||||
// broken!
|
|
||||||
EdgeNGramTokenizer.class,
|
EdgeNGramTokenizer.class,
|
||||||
// broken!
|
// broken!
|
||||||
EdgeNGramTokenFilter.class,
|
EdgeNGramTokenFilter.class,
|
||||||
|
@ -182,55 +180,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
||||||
private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<Constructor<?>, Predicate<Object[]>>();
|
private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<Constructor<?>, Predicate<Object[]>>();
|
||||||
static {
|
static {
|
||||||
try {
|
try {
|
||||||
brokenOffsetsConstructors.put(
|
|
||||||
TrimFilter.class.getConstructor(TokenStream.class, boolean.class),
|
|
||||||
new Predicate<Object[]>() {
|
|
||||||
@Override
|
|
||||||
public boolean apply(Object[] args) {
|
|
||||||
assert args.length == 2;
|
|
||||||
return (Boolean) args[1]; // args are broken if updateOffsets is true
|
|
||||||
}
|
|
||||||
});
|
|
||||||
brokenOffsetsConstructors.put(
|
|
||||||
TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class, boolean.class),
|
|
||||||
new Predicate<Object[]>() {
|
|
||||||
@Override
|
|
||||||
public boolean apply(Object[] args) {
|
|
||||||
assert args.length == 4;
|
|
||||||
// LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
|
|
||||||
return !(Boolean) args[0];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
brokenOffsetsConstructors.put(
|
|
||||||
TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class),
|
|
||||||
new Predicate<Object[]>() {
|
|
||||||
@Override
|
|
||||||
public boolean apply(Object[] args) {
|
|
||||||
assert args.length == 3;
|
|
||||||
// LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
|
|
||||||
return !(Boolean) args[0];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
brokenOffsetsConstructors.put(
|
|
||||||
LengthFilter.class.getConstructor(boolean.class, TokenStream.class, int.class, int.class),
|
|
||||||
new Predicate<Object[]>() {
|
|
||||||
@Override
|
|
||||||
public boolean apply(Object[] args) {
|
|
||||||
assert args.length == 4;
|
|
||||||
// LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
|
|
||||||
return !(Boolean) args[0];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
brokenOffsetsConstructors.put(
|
|
||||||
KeepWordFilter.class.getConstructor(boolean.class, TokenStream.class, CharArraySet.class),
|
|
||||||
new Predicate<Object[]>() {
|
|
||||||
@Override
|
|
||||||
public boolean apply(Object[] args) {
|
|
||||||
assert args.length == 3;
|
|
||||||
// LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
|
|
||||||
return !(Boolean) args[0];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
for (Class<?> c : Arrays.<Class<?>>asList(
|
for (Class<?> c : Arrays.<Class<?>>asList(
|
||||||
ReversePathHierarchyTokenizer.class,
|
ReversePathHierarchyTokenizer.class,
|
||||||
PathHierarchyTokenizer.class,
|
PathHierarchyTokenizer.class,
|
||||||
|
|
|
@ -75,7 +75,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
|
||||||
doTestStopPositons(stpf,true);
|
doTestStopPositons(stpf,true);
|
||||||
// without increments
|
// without increments
|
||||||
reader = new StringReader(sb.toString());
|
reader = new StringReader(sb.toString());
|
||||||
stpf = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
|
stpf = new StopFilter(Version.LUCENE_43, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
|
||||||
doTestStopPositons(stpf,false);
|
doTestStopPositons(stpf,false);
|
||||||
// with increments, concatenating two stop filters
|
// with increments, concatenating two stop filters
|
||||||
ArrayList<String> a0 = new ArrayList<String>();
|
ArrayList<String> a0 = new ArrayList<String>();
|
||||||
|
@ -166,7 +166,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
TokenFilter filter = new MockSynonymFilter(tokenizer);
|
TokenFilter filter = new MockSynonymFilter(tokenizer);
|
||||||
StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
StopFilter stopfilter = new StopFilter(Version.LUCENE_43, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||||
stopfilter.setEnablePositionIncrements(false);
|
stopfilter.setEnablePositionIncrements(false);
|
||||||
return new TokenStreamComponents(tokenizer, stopfilter);
|
return new TokenStreamComponents(tokenizer, stopfilter);
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
import org.apache.lucene.util.English;
|
import org.apache.lucene.util.English;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
@ -36,7 +37,7 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
|
||||||
public void testTypeFilter() throws IOException {
|
public void testTypeFilter() throws IOException {
|
||||||
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
|
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
|
||||||
Set<String> stopTypes = asSet("<NUM>");
|
Set<String> stopTypes = asSet("<NUM>");
|
||||||
TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes);
|
TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes);
|
||||||
assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"});
|
assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,12 +60,12 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
// with increments
|
// with increments
|
||||||
StringReader reader = new StringReader(sb.toString());
|
StringReader reader = new StringReader(sb.toString());
|
||||||
TypeTokenFilter typeTokenFilter = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
|
TypeTokenFilter typeTokenFilter = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
|
||||||
testPositons(typeTokenFilter);
|
testPositons(typeTokenFilter);
|
||||||
|
|
||||||
// without increments
|
// without increments
|
||||||
reader = new StringReader(sb.toString());
|
reader = new StringReader(sb.toString());
|
||||||
typeTokenFilter = new TypeTokenFilter(false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
|
typeTokenFilter = new TypeTokenFilter(Version.LUCENE_43, false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
|
||||||
testPositons(typeTokenFilter);
|
testPositons(typeTokenFilter);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -87,7 +88,7 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
|
||||||
public void testTypeFilterWhitelist() throws IOException {
|
public void testTypeFilterWhitelist() throws IOException {
|
||||||
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
|
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
|
||||||
Set<String> stopTypes = Collections.singleton("<NUM>");
|
Set<String> stopTypes = Collections.singleton("<NUM>");
|
||||||
TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes, true);
|
TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes, true);
|
||||||
assertTokenStreamContents(stream, new String[]{"121", "123"});
|
assertTokenStreamContents(stream, new String[]{"121", "123"});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -50,7 +50,7 @@ public class TestTypeTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||||
public void testCreationWithBlackList() throws Exception {
|
public void testCreationWithBlackList() throws Exception {
|
||||||
TokenFilterFactory factory = tokenFilterFactory("Type",
|
TokenFilterFactory factory = tokenFilterFactory("Type",
|
||||||
"types", "stoptypes-1.txt, stoptypes-2.txt",
|
"types", "stoptypes-1.txt, stoptypes-2.txt",
|
||||||
"enablePositionIncrements", "false");
|
"enablePositionIncrements", "true");
|
||||||
NumericTokenStream input = new NumericTokenStream();
|
NumericTokenStream input = new NumericTokenStream();
|
||||||
input.setIntValue(123);
|
input.setIntValue(123);
|
||||||
factory.create(input);
|
factory.create(input);
|
||||||
|
@ -59,7 +59,7 @@ public class TestTypeTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||||
public void testCreationWithWhiteList() throws Exception {
|
public void testCreationWithWhiteList() throws Exception {
|
||||||
TokenFilterFactory factory = tokenFilterFactory("Type",
|
TokenFilterFactory factory = tokenFilterFactory("Type",
|
||||||
"types", "stoptypes-1.txt, stoptypes-2.txt",
|
"types", "stoptypes-1.txt, stoptypes-2.txt",
|
||||||
"enablePositionIncrements", "false",
|
"enablePositionIncrements", "true",
|
||||||
"useWhitelist", "true");
|
"useWhitelist", "true");
|
||||||
NumericTokenStream input = new NumericTokenStream();
|
NumericTokenStream input = new NumericTokenStream();
|
||||||
input.setIntValue(123);
|
input.setIntValue(123);
|
||||||
|
|
|
@ -61,7 +61,7 @@ public class TestIrishAnalyzer extends BaseTokenStreamTestCase {
|
||||||
Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT);
|
Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT);
|
||||||
assertAnalyzesTo(a, "n-athair",
|
assertAnalyzesTo(a, "n-athair",
|
||||||
new String[] { "athair" },
|
new String[] { "athair" },
|
||||||
new int[] { 1 });
|
new int[] { 2 });
|
||||||
}
|
}
|
||||||
|
|
||||||
/** blast some random strings through the analyzer */
|
/** blast some random strings through the analyzer */
|
||||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.lucene.analysis.MockTokenizer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/** Test {@link KeepWordFilter} */
|
/** Test {@link KeepWordFilter} */
|
||||||
public class TestKeepWordFilter extends BaseTokenStreamTestCase {
|
public class TestKeepWordFilter extends BaseTokenStreamTestCase {
|
||||||
|
@ -42,22 +43,22 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
// Test Stopwords
|
// Test Stopwords
|
||||||
TokenStream stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
TokenStream stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
||||||
stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
|
stream = new KeepWordFilter(TEST_VERSION_CURRENT, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
|
||||||
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
|
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
|
||||||
|
|
||||||
// Now force case
|
// Now force case
|
||||||
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
||||||
stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
|
stream = new KeepWordFilter(TEST_VERSION_CURRENT, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
|
||||||
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
|
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
|
||||||
|
|
||||||
// Test Stopwords
|
// Test Stopwords
|
||||||
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
||||||
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
|
stream = new KeepWordFilter(Version.LUCENE_43, false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
|
||||||
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
|
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
|
||||||
|
|
||||||
// Now force case
|
// Now force case
|
||||||
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
||||||
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
|
stream = new KeepWordFilter(Version.LUCENE_43, false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
|
||||||
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
|
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,7 +73,7 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
TokenStream stream = new KeepWordFilter(true, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true));
|
TokenStream stream = new KeepWordFilter(TEST_VERSION_CURRENT, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true));
|
||||||
return new TokenStreamComponents(tokenizer, stream);
|
return new TokenStreamComponents(tokenizer, stream);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.*;
|
import org.apache.lucene.analysis.*;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
@ -29,7 +30,7 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
|
||||||
public void testFilterNoPosIncr() throws Exception {
|
public void testFilterNoPosIncr() throws Exception {
|
||||||
TokenStream stream = new MockTokenizer(
|
TokenStream stream = new MockTokenizer(
|
||||||
new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
|
new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
|
||||||
LengthFilter filter = new LengthFilter(false, stream, 2, 6);
|
LengthFilter filter = new LengthFilter(Version.LUCENE_43, false, stream, 2, 6);
|
||||||
assertTokenStreamContents(filter,
|
assertTokenStreamContents(filter,
|
||||||
new String[]{"short", "ab", "foo"},
|
new String[]{"short", "ab", "foo"},
|
||||||
new int[]{1, 1, 1}
|
new int[]{1, 1, 1}
|
||||||
|
@ -39,7 +40,7 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
|
||||||
public void testFilterWithPosIncr() throws Exception {
|
public void testFilterWithPosIncr() throws Exception {
|
||||||
TokenStream stream = new MockTokenizer(
|
TokenStream stream = new MockTokenizer(
|
||||||
new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
|
new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
|
||||||
LengthFilter filter = new LengthFilter(true, stream, 2, 6);
|
LengthFilter filter = new LengthFilter(TEST_VERSION_CURRENT, stream, 2, 6);
|
||||||
assertTokenStreamContents(filter,
|
assertTokenStreamContents(filter,
|
||||||
new String[]{"short", "ab", "foo"},
|
new String[]{"short", "ab", "foo"},
|
||||||
new int[]{1, 4, 2}
|
new int[]{1, 4, 2}
|
||||||
|
@ -51,7 +52,7 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
||||||
return new TokenStreamComponents(tokenizer, new LengthFilter(true, tokenizer, 0, 5));
|
return new TokenStreamComponents(tokenizer, new LengthFilter(TEST_VERSION_CURRENT, tokenizer, 0, 5));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
checkOneTermReuse(a, "", "");
|
checkOneTermReuse(a, "", "");
|
||||||
|
|
|
@ -22,6 +22,8 @@ import java.io.StringReader;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||||
|
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
public class TestLengthFilterFactory extends BaseTokenStreamFactoryTestCase {
|
public class TestLengthFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||||
|
|
||||||
|
@ -29,8 +31,10 @@ public class TestLengthFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||||
Reader reader = new StringReader("foo foobar super-duper-trooper");
|
Reader reader = new StringReader("foo foobar super-duper-trooper");
|
||||||
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
stream = tokenFilterFactory("Length",
|
stream = tokenFilterFactory("Length",
|
||||||
|
Version.LUCENE_43, new ClasspathResourceLoader(getClass()),
|
||||||
"min", "4",
|
"min", "4",
|
||||||
"max", "10").create(stream);
|
"max", "10",
|
||||||
|
"enablePositionIncrements", "false").create(stream);
|
||||||
assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 });
|
assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -29,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.*;
|
import org.apache.lucene.analysis.tokenattributes.*;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*/
|
*/
|
||||||
|
@ -46,7 +47,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
|
||||||
new Token(ccc, 0, ccc.length, 11, 15),
|
new Token(ccc, 0, ccc.length, 11, 15),
|
||||||
new Token(whitespace, 0, whitespace.length, 16, 20),
|
new Token(whitespace, 0, whitespace.length, 16, 20),
|
||||||
new Token(empty, 0, empty.length, 21, 21));
|
new Token(empty, 0, empty.length, 21, 21));
|
||||||
ts = new TrimFilter(ts, false);
|
ts = new TrimFilter(TEST_VERSION_CURRENT, ts, false);
|
||||||
|
|
||||||
assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
|
assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
|
||||||
|
|
||||||
|
@ -59,7 +60,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
|
||||||
new Token(b, 0, b.length, 0, 2),
|
new Token(b, 0, b.length, 0, 2),
|
||||||
new Token(ccc, 0, ccc.length, 0, 3),
|
new Token(ccc, 0, ccc.length, 0, 3),
|
||||||
new Token(whitespace, 0, whitespace.length, 0, 3));
|
new Token(whitespace, 0, whitespace.length, 0, 3));
|
||||||
ts = new TrimFilter(ts, true);
|
ts = new TrimFilter(Version.LUCENE_43, ts, true);
|
||||||
|
|
||||||
assertTokenStreamContents(ts,
|
assertTokenStreamContents(ts,
|
||||||
new String[] { "a", "b", "c", "" },
|
new String[] { "a", "b", "c", "" },
|
||||||
|
@ -120,7 +121,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
|
||||||
return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, false));
|
return new TokenStreamComponents(tokenizer, new TrimFilter(Version.LUCENE_43, tokenizer, true));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
|
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
|
||||||
|
@ -130,7 +131,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
|
||||||
return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, true));
|
return new TokenStreamComponents(tokenizer, new TrimFilter(TEST_VERSION_CURRENT, tokenizer, false));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
|
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
|
||||||
|
@ -141,7 +142,9 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
||||||
return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, random().nextBoolean()));
|
final boolean updateOffsets = random().nextBoolean();
|
||||||
|
final Version version = updateOffsets ? Version.LUCENE_43 : TEST_VERSION_CURRENT;
|
||||||
|
return new TokenStreamComponents(tokenizer, new TrimFilter(version, tokenizer, updateOffsets));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
checkOneTermReuse(a, "", "");
|
checkOneTermReuse(a, "", "");
|
||||||
|
|
|
@ -306,7 +306,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
|
||||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
|
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
|
||||||
tokenizer, StandardAnalyzer.STOP_WORDS_SET);
|
tokenizer, StandardAnalyzer.STOP_WORDS_SET);
|
||||||
filter.setEnablePositionIncrements(true);
|
|
||||||
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
|
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -89,7 +89,7 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
Tokenizer tokenizer = new JapaneseTokenizer(reader, userDict, true, mode);
|
Tokenizer tokenizer = new JapaneseTokenizer(reader, userDict, true, mode);
|
||||||
TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
|
TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
|
||||||
stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
|
stream = new JapanesePartOfSpeechStopFilter(matchVersion, stream, stoptags);
|
||||||
stream = new CJKWidthFilter(stream);
|
stream = new CJKWidthFilter(stream);
|
||||||
stream = new StopFilter(matchVersion, stream, stopwords);
|
stream = new StopFilter(matchVersion, stream, stopwords);
|
||||||
stream = new JapaneseKatakanaStemFilter(stream);
|
stream = new JapaneseKatakanaStemFilter(stream);
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.util.Set;
|
||||||
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
|
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
|
||||||
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes tokens that match a set of part-of-speech tags.
|
* Removes tokens that match a set of part-of-speech tags.
|
||||||
|
@ -30,8 +31,21 @@ public final class JapanesePartOfSpeechStopFilter extends FilteringTokenFilter {
|
||||||
private final Set<String> stopTags;
|
private final Set<String> stopTags;
|
||||||
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
|
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
|
||||||
|
|
||||||
public JapanesePartOfSpeechStopFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
|
/** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
|
||||||
super(enablePositionIncrements, input);
|
@Deprecated
|
||||||
|
public JapanesePartOfSpeechStopFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
|
||||||
|
super(version, enablePositionIncrements, input);
|
||||||
|
this.stopTags = stopTags;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new {@link JapanesePartOfSpeechStopFilter}.
|
||||||
|
* @param version the Lucene match version
|
||||||
|
* @param input the {@link TokenStream} to consume
|
||||||
|
* @param stopTags the part-of-speech tags that should be removed
|
||||||
|
*/
|
||||||
|
public JapanesePartOfSpeechStopFilter(Version version, TokenStream input, Set<String> stopTags) {
|
||||||
|
super(version, input);
|
||||||
this.stopTags = stopTags;
|
this.stopTags = stopTags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -50,7 +50,7 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
|
||||||
public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) {
|
public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) {
|
||||||
super(args);
|
super(args);
|
||||||
stopTagFiles = get(args, "tags");
|
stopTagFiles = get(args, "tags");
|
||||||
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
|
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
@ -72,6 +72,12 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream stream) {
|
public TokenStream create(TokenStream stream) {
|
||||||
// if stoptags is null, it means the file is empty
|
// if stoptags is null, it means the file is empty
|
||||||
return stopTags == null ? stream : new JapanesePartOfSpeechStopFilter(enablePositionIncrements, stream, stopTags);
|
if (stopTags != null) {
|
||||||
|
@SuppressWarnings("deprecation")
|
||||||
|
final TokenStream filter = new JapanesePartOfSpeechStopFilter(luceneMatchVersion, enablePositionIncrements, stream, stopTags);
|
||||||
|
return filter;
|
||||||
|
} else {
|
||||||
|
return stream;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,12 +17,8 @@ package org.apache.lucene.analysis;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.FileOutputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStreamWriter;
|
|
||||||
import java.io.Writer;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||||
|
@ -43,8 +39,16 @@ import org.apache.lucene.util.automaton.Transition;
|
||||||
* @lucene.experimental */
|
* @lucene.experimental */
|
||||||
public class TokenStreamToAutomaton {
|
public class TokenStreamToAutomaton {
|
||||||
|
|
||||||
|
private boolean preservePositionIncrements;
|
||||||
|
|
||||||
/** Sole constructor. */
|
/** Sole constructor. */
|
||||||
public TokenStreamToAutomaton() {
|
public TokenStreamToAutomaton() {
|
||||||
|
this.preservePositionIncrements = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */
|
||||||
|
public void setPreservePositionIncrements(boolean enablePositionIncrements) {
|
||||||
|
this.preservePositionIncrements = enablePositionIncrements;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class Position implements RollingBuffer.Resettable {
|
private static class Position implements RollingBuffer.Resettable {
|
||||||
|
@ -108,6 +112,9 @@ public class TokenStreamToAutomaton {
|
||||||
int maxOffset = 0;
|
int maxOffset = 0;
|
||||||
while (in.incrementToken()) {
|
while (in.incrementToken()) {
|
||||||
int posInc = posIncAtt.getPositionIncrement();
|
int posInc = posIncAtt.getPositionIncrement();
|
||||||
|
if (!preservePositionIncrements && posInc > 1) {
|
||||||
|
posInc = 1;
|
||||||
|
}
|
||||||
assert pos > -1 || posInc > 0;
|
assert pos > -1 || posInc > 0;
|
||||||
|
|
||||||
if (posInc > 0) {
|
if (posInc > 0) {
|
||||||
|
|
|
@ -282,18 +282,18 @@ and proximity searches (though sentence identification is not provided by Lucene
|
||||||
<p>
|
<p>
|
||||||
If the selected analyzer filters the stop words "is" and "the", then for a document
|
If the selected analyzer filters the stop words "is" and "the", then for a document
|
||||||
containing the string "blue is the sky", only the tokens "blue", "sky" are indexed,
|
containing the string "blue is the sky", only the tokens "blue", "sky" are indexed,
|
||||||
with position("sky") = 1 + position("blue"). Now, a phrase query "blue is the sky"
|
with position("sky") = 3 + position("blue"). Now, a phrase query "blue is the sky"
|
||||||
would find that document, because the same analyzer filters the same stop words from
|
would find that document, because the same analyzer filters the same stop words from
|
||||||
that query. But also the phrase query "blue sky" would find that document.
|
that query. But the phrase query "blue sky" would not find that document because the
|
||||||
|
position increment between "blue" and "sky" is only 1.
|
||||||
</p>
|
</p>
|
||||||
<p>
|
<p>
|
||||||
If this behavior does not fit the application needs, a modified analyzer can
|
If this behavior does not fit the application needs, the query parser needs to be
|
||||||
be used, that would increment further the positions of tokens following a
|
configured to not take position increments into account when generating phrase queries.
|
||||||
removed stop word, using
|
</p>
|
||||||
{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
|
<p>
|
||||||
This can be done with something like the following (note, however, that
|
Note that a StopFilter MUST increment the position increment in order not to generate corrupt
|
||||||
StopFilter natively includes this capability by subclassing
|
tokenstream graphs. Here is the logic used by StopFilter to increment positions when filtering out tokens:
|
||||||
FilteringTokenFilter}:
|
|
||||||
</p>
|
</p>
|
||||||
<PRE class="prettyprint">
|
<PRE class="prettyprint">
|
||||||
public TokenStream tokenStream(final String fieldName, Reader reader) {
|
public TokenStream tokenStream(final String fieldName, Reader reader) {
|
||||||
|
@ -308,7 +308,7 @@ and proximity searches (though sentence identification is not provided by Lucene
|
||||||
boolean hasNext = ts.incrementToken();
|
boolean hasNext = ts.incrementToken();
|
||||||
if (hasNext) {
|
if (hasNext) {
|
||||||
if (stopWords.contains(termAtt.toString())) {
|
if (stopWords.contains(termAtt.toString())) {
|
||||||
extraIncrement++; // filter this word
|
extraIncrement += posIncrAtt.getPositionIncrement(); // filter this word
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (extraIncrement>0) {
|
if (extraIncrement>0) {
|
||||||
|
@ -322,11 +322,6 @@ and proximity searches (though sentence identification is not provided by Lucene
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
</PRE>
|
</PRE>
|
||||||
<p>
|
|
||||||
Now, with this modified analyzer, the phrase query "blue sky" would find that document.
|
|
||||||
But note that this is yet not a perfect solution, because any phrase query "blue w1 w2 sky"
|
|
||||||
where both w1 and w2 are stop words would match that document.
|
|
||||||
</p>
|
|
||||||
<p>
|
<p>
|
||||||
A few more use cases for modifying position increments are:
|
A few more use cases for modifying position increments are:
|
||||||
</p>
|
</p>
|
||||||
|
@ -338,6 +333,72 @@ and proximity searches (though sentence identification is not provided by Lucene
|
||||||
As result, all synonyms of a token would be considered to appear in exactly the
|
As result, all synonyms of a token would be considered to appear in exactly the
|
||||||
same position as that token, and so would they be seen by phrase and proximity searches.</li>
|
same position as that token, and so would they be seen by phrase and proximity searches.</li>
|
||||||
</ol>
|
</ol>
|
||||||
|
|
||||||
|
<h3>Token Position Length</h3>
|
||||||
|
<p>
|
||||||
|
By default, all tokens created by Analyzers and Tokenizers have a
|
||||||
|
{@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute#getPositionLength() position length} of one.
|
||||||
|
This means that the token occupies a single position. This attribute is not indexed
|
||||||
|
and thus not taken into account for positional queries, but is used by e.g. suggesters.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
The main use case for position lengths is multi-word synonyms. With single-word
|
||||||
|
synonyms, setting the position increment to 0 is enough to denote the fact that two
|
||||||
|
words are synonyms, for example:
|
||||||
|
</p>
|
||||||
|
<table>
|
||||||
|
<tr><td>Term</td><td>red</td><td>magenta</td></tr>
|
||||||
|
<tr><td>Position increment</td><td>1</td><td>0</td></tr>
|
||||||
|
</table>
|
||||||
|
<p>
|
||||||
|
Given that position(magenta) = 0 + position(red), they are at the same position, so anything
|
||||||
|
working with analyzers will return the exact same result if you replace "magenta" with "red"
|
||||||
|
in the input. However, multi-word synonyms are more tricky. Let's say that you want to build
|
||||||
|
a TokenStream where "IBM" is a synonym of "International Business Machines". Position increments
|
||||||
|
are not enough anymore:
|
||||||
|
</p>
|
||||||
|
<table>
|
||||||
|
<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
|
||||||
|
<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
|
||||||
|
</table>
|
||||||
|
<p>
|
||||||
|
The problem with this token stream is that "IBM" is at the same position as "International"
|
||||||
|
although it is a synonym with "International Business Machines" as a whole. Setting
|
||||||
|
the position increment of "Business" and "Machines" to 0 wouldn't help as it would mean
|
||||||
|
that "International" is a synonym of "Business". The only way to solve this issue is to
|
||||||
|
make "IBM" span across 3 positions; this is where position lengths come to the rescue.
|
||||||
|
</p>
|
||||||
|
<table>
|
||||||
|
<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
|
||||||
|
<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
|
||||||
|
<tr><td>Position length</td><td>3</td><td>1</td><td>1</td><td>1</td></tr>
|
||||||
|
</table>
|
||||||
|
<p>
|
||||||
|
This new attribute makes clear that "IBM" and "International Business Machines" start and end
|
||||||
|
at the same positions.
|
||||||
|
</p>
|
||||||
|
<a name="corrupt" />
|
||||||
|
<h3>How to not write corrupt token streams</h3>
|
||||||
|
<p>
|
||||||
|
There are a few rules to observe when writing custom Tokenizers and TokenFilters:
|
||||||
|
</p>
|
||||||
|
<ul>
|
||||||
|
<li>The first position increment must be > 0.</li>
|
||||||
|
<li>Positions must not go backward.</li>
|
||||||
|
<li>Tokens that have the same start position must have the same start offset.</li>
|
||||||
|
<li>Tokens that have the same end position (taking into account the position length) must have the same end offset.</li>
|
||||||
|
</ul>
|
||||||
|
<p>
|
||||||
|
Although these rules might seem easy to follow, problems can quickly happen when chaining
|
||||||
|
badly implemented filters that play with positions and offsets, such as synonym or n-grams
|
||||||
|
filters. Here are good practices for writing correct filters:
|
||||||
|
</p>
|
||||||
|
<ul>
|
||||||
|
<li>Token filters should not modify offsets. If you feel that your filter would need to modify offsets, then it should probably be implemented as a tokenizer.</li>
|
||||||
|
<li>Token filters should not insert positions. If a filter needs to add tokens, then they should all have a position increment of 0.</li>
|
||||||
|
<li>When they remove tokens, token filters should increment the position increment of the following token.</li>
|
||||||
|
<li>Token filters should preserve position lengths.</li>
|
||||||
|
</ul>
|
||||||
<h2>TokenStream API</h2>
|
<h2>TokenStream API</h2>
|
||||||
<p>
|
<p>
|
||||||
"Flexible Indexing" summarizes the effort of making the Lucene indexer
|
"Flexible Indexing" summarizes the effort of making the Lucene indexer
|
||||||
|
@ -382,6 +443,10 @@ and proximity searches (though sentence identification is not provided by Lucene
|
||||||
<td>{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute}</td>
|
<td>{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute}</td>
|
||||||
<td>See above for detailed information about position increment.</td>
|
<td>See above for detailed information about position increment.</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>{@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute}</td>
|
||||||
|
<td>The number of positions occupied by a token.</td>
|
||||||
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}</td>
|
<td>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}</td>
|
||||||
<td>The payload that a Token can optionally have.</td>
|
<td>The payload that a Token can optionally have.</td>
|
||||||
|
@ -532,20 +597,26 @@ public final class LengthFilter extends FilteringTokenFilter {
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Build a filter that removes words that are too long or too
|
* Create a new LengthFilter. This will filter out tokens whose
|
||||||
* short from the text.
|
* CharTermAttribute is either too short
|
||||||
|
* (< min) or too long (> max).
|
||||||
|
* @param version the Lucene match version
|
||||||
|
* @param in the TokenStream to consume
|
||||||
|
* @param min the minimum length
|
||||||
|
* @param max the maximum length
|
||||||
*/
|
*/
|
||||||
public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
|
public LengthFilter(Version version, TokenStream in, int min, int max) {
|
||||||
super(enablePositionIncrements, in);
|
super(version, in);
|
||||||
this.min = min;
|
this.min = min;
|
||||||
this.max = max;
|
this.max = max;
|
||||||
}
|
}
|
||||||
|
|
||||||
{@literal @Override}
|
{@literal @Override}
|
||||||
public boolean accept() throws IOException {
|
public boolean accept() {
|
||||||
final int len = termAtt.length();
|
final int len = termAtt.length();
|
||||||
return (len >= min && len <= max);
|
return (len >= min && len <= max);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
</pre>
|
</pre>
|
||||||
<p>
|
<p>
|
||||||
|
@ -573,66 +644,39 @@ public final class LengthFilter extends FilteringTokenFilter {
|
||||||
public abstract class FilteringTokenFilter extends TokenFilter {
|
public abstract class FilteringTokenFilter extends TokenFilter {
|
||||||
|
|
||||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
|
|
||||||
|
|
||||||
public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
|
/**
|
||||||
super(input);
|
* Create a new FilteringTokenFilter.
|
||||||
this.enablePositionIncrements = enablePositionIncrements;
|
* @param in the TokenStream to consume
|
||||||
|
*/
|
||||||
|
public FilteringTokenFilter(Version version, TokenStream in) {
|
||||||
|
super(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Override this method and return if the current input token should be returned by {@literal {@link #incrementToken}}. */
|
/** Override this method and return if the current input token should be returned by incrementToken. */
|
||||||
protected abstract boolean accept() throws IOException;
|
protected abstract boolean accept() throws IOException;
|
||||||
|
|
||||||
{@literal @Override}
|
{@literal @Override}
|
||||||
public final boolean incrementToken() throws IOException {
|
public final boolean incrementToken() throws IOException {
|
||||||
if (enablePositionIncrements) {
|
int skippedPositions = 0;
|
||||||
int skippedPositions = 0;
|
while (input.incrementToken()) {
|
||||||
while (input.incrementToken()) {
|
if (accept()) {
|
||||||
if (accept()) {
|
if (skippedPositions != 0) {
|
||||||
if (skippedPositions != 0) {
|
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
|
||||||
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
skippedPositions += posIncrAtt.getPositionIncrement();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
while (input.incrementToken()) {
|
|
||||||
if (accept()) {
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
skippedPositions += posIncrAtt.getPositionIncrement();
|
||||||
}
|
}
|
||||||
// reached EOS -- return false
|
// reached EOS -- return false
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
{@literal @Override}
|
||||||
* {@literal @see #setEnablePositionIncrements(boolean)}
|
public void reset() throws IOException {
|
||||||
*/
|
super.reset();
|
||||||
public boolean getEnablePositionIncrements() {
|
|
||||||
return enablePositionIncrements;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* If <code>true</code>, this TokenFilter will preserve
|
|
||||||
* positions of the incoming tokens (ie, accumulate and
|
|
||||||
* set position increments of the removed tokens).
|
|
||||||
* Generally, <code>true</code> is best as it does not
|
|
||||||
* lose information (positions of the original tokens)
|
|
||||||
* during indexing.
|
|
||||||
*
|
|
||||||
* <p> When set, when a token is stopped
|
|
||||||
* (omitted), the position increment of the following
|
|
||||||
* token is incremented.
|
|
||||||
*
|
|
||||||
* <p> <b>NOTE</b>: be sure to also
|
|
||||||
* set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if
|
|
||||||
* you use QueryParser to create queries.
|
|
||||||
*/
|
|
||||||
public void setEnablePositionIncrements(boolean enable) {
|
|
||||||
this.enablePositionIncrements = enable;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
</pre>
|
</pre>
|
||||||
|
|
||||||
|
|
|
@ -64,16 +64,10 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
/** Test a configuration that behaves a lot like StopAnalyzer */
|
/** Test a configuration that behaves a lot like StopAnalyzer */
|
||||||
public void testStop() throws Exception {
|
public void testStop() throws Exception {
|
||||||
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
assertAnalyzesTo(a, "the quick brown a fox",
|
assertAnalyzesTo(a, "the quick brown a fox",
|
||||||
new String[] { "quick", "brown", "fox" },
|
new String[] { "quick", "brown", "fox" },
|
||||||
new int[] { 2, 1, 2 });
|
new int[] { 2, 1, 2 });
|
||||||
|
|
||||||
// disable positions
|
|
||||||
a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, false);
|
|
||||||
assertAnalyzesTo(a, "the quick brown a fox",
|
|
||||||
new String[] { "quick", "brown", "fox" },
|
|
||||||
new int[] { 1, 1, 1 });
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Test a configuration that behaves a lot like KeepWordFilter */
|
/** Test a configuration that behaves a lot like KeepWordFilter */
|
||||||
|
@ -83,7 +77,7 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
|
||||||
BasicOperations.complement(
|
BasicOperations.complement(
|
||||||
Automaton.union(
|
Automaton.union(
|
||||||
Arrays.asList(BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar")))));
|
Arrays.asList(BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar")))));
|
||||||
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords, true);
|
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
|
||||||
assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
|
assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
|
||||||
new String[] { "foo", "bar", "bar", "foo" },
|
new String[] { "foo", "bar", "bar", "foo" },
|
||||||
new int[] { 2, 2, 1, 2 });
|
new int[] { 2, 2, 1, 2 });
|
||||||
|
@ -92,7 +86,7 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
|
||||||
/** Test a configuration that behaves a lot like LengthFilter */
|
/** Test a configuration that behaves a lot like LengthFilter */
|
||||||
public void testLength() throws Exception {
|
public void testLength() throws Exception {
|
||||||
CharacterRunAutomaton length5 = new CharacterRunAutomaton(new RegExp(".{5,}").toAutomaton());
|
CharacterRunAutomaton length5 = new CharacterRunAutomaton(new RegExp(".{5,}").toAutomaton());
|
||||||
Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, length5, true);
|
Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, length5);
|
||||||
assertAnalyzesTo(a, "ok toolong fine notfine",
|
assertAnalyzesTo(a, "ok toolong fine notfine",
|
||||||
new String[] { "ok", "fine" },
|
new String[] { "ok", "fine" },
|
||||||
new int[] { 1, 2 });
|
new int[] { 1, 2 });
|
||||||
|
|
|
@ -213,7 +213,7 @@ public class TestTermVectorsWriter extends LuceneTestCase {
|
||||||
public void testEndOffsetPositionStopFilter() throws Exception {
|
public void testEndOffsetPositionStopFilter() throws Exception {
|
||||||
Directory dir = newDirectory();
|
Directory dir = newDirectory();
|
||||||
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
|
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
|
||||||
TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
|
TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
|
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||||
customType.setStoreTermVectors(true);
|
customType.setStoreTermVectors(true);
|
||||||
|
|
|
@ -222,7 +222,7 @@ public class TestPhraseQuery extends LuceneTestCase {
|
||||||
|
|
||||||
public void testPhraseQueryWithStopAnalyzer() throws Exception {
|
public void testPhraseQueryWithStopAnalyzer() throws Exception {
|
||||||
Directory directory = newDirectory();
|
Directory directory = newDirectory();
|
||||||
Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, false);
|
Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
|
RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
|
||||||
newIndexWriterConfig( Version.LUCENE_40, stopAnalyzer));
|
newIndexWriterConfig( Version.LUCENE_40, stopAnalyzer));
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
|
@ -241,16 +241,6 @@ public class TestPhraseQuery extends LuceneTestCase {
|
||||||
assertEquals(1, hits.length);
|
assertEquals(1, hits.length);
|
||||||
QueryUtils.check(random(), query,searcher);
|
QueryUtils.check(random(), query,searcher);
|
||||||
|
|
||||||
|
|
||||||
// StopAnalyzer as of 2.4 does not leave "holes", so this matches.
|
|
||||||
query = new PhraseQuery();
|
|
||||||
query.add(new Term("field", "words"));
|
|
||||||
query.add(new Term("field", "here"));
|
|
||||||
hits = searcher.search(query, null, 1000).scoreDocs;
|
|
||||||
assertEquals(1, hits.length);
|
|
||||||
QueryUtils.check(random(), query,searcher);
|
|
||||||
|
|
||||||
|
|
||||||
reader.close();
|
reader.close();
|
||||||
directory.close();
|
directory.close();
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,7 +37,7 @@ public class TestSpanFirstQuery extends LuceneTestCase {
|
||||||
|
|
||||||
// mimic StopAnalyzer
|
// mimic StopAnalyzer
|
||||||
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
|
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
|
||||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true);
|
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
|
||||||
|
|
||||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
|
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
|
|
|
@ -60,7 +60,7 @@ public class TestSpansAdvanced extends LuceneTestCase {
|
||||||
mDirectory = newDirectory();
|
mDirectory = newDirectory();
|
||||||
final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
|
final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
|
||||||
newIndexWriterConfig(TEST_VERSION_CURRENT,
|
newIndexWriterConfig(TEST_VERSION_CURRENT,
|
||||||
new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))
|
new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))
|
||||||
.setMergePolicy(newLogMergePolicy()).setSimilarity(new DefaultSimilarity()));
|
.setMergePolicy(newLogMergePolicy()).setSimilarity(new DefaultSimilarity()));
|
||||||
addDocument(writer, "1", "I think it should work.");
|
addDocument(writer, "1", "I think it should work.");
|
||||||
addDocument(writer, "2", "I think it should work.");
|
addDocument(writer, "2", "I think it should work.");
|
||||||
|
|
|
@ -49,7 +49,7 @@ public class TestSpansAdvanced2 extends TestSpansAdvanced {
|
||||||
// create test index
|
// create test index
|
||||||
final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
|
final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
|
||||||
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(),
|
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(),
|
||||||
MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))
|
MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))
|
||||||
.setOpenMode(OpenMode.APPEND).setMergePolicy(newLogMergePolicy())
|
.setOpenMode(OpenMode.APPEND).setMergePolicy(newLogMergePolicy())
|
||||||
.setSimilarity(new DefaultSimilarity()));
|
.setSimilarity(new DefaultSimilarity()));
|
||||||
addDocument(writer, "A", "Should we, could we, would we?");
|
addDocument(writer, "A", "Should we, could we, would we?");
|
||||||
|
|
|
@ -247,7 +247,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
*/
|
*/
|
||||||
private String highlightField(Query query, String fieldName, String text)
|
private String highlightField(Query query, String fieldName, String text)
|
||||||
throws IOException, InvalidTokenOffsetsException {
|
throws IOException, InvalidTokenOffsetsException {
|
||||||
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)
|
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)
|
||||||
.tokenStream(fieldName, new StringReader(text));
|
.tokenStream(fieldName, new StringReader(text));
|
||||||
// Assuming "<B>", "</B>" used to highlight
|
// Assuming "<B>", "</B>" used to highlight
|
||||||
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
|
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
|
||||||
|
@ -1308,7 +1308,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testMaxSizeHighlight() throws Exception {
|
public void testMaxSizeHighlight() throws Exception {
|
||||||
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
// we disable MockTokenizer checks because we will forcefully limit the
|
// we disable MockTokenizer checks because we will forcefully limit the
|
||||||
// tokenstream and call end() before incrementToken() returns false.
|
// tokenstream and call end() before incrementToken() returns false.
|
||||||
analyzer.setEnableChecks(false);
|
analyzer.setEnableChecks(false);
|
||||||
|
@ -1343,7 +1343,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("stoppedtoken"));
|
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("stoppedtoken"));
|
||||||
// we disable MockTokenizer checks because we will forcefully limit the
|
// we disable MockTokenizer checks because we will forcefully limit the
|
||||||
// tokenstream and call end() before incrementToken() returns false.
|
// tokenstream and call end() before incrementToken() returns false.
|
||||||
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true);
|
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
|
||||||
analyzer.setEnableChecks(false);
|
analyzer.setEnableChecks(false);
|
||||||
TermQuery query = new TermQuery(new Term("data", goodWord));
|
TermQuery query = new TermQuery(new Term("data", goodWord));
|
||||||
|
|
||||||
|
@ -1394,7 +1394,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
Highlighter hg = getHighlighter(query, "text", fm);
|
Highlighter hg = getHighlighter(query, "text", fm);
|
||||||
hg.setTextFragmenter(new NullFragmenter());
|
hg.setTextFragmenter(new NullFragmenter());
|
||||||
hg.setMaxDocCharsToAnalyze(36);
|
hg.setMaxDocCharsToAnalyze(36);
|
||||||
String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true), "text", text);
|
String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords), "text", text);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
"Matched text should contain remainder of text after highlighted query ",
|
"Matched text should contain remainder of text after highlighted query ",
|
||||||
match.endsWith("in it"));
|
match.endsWith("in it"));
|
||||||
|
@ -1411,7 +1411,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
numHighlights = 0;
|
numHighlights = 0;
|
||||||
// test to show how rewritten query can still be used
|
// test to show how rewritten query can still be used
|
||||||
searcher = newSearcher(reader);
|
searcher = newSearcher(reader);
|
||||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
|
|
||||||
BooleanQuery query = new BooleanQuery();
|
BooleanQuery query = new BooleanQuery();
|
||||||
query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD);
|
query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD);
|
||||||
|
@ -1875,11 +1875,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
||||||
super.setUp();
|
super.setUp();
|
||||||
|
|
||||||
a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
|
a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
|
||||||
analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
dir = newDirectory();
|
dir = newDirectory();
|
||||||
ramDir = newDirectory();
|
ramDir = newDirectory();
|
||||||
IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(
|
IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(
|
||||||
TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
|
TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
|
||||||
for (String text : texts) {
|
for (String text : texts) {
|
||||||
addDoc(writer, text);
|
addDoc(writer, text);
|
||||||
}
|
}
|
||||||
|
|
|
@ -89,7 +89,7 @@ public class HighlightCustomQueryTest extends LuceneTestCase {
|
||||||
private String highlightField(Query query, String fieldName,
|
private String highlightField(Query query, String fieldName,
|
||||||
String text) throws IOException, InvalidTokenOffsetsException {
|
String text) throws IOException, InvalidTokenOffsetsException {
|
||||||
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE,
|
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE,
|
||||||
true, MockTokenFilter.ENGLISH_STOPSET, true).tokenStream(fieldName,
|
true, MockTokenFilter.ENGLISH_STOPSET).tokenStream(fieldName,
|
||||||
new StringReader(text));
|
new StringReader(text));
|
||||||
// Assuming "<B>", "</B>" used to highlight
|
// Assuming "<B>", "</B>" used to highlight
|
||||||
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
|
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
|
||||||
|
|
|
@ -247,7 +247,7 @@ public class FastVectorHighlighterTest extends LuceneTestCase {
|
||||||
|
|
||||||
public void testCommonTermsQueryHighlightTest() throws IOException {
|
public void testCommonTermsQueryHighlightTest() throws IOException {
|
||||||
Directory dir = newDirectory();
|
Directory dir = newDirectory();
|
||||||
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
|
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
|
||||||
FieldType type = new FieldType(TextField.TYPE_STORED);
|
FieldType type = new FieldType(TextField.TYPE_STORED);
|
||||||
type.setStoreTermVectorOffsets(true);
|
type.setStoreTermVectorOffsets(true);
|
||||||
type.setStoreTermVectorPositions(true);
|
type.setStoreTermVectorPositions(true);
|
||||||
|
|
|
@ -259,7 +259,7 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
|
||||||
private Analyzer randomAnalyzer() {
|
private Analyzer randomAnalyzer() {
|
||||||
switch(random().nextInt(4)) {
|
switch(random().nextInt(4)) {
|
||||||
case 0: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
case 0: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
|
||||||
case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
case 2: return new Analyzer() {
|
case 2: return new Analyzer() {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
|
|
@ -546,7 +546,7 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
|
||||||
|
|
||||||
public void testBoost() throws Exception {
|
public void testBoost() throws Exception {
|
||||||
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
|
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
|
||||||
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true);
|
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
|
||||||
|
|
||||||
PrecedenceQueryParser qp = new PrecedenceQueryParser();
|
PrecedenceQueryParser qp = new PrecedenceQueryParser();
|
||||||
qp.setAnalyzer(oneStopAnalyzer);
|
qp.setAnalyzer(oneStopAnalyzer);
|
||||||
|
@ -561,7 +561,7 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
|
||||||
q = qp.parse("\"on\"^1.0", "field");
|
q = qp.parse("\"on\"^1.0", "field");
|
||||||
assertNotNull(q);
|
assertNotNull(q);
|
||||||
|
|
||||||
q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)).parse("the^3",
|
q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)).parse("the^3",
|
||||||
"field");
|
"field");
|
||||||
assertNotNull(q);
|
assertNotNull(q);
|
||||||
}
|
}
|
||||||
|
|
|
@ -946,7 +946,7 @@ public class TestQPHelper extends LuceneTestCase {
|
||||||
|
|
||||||
public void testBoost() throws Exception {
|
public void testBoost() throws Exception {
|
||||||
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
|
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
|
||||||
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true);
|
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
|
||||||
StandardQueryParser qp = new StandardQueryParser();
|
StandardQueryParser qp = new StandardQueryParser();
|
||||||
qp.setAnalyzer(oneStopAnalyzer);
|
qp.setAnalyzer(oneStopAnalyzer);
|
||||||
|
|
||||||
|
@ -962,7 +962,7 @@ public class TestQPHelper extends LuceneTestCase {
|
||||||
assertNotNull(q);
|
assertNotNull(q);
|
||||||
|
|
||||||
StandardQueryParser qp2 = new StandardQueryParser();
|
StandardQueryParser qp2 = new StandardQueryParser();
|
||||||
qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true));
|
qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
|
||||||
|
|
||||||
q = qp2.parse("the^3", "field");
|
q = qp2.parse("the^3", "field");
|
||||||
// "the" is a stop word so the result is an empty query:
|
// "the" is a stop word so the result is an empty query:
|
||||||
|
@ -1179,7 +1179,7 @@ public class TestQPHelper extends LuceneTestCase {
|
||||||
public void testStopwords() throws Exception {
|
public void testStopwords() throws Exception {
|
||||||
StandardQueryParser qp = new StandardQueryParser();
|
StandardQueryParser qp = new StandardQueryParser();
|
||||||
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
|
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
|
||||||
qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true));
|
qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
|
||||||
|
|
||||||
Query result = qp.parse("a:the OR a:foo", "a");
|
Query result = qp.parse("a:the OR a:foo", "a");
|
||||||
assertNotNull("result is null and it shouldn't be", result);
|
assertNotNull("result is null and it shouldn't be", result);
|
||||||
|
@ -1203,7 +1203,7 @@ public class TestQPHelper extends LuceneTestCase {
|
||||||
public void testPositionIncrement() throws Exception {
|
public void testPositionIncrement() throws Exception {
|
||||||
StandardQueryParser qp = new StandardQueryParser();
|
StandardQueryParser qp = new StandardQueryParser();
|
||||||
qp.setAnalyzer(
|
qp.setAnalyzer(
|
||||||
new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true));
|
new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
|
||||||
|
|
||||||
qp.setEnablePositionIncrements(true);
|
qp.setEnablePositionIncrements(true);
|
||||||
|
|
||||||
|
|
|
@ -852,7 +852,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
||||||
public void testBoost()
|
public void testBoost()
|
||||||
throws Exception {
|
throws Exception {
|
||||||
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
|
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
|
||||||
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true);
|
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
|
||||||
CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);
|
CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);
|
||||||
Query q = getQuery("on^1.0",qp);
|
Query q = getQuery("on^1.0",qp);
|
||||||
assertNotNull(q);
|
assertNotNull(q);
|
||||||
|
@ -865,7 +865,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
||||||
q = getQuery("\"on\"^1.0",qp);
|
q = getQuery("\"on\"^1.0",qp);
|
||||||
assertNotNull(q);
|
assertNotNull(q);
|
||||||
|
|
||||||
Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
CommonQueryParserConfiguration qp2 = getParserConfig(a2);
|
CommonQueryParserConfiguration qp2 = getParserConfig(a2);
|
||||||
q = getQuery("the^3", qp2);
|
q = getQuery("the^3", qp2);
|
||||||
// "the" is a stop word so the result is an empty query:
|
// "the" is a stop word so the result is an empty query:
|
||||||
|
@ -1007,7 +1007,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
||||||
|
|
||||||
public void testStopwords() throws Exception {
|
public void testStopwords() throws Exception {
|
||||||
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
|
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
|
||||||
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true));
|
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
|
||||||
Query result = getQuery("field:the OR field:foo",qp);
|
Query result = getQuery("field:the OR field:foo",qp);
|
||||||
assertNotNull("result is null and it shouldn't be", result);
|
assertNotNull("result is null and it shouldn't be", result);
|
||||||
assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
|
assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
|
||||||
|
@ -1023,7 +1023,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testPositionIncrement() throws Exception {
|
public void testPositionIncrement() throws Exception {
|
||||||
CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true));
|
CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
|
||||||
qp.setEnablePositionIncrements(true);
|
qp.setEnablePositionIncrements(true);
|
||||||
String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
|
String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
|
||||||
// 0 2 5 7 8
|
// 0 2 5 7 8
|
||||||
|
@ -1070,7 +1070,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
||||||
// "match"
|
// "match"
|
||||||
public void testPositionIncrements() throws Exception {
|
public void testPositionIncrements() throws Exception {
|
||||||
Directory dir = newDirectory();
|
Directory dir = newDirectory();
|
||||||
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, a));
|
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, a));
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
doc.add(newTextField("field", "the wizard of ozzy", Field.Store.NO));
|
doc.add(newTextField("field", "the wizard of ozzy", Field.Store.NO));
|
||||||
|
@ -1185,7 +1185,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testPhraseQueryToString() throws Exception {
|
public void testPhraseQueryToString() throws Exception {
|
||||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
CommonQueryParserConfiguration qp = getParserConfig(analyzer);
|
CommonQueryParserConfiguration qp = getParserConfig(analyzer);
|
||||||
qp.setEnablePositionIncrements(true);
|
qp.setEnablePositionIncrements(true);
|
||||||
PhraseQuery q = (PhraseQuery)getQuery("\"this hi this is a test is\"", qp);
|
PhraseQuery q = (PhraseQuery)getQuery("\"this hi this is a test is\"", qp);
|
||||||
|
@ -1235,26 +1235,13 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
||||||
CharacterRunAutomaton stopStopList =
|
CharacterRunAutomaton stopStopList =
|
||||||
new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
|
new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
|
||||||
|
|
||||||
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList, false));
|
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
|
||||||
|
|
||||||
PhraseQuery phraseQuery = new PhraseQuery();
|
|
||||||
phraseQuery.add(new Term("field", "1"));
|
|
||||||
phraseQuery.add(new Term("field", "2"));
|
|
||||||
|
|
||||||
assertEquals(phraseQuery, getQuery("\"1 2\"",qp));
|
|
||||||
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
|
|
||||||
|
|
||||||
qp.setEnablePositionIncrements(true);
|
|
||||||
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
|
|
||||||
|
|
||||||
qp.setEnablePositionIncrements(false);
|
|
||||||
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
|
|
||||||
|
|
||||||
qp = getParserConfig(
|
qp = getParserConfig(
|
||||||
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList, true));
|
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
|
||||||
qp.setEnablePositionIncrements(true);
|
qp.setEnablePositionIncrements(true);
|
||||||
|
|
||||||
phraseQuery = new PhraseQuery();
|
PhraseQuery phraseQuery = new PhraseQuery();
|
||||||
phraseQuery.add(new Term("field", "1"));
|
phraseQuery.add(new Term("field", "1"));
|
||||||
phraseQuery.add(new Term("field", "2"), 2);
|
phraseQuery.add(new Term("field", "2"), 2);
|
||||||
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
|
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
|
||||||
|
|
|
@ -58,7 +58,7 @@ public class TestParser extends LuceneTestCase {
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public static void beforeClass() throws Exception {
|
public static void beforeClass() throws Exception {
|
||||||
// TODO: rewrite test (this needs to set QueryParser.enablePositionIncrements, too, for work with CURRENT):
|
// TODO: rewrite test (this needs to set QueryParser.enablePositionIncrements, too, for work with CURRENT):
|
||||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
|
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
//initialize the parser
|
//initialize the parser
|
||||||
builder = new CorePlusExtensionsParser("contents", analyzer);
|
builder = new CorePlusExtensionsParser("contents", analyzer);
|
||||||
|
|
||||||
|
|
|
@ -75,9 +75,9 @@ import org.apache.lucene.util.fst.Util;
|
||||||
* example, if you use an analyzer removing stop words,
|
* example, if you use an analyzer removing stop words,
|
||||||
* then the partial text "ghost chr..." could see the
|
* then the partial text "ghost chr..." could see the
|
||||||
* suggestion "The Ghost of Christmas Past". Note that
|
* suggestion "The Ghost of Christmas Past". Note that
|
||||||
* your {@code StopFilter} instance must NOT preserve
|
* position increments MUST NOT be preserved for this example
|
||||||
* position increments for this example to work, so you should call
|
* to work, so you should call
|
||||||
* {@code setEnablePositionIncrements(false)} on it.
|
* {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* If SynonymFilter is used to map wifi and wireless network to
|
* If SynonymFilter is used to map wifi and wireless network to
|
||||||
|
@ -185,6 +185,9 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
|
|
||||||
private static final int PAYLOAD_SEP = '\u001f';
|
private static final int PAYLOAD_SEP = '\u001f';
|
||||||
|
|
||||||
|
/** Whether position holes should appear in the automaton. */
|
||||||
|
private boolean preservePositionIncrements;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
|
* Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
|
||||||
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
|
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
|
||||||
|
@ -241,6 +244,13 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
throw new IllegalArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
|
throw new IllegalArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
|
||||||
}
|
}
|
||||||
this.maxGraphExpansions = maxGraphExpansions;
|
this.maxGraphExpansions = maxGraphExpansions;
|
||||||
|
preservePositionIncrements = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Whether to take position holes (position increment > 1) into account when
|
||||||
|
* building the automaton, <code>true</code> by default. */
|
||||||
|
public void setPreservePositionIncrements(boolean preservePositionIncrements) {
|
||||||
|
this.preservePositionIncrements = preservePositionIncrements;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns byte size of the underlying FST. */
|
/** Returns byte size of the underlying FST. */
|
||||||
|
@ -327,13 +337,16 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
}
|
}
|
||||||
|
|
||||||
TokenStreamToAutomaton getTokenStreamToAutomaton() {
|
TokenStreamToAutomaton getTokenStreamToAutomaton() {
|
||||||
|
final TokenStreamToAutomaton tsta;
|
||||||
if (preserveSep) {
|
if (preserveSep) {
|
||||||
return new EscapingTokenStreamToAutomaton();
|
tsta = new EscapingTokenStreamToAutomaton();
|
||||||
} else {
|
} else {
|
||||||
// When we're not preserving sep, we don't steal 0xff
|
// When we're not preserving sep, we don't steal 0xff
|
||||||
// byte, so we don't need to do any escaping:
|
// byte, so we don't need to do any escaping:
|
||||||
return new TokenStreamToAutomaton();
|
tsta = new TokenStreamToAutomaton();
|
||||||
}
|
}
|
||||||
|
tsta.setPreservePositionIncrements(preservePositionIncrements);
|
||||||
|
return tsta;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class AnalyzingComparator implements Comparator<BytesRef> {
|
private static class AnalyzingComparator implements Comparator<BytesRef> {
|
||||||
|
|
|
@ -164,8 +164,9 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
new TermFreq("the ghost of christmas past", 50),
|
new TermFreq("the ghost of christmas past", 50),
|
||||||
};
|
};
|
||||||
|
|
||||||
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
|
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
|
AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
|
||||||
|
suggester.setPreservePositionIncrements(false);
|
||||||
suggester.build(new TermFreqArrayIterator(keys));
|
suggester.build(new TermFreqArrayIterator(keys));
|
||||||
|
|
||||||
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
|
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
|
||||||
|
@ -187,7 +188,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testEmpty() throws Exception {
|
public void testEmpty() throws Exception {
|
||||||
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
|
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
|
AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
|
||||||
suggester.build(new TermFreqArrayIterator(new TermFreq[0]));
|
suggester.build(new TermFreqArrayIterator(new TermFreq[0]));
|
||||||
|
|
||||||
|
|
|
@ -153,8 +153,9 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
||||||
new TermFreq("the ghost of christmas past", 50),
|
new TermFreq("the ghost of christmas past", 50),
|
||||||
};
|
};
|
||||||
|
|
||||||
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
|
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
|
||||||
FuzzySuggester suggester = new FuzzySuggester(standard);
|
FuzzySuggester suggester = new FuzzySuggester(standard);
|
||||||
|
suggester.setPreservePositionIncrements(false);
|
||||||
suggester.build(new TermFreqArrayIterator(keys));
|
suggester.build(new TermFreqArrayIterator(keys));
|
||||||
|
|
||||||
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
|
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
|
||||||
|
|
|
@ -17,7 +17,6 @@ package org.apache.lucene.analysis;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -46,7 +45,6 @@ public final class MockAnalyzer extends Analyzer {
|
||||||
private final CharacterRunAutomaton runAutomaton;
|
private final CharacterRunAutomaton runAutomaton;
|
||||||
private final boolean lowerCase;
|
private final boolean lowerCase;
|
||||||
private final CharacterRunAutomaton filter;
|
private final CharacterRunAutomaton filter;
|
||||||
private final boolean enablePositionIncrements;
|
|
||||||
private int positionIncrementGap;
|
private int positionIncrementGap;
|
||||||
private final Random random;
|
private final Random random;
|
||||||
private Map<String,Integer> previousMappings = new HashMap<String,Integer>();
|
private Map<String,Integer> previousMappings = new HashMap<String,Integer>();
|
||||||
|
@ -60,30 +58,28 @@ public final class MockAnalyzer extends Analyzer {
|
||||||
* @param runAutomaton DFA describing how tokenization should happen (e.g. [a-zA-Z]+)
|
* @param runAutomaton DFA describing how tokenization should happen (e.g. [a-zA-Z]+)
|
||||||
* @param lowerCase true if the tokenizer should lowercase terms
|
* @param lowerCase true if the tokenizer should lowercase terms
|
||||||
* @param filter DFA describing how terms should be filtered (set of stopwords, etc)
|
* @param filter DFA describing how terms should be filtered (set of stopwords, etc)
|
||||||
* @param enablePositionIncrements true if position increments should reflect filtered terms.
|
|
||||||
*/
|
*/
|
||||||
public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
|
public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter) {
|
||||||
super(new PerFieldReuseStrategy());
|
super(new PerFieldReuseStrategy());
|
||||||
// TODO: this should be solved in a different way; Random should not be shared (!).
|
// TODO: this should be solved in a different way; Random should not be shared (!).
|
||||||
this.random = new Random(random.nextLong());
|
this.random = new Random(random.nextLong());
|
||||||
this.runAutomaton = runAutomaton;
|
this.runAutomaton = runAutomaton;
|
||||||
this.lowerCase = lowerCase;
|
this.lowerCase = lowerCase;
|
||||||
this.filter = filter;
|
this.filter = filter;
|
||||||
this.enablePositionIncrements = enablePositionIncrements;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean)
|
* Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton)
|
||||||
* MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false}).
|
* MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET)}.
|
||||||
*/
|
*/
|
||||||
public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
|
public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
|
||||||
this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true);
|
this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a Whitespace-lowercasing analyzer with no stopwords removal.
|
* Create a Whitespace-lowercasing analyzer with no stopwords removal.
|
||||||
* <p>
|
* <p>
|
||||||
* Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean)
|
* Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton)
|
||||||
* MockAnalyzer(random, MockTokenizer.WHITESPACE, true, MockTokenFilter.EMPTY_STOPSET, false}).
|
* MockAnalyzer(random, MockTokenizer.WHITESPACE, true, MockTokenFilter.EMPTY_STOPSET)}.
|
||||||
*/
|
*/
|
||||||
public MockAnalyzer(Random random) {
|
public MockAnalyzer(Random random) {
|
||||||
|
@ -95,7 +91,6 @@ public final class MockAnalyzer extends Analyzer {
|
||||||
MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength);
|
MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength);
|
||||||
tokenizer.setEnableChecks(enableChecks);
|
tokenizer.setEnableChecks(enableChecks);
|
||||||
MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
|
MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
|
||||||
filt.setEnablePositionIncrements(enablePositionIncrements);
|
|
||||||
return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
|
return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -55,7 +55,6 @@ public final class MockTokenFilter extends TokenFilter {
|
||||||
makeString("with"))));
|
makeString("with"))));
|
||||||
|
|
||||||
private final CharacterRunAutomaton filter;
|
private final CharacterRunAutomaton filter;
|
||||||
private boolean enablePositionIncrements = true;
|
|
||||||
|
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
@ -80,9 +79,7 @@ public final class MockTokenFilter extends TokenFilter {
|
||||||
int skippedPositions = 0;
|
int skippedPositions = 0;
|
||||||
while (input.incrementToken()) {
|
while (input.incrementToken()) {
|
||||||
if (!filter.run(termAtt.buffer(), 0, termAtt.length())) {
|
if (!filter.run(termAtt.buffer(), 0, termAtt.length())) {
|
||||||
if (enablePositionIncrements) {
|
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
|
||||||
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
skippedPositions += posIncrAtt.getPositionIncrement();
|
skippedPositions += posIncrAtt.getPositionIncrement();
|
||||||
|
@ -90,20 +87,4 @@ public final class MockTokenFilter extends TokenFilter {
|
||||||
// reached EOS -- return false
|
// reached EOS -- return false
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @see #setEnablePositionIncrements(boolean)
|
|
||||||
*/
|
|
||||||
public boolean getEnablePositionIncrements() {
|
|
||||||
return enablePositionIncrements;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* If <code>true</code>, this Filter will preserve
|
|
||||||
* positions of the incoming tokens (ie, accumulate and
|
|
||||||
* set position increments of the removed stop tokens).
|
|
||||||
*/
|
|
||||||
public void setEnablePositionIncrements(boolean enable) {
|
|
||||||
this.enablePositionIncrements = enable;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -59,7 +59,7 @@ public abstract class SearchEquivalenceTestBase extends LuceneTestCase {
|
||||||
directory = newDirectory();
|
directory = newDirectory();
|
||||||
stopword = "" + randomChar();
|
stopword = "" + randomChar();
|
||||||
CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.makeString(stopword));
|
CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.makeString(stopword));
|
||||||
analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset, true);
|
analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset);
|
||||||
RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer);
|
RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer);
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
Field id = new StringField("id", "", Field.Store.NO);
|
Field id = new StringField("id", "", Field.Store.NO);
|
||||||
|
|
|
@ -87,8 +87,8 @@ public class DisMaxRequestHandlerTest extends SolrTestCaseJ4 {
|
||||||
req("cool stuff")
|
req("cool stuff")
|
||||||
,"//*[@numFound='3']"
|
,"//*[@numFound='3']"
|
||||||
,"//result/doc[1]/int[@name='id'][.='42']"
|
,"//result/doc[1]/int[@name='id'][.='42']"
|
||||||
,"//result/doc[2]/int[@name='id'][.='666']"
|
,"//result/doc[2]/int[@name='id'][.='8675309']"
|
||||||
,"//result/doc[3]/int[@name='id'][.='8675309']"
|
,"//result/doc[3]/int[@name='id'][.='666']"
|
||||||
);
|
);
|
||||||
|
|
||||||
assertQ("multi qf",
|
assertQ("multi qf",
|
||||||
|
|
|
@ -323,16 +323,16 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
|
||||||
tokenList = valueResult.get("org.apache.lucene.analysis.core.StopFilter");
|
tokenList = valueResult.get("org.apache.lucene.analysis.core.StopFilter");
|
||||||
assertNotNull("Expecting the 'StopFilter' to be applied on the index for the 'text' field", tokenList);
|
assertNotNull("Expecting the 'StopFilter' to be applied on the index for the 'text' field", tokenList);
|
||||||
assertEquals("Expecting 4 tokens after stop word removal", 4, tokenList.size());
|
assertEquals("Expecting 4 tokens after stop word removal", 4, tokenList.size());
|
||||||
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, new int[]{2,2,2,1}, null, false));
|
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2}, null, false));
|
||||||
assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 2, new int[]{3,3,3,2}, null, false));
|
assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3,3}, null, false));
|
||||||
assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, new int[]{4,4,4,3}, null, false));
|
assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4,4}, null, false));
|
||||||
assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 4, new int[]{6,6,6,4}, null, false));
|
assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6,6}, null, false));
|
||||||
tokenList = valueResult.get("org.apache.lucene.analysis.en.PorterStemFilter");
|
tokenList = valueResult.get("org.apache.lucene.analysis.en.PorterStemFilter");
|
||||||
assertNotNull("Expecting the 'PorterStemFilter' to be applied on the index for the 'text' field", tokenList);
|
assertNotNull("Expecting the 'PorterStemFilter' to be applied on the index for the 'text' field", tokenList);
|
||||||
assertEquals("Expecting 4 tokens", 4, tokenList.size());
|
assertEquals("Expecting 4 tokens", 4, tokenList.size());
|
||||||
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, new int[]{2,2,2,1,1}, null, false));
|
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2,2}, null, false));
|
||||||
assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 2, new int[]{3,3,3,2,2}, null, true));
|
assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3,3,3}, null, true));
|
||||||
assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, new int[]{4,4,4,3,3}, null, false));
|
assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4,4,4}, null, false));
|
||||||
assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 4, new int[]{6,6,6,4,4}, null, false));
|
assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6,6,6}, null, false));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -178,25 +178,25 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
|
||||||
tokenList = indexPart.get("org.apache.lucene.analysis.core.StopFilter");
|
tokenList = indexPart.get("org.apache.lucene.analysis.core.StopFilter");
|
||||||
assertNotNull("Expcting StopFilter analysis breakdown", tokenList);
|
assertNotNull("Expcting StopFilter analysis breakdown", tokenList);
|
||||||
assertEquals(tokenList.size(), 8);
|
assertEquals(tokenList.size(), 8);
|
||||||
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 1, new int[]{2,2,2,1}, null, false));
|
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2}, null, false));
|
||||||
assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 2, new int[]{3,3,3,2}, null, false));
|
assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 3, new int[]{3,3,3,3}, null, false));
|
||||||
assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 3, new int[]{4,4,4,3}, null, true));
|
assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 4, new int[]{4,4,4,4}, null, true));
|
||||||
assertToken(tokenList.get(3), new TokenInfo("jumped", null, "<ALPHANUM>", 18, 24, 4, new int[]{5,5,5,4}, null, false));
|
assertToken(tokenList.get(3), new TokenInfo("jumped", null, "<ALPHANUM>", 18, 24, 5, new int[]{5,5,5,5}, null, false));
|
||||||
assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 5, new int[]{6,6,6,5}, null, false));
|
assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 6, new int[]{6,6,6,6}, null, false));
|
||||||
assertToken(tokenList.get(5), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 6, new int[]{8,8,8,6}, null, false));
|
assertToken(tokenList.get(5), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8,8}, null, false));
|
||||||
assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 7, new int[]{9,9,9,7}, null, true));
|
assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9,9}, null, true));
|
||||||
assertToken(tokenList.get(7), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 8, new int[]{10,10,10,8}, null, false));
|
assertToken(tokenList.get(7), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10,10}, null, false));
|
||||||
tokenList = indexPart.get("org.apache.lucene.analysis.en.PorterStemFilter");
|
tokenList = indexPart.get("org.apache.lucene.analysis.en.PorterStemFilter");
|
||||||
assertNotNull("Expcting PorterStemFilter analysis breakdown", tokenList);
|
assertNotNull("Expcting PorterStemFilter analysis breakdown", tokenList);
|
||||||
assertEquals(tokenList.size(), 8);
|
assertEquals(tokenList.size(), 8);
|
||||||
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 1, new int[]{2,2,2,1,1}, null, false));
|
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2,2}, null, false));
|
||||||
assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 2, new int[]{3,3,3,2,2}, null, false));
|
assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 3, new int[]{3,3,3,3,3}, null, false));
|
||||||
assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 3, new int[]{4,4,4,3,3}, null, true));
|
assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 4, new int[]{4,4,4,4,4}, null, true));
|
||||||
assertToken(tokenList.get(3), new TokenInfo("jump", null, "<ALPHANUM>", 18, 24, 4, new int[]{5,5,5,4,4}, null, false));
|
assertToken(tokenList.get(3), new TokenInfo("jump", null, "<ALPHANUM>", 18, 24, 5, new int[]{5,5,5,5,5}, null, false));
|
||||||
assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 5, new int[]{6,6,6,5,5}, null, false));
|
assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 6, new int[]{6,6,6,6,6}, null, false));
|
||||||
assertToken(tokenList.get(5), new TokenInfo("lazi", null, "<ALPHANUM>", 34, 38, 6, new int[]{8,8,8,6,6}, null, false));
|
assertToken(tokenList.get(5), new TokenInfo("lazi", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8,8,8}, null, false));
|
||||||
assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 7, new int[]{9,9,9,7,7}, null, true));
|
assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9,9,9}, null, true));
|
||||||
assertToken(tokenList.get(7), new TokenInfo("dog", null, "<ALPHANUM>", 45, 49, 8, new int[]{10,10,10,8,8}, null, false));
|
assertToken(tokenList.get(7), new TokenInfo("dog", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10,10,10}, null, false));
|
||||||
|
|
||||||
NamedList<List<NamedList>> queryPart = textType.get("query");
|
NamedList<List<NamedList>> queryPart = textType.get("query");
|
||||||
assertNotNull("expecting a query token analysis for field type 'text'", queryPart);
|
assertNotNull("expecting a query token analysis for field type 'text'", queryPart);
|
||||||
|
|
|
@ -201,12 +201,12 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
|
||||||
public void testOptions() throws Exception {
|
public void testOptions() throws Exception {
|
||||||
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
|
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
|
||||||
, TermVectorParams.TF, "true", TermVectorParams.DF, "true", TermVectorParams.OFFSETS, "true", TermVectorParams.POSITIONS, "true", TermVectorParams.TF_IDF, "true")
|
, TermVectorParams.TF, "true", TermVectorParams.DF, "true", TermVectorParams.OFFSETS, "true", TermVectorParams.POSITIONS, "true", TermVectorParams.TF_IDF, "true")
|
||||||
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}"
|
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
|
||||||
);
|
);
|
||||||
|
|
||||||
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
|
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
|
||||||
, TermVectorParams.ALL, "true")
|
, TermVectorParams.ALL, "true")
|
||||||
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}"
|
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
|
||||||
);
|
);
|
||||||
|
|
||||||
// test each combination at random
|
// test each combination at random
|
||||||
|
@ -214,7 +214,7 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
|
||||||
list.addAll(Arrays.asList("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"));
|
list.addAll(Arrays.asList("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"));
|
||||||
String[][] options = new String[][] { { TermVectorParams.TF, "'tf':1" },
|
String[][] options = new String[][] { { TermVectorParams.TF, "'tf':1" },
|
||||||
{ TermVectorParams.OFFSETS, "'offsets':{'start':20, 'end':27}" },
|
{ TermVectorParams.OFFSETS, "'offsets':{'start':20, 'end':27}" },
|
||||||
{ TermVectorParams.POSITIONS, "'positions':{'position':1}" },
|
{ TermVectorParams.POSITIONS, "'positions':{'position':5}" },
|
||||||
{ TermVectorParams.DF, "'df':2" },
|
{ TermVectorParams.DF, "'df':2" },
|
||||||
{ TermVectorParams.TF_IDF, "'tf-idf':0.5" } };
|
{ TermVectorParams.TF_IDF, "'tf-idf':0.5" } };
|
||||||
StringBuilder expected = new StringBuilder("/termVectors/0/test_posofftv/anoth=={");
|
StringBuilder expected = new StringBuilder("/termVectors/0/test_posofftv/anoth=={");
|
||||||
|
@ -249,7 +249,7 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
|
||||||
,"f.test_basictv." + TermVectorParams.TF_IDF, "false"
|
,"f.test_basictv." + TermVectorParams.TF_IDF, "false"
|
||||||
)
|
)
|
||||||
,"/termVectors/0/test_basictv=={'anoth':{},'titl':{}}"
|
,"/termVectors/0/test_basictv=={'anoth':{},'titl':{}}"
|
||||||
,"/termVectors/0/test_postv/anoth=={'tf':1, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}"
|
,"/termVectors/0/test_postv/anoth=={'tf':1, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
|
||||||
,"/termVectors/0/test_offtv/anoth=={'tf':1, 'df':2, 'tf-idf':0.5}"
|
,"/termVectors/0/test_offtv/anoth=={'tf':1, 'df':2, 'tf-idf':0.5}"
|
||||||
,"/termVectors/warnings=={ 'noTermVectors':['test_notv'], 'noPositions':['test_basictv', 'test_offtv'], 'noOffsets':['test_basictv', 'test_postv']}"
|
,"/termVectors/warnings=={ 'noTermVectors':['test_notv'], 'noPositions':['test_basictv', 'test_offtv'], 'noOffsets':['test_basictv', 'test_postv']}"
|
||||||
);
|
);
|
||||||
|
|
|
@ -53,7 +53,7 @@ public class TestSuggestSpellingConverter extends BaseTokenStreamTestCase {
|
||||||
TokenStream filter = new PatternReplaceFilter(tokenizer,
|
TokenStream filter = new PatternReplaceFilter(tokenizer,
|
||||||
Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true);
|
Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true);
|
||||||
filter = new LowerCaseFilter(TEST_VERSION_CURRENT, filter);
|
filter = new LowerCaseFilter(TEST_VERSION_CURRENT, filter);
|
||||||
filter = new TrimFilter(filter, false);
|
filter = new TrimFilter(TEST_VERSION_CURRENT, filter, false);
|
||||||
return new TokenStreamComponents(tokenizer, filter);
|
return new TokenStreamComponents(tokenizer, filter);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
|
@ -202,13 +202,10 @@
|
||||||
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||||
-->
|
-->
|
||||||
<!-- Case insensitive stop word removal.
|
<!-- Case insensitive stop word removal.
|
||||||
add enablePositionIncrements=true in both the index and query
|
|
||||||
analyzers to leave a 'gap' for more accurate phrase queries.
|
|
||||||
-->
|
-->
|
||||||
<filter class="solr.StopFilterFactory"
|
<filter class="solr.StopFilterFactory"
|
||||||
ignoreCase="true"
|
ignoreCase="true"
|
||||||
words="stopwords.txt"
|
words="stopwords.txt"
|
||||||
enablePositionIncrements="true"
|
|
||||||
/>
|
/>
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
@ -222,7 +219,6 @@
|
||||||
<filter class="solr.StopFilterFactory"
|
<filter class="solr.StopFilterFactory"
|
||||||
ignoreCase="true"
|
ignoreCase="true"
|
||||||
words="stopwords.txt"
|
words="stopwords.txt"
|
||||||
enablePositionIncrements="true"
|
|
||||||
/>
|
/>
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
|
|
@ -440,7 +440,7 @@
|
||||||
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
|
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
|
||||||
<analyzer type="index">
|
<analyzer type="index">
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||||
<!-- in this example, we will only use synonyms at query time
|
<!-- in this example, we will only use synonyms at query time
|
||||||
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||||
-->
|
-->
|
||||||
|
@ -448,7 +448,7 @@
|
||||||
</analyzer>
|
</analyzer>
|
||||||
<analyzer type="query">
|
<analyzer type="query">
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||||
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
|
@ -466,13 +466,10 @@
|
||||||
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||||
-->
|
-->
|
||||||
<!-- Case insensitive stop word removal.
|
<!-- Case insensitive stop word removal.
|
||||||
add enablePositionIncrements=true in both the index and query
|
|
||||||
analyzers to leave a 'gap' for more accurate phrase queries.
|
|
||||||
-->
|
-->
|
||||||
<filter class="solr.StopFilterFactory"
|
<filter class="solr.StopFilterFactory"
|
||||||
ignoreCase="true"
|
ignoreCase="true"
|
||||||
words="lang/stopwords_en.txt"
|
words="lang/stopwords_en.txt"
|
||||||
enablePositionIncrements="true"
|
|
||||||
/>
|
/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.EnglishPossessiveFilterFactory"/>
|
<filter class="solr.EnglishPossessiveFilterFactory"/>
|
||||||
|
@ -488,7 +485,6 @@
|
||||||
<filter class="solr.StopFilterFactory"
|
<filter class="solr.StopFilterFactory"
|
||||||
ignoreCase="true"
|
ignoreCase="true"
|
||||||
words="lang/stopwords_en.txt"
|
words="lang/stopwords_en.txt"
|
||||||
enablePositionIncrements="true"
|
|
||||||
/>
|
/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.EnglishPossessiveFilterFactory"/>
|
<filter class="solr.EnglishPossessiveFilterFactory"/>
|
||||||
|
@ -516,13 +512,10 @@
|
||||||
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||||
-->
|
-->
|
||||||
<!-- Case insensitive stop word removal.
|
<!-- Case insensitive stop word removal.
|
||||||
add enablePositionIncrements=true in both the index and query
|
|
||||||
analyzers to leave a 'gap' for more accurate phrase queries.
|
|
||||||
-->
|
-->
|
||||||
<filter class="solr.StopFilterFactory"
|
<filter class="solr.StopFilterFactory"
|
||||||
ignoreCase="true"
|
ignoreCase="true"
|
||||||
words="lang/stopwords_en.txt"
|
words="lang/stopwords_en.txt"
|
||||||
enablePositionIncrements="true"
|
|
||||||
/>
|
/>
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
@ -535,7 +528,6 @@
|
||||||
<filter class="solr.StopFilterFactory"
|
<filter class="solr.StopFilterFactory"
|
||||||
ignoreCase="true"
|
ignoreCase="true"
|
||||||
words="lang/stopwords_en.txt"
|
words="lang/stopwords_en.txt"
|
||||||
enablePositionIncrements="true"
|
|
||||||
/>
|
/>
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
@ -566,7 +558,7 @@
|
||||||
<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
|
<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
|
||||||
<analyzer type="index">
|
<analyzer type="index">
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
|
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
|
||||||
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
|
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
|
||||||
|
@ -574,7 +566,7 @@
|
||||||
<analyzer type="query">
|
<analyzer type="query">
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
@ -730,7 +722,7 @@
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<!-- for any non-arabic -->
|
<!-- for any non-arabic -->
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" />
|
||||||
<!-- normalizes ﻯ to ﻱ, etc -->
|
<!-- normalizes ﻯ to ﻱ, etc -->
|
||||||
<filter class="solr.ArabicNormalizationFilterFactory"/>
|
<filter class="solr.ArabicNormalizationFilterFactory"/>
|
||||||
<filter class="solr.ArabicStemFilterFactory"/>
|
<filter class="solr.ArabicStemFilterFactory"/>
|
||||||
|
@ -742,7 +734,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" />
|
||||||
<filter class="solr.BulgarianStemFilterFactory"/>
|
<filter class="solr.BulgarianStemFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
@ -754,7 +746,7 @@
|
||||||
<!-- removes l', etc -->
|
<!-- removes l', etc -->
|
||||||
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ca.txt"/>
|
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ca.txt"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" />
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Catalan"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Catalan"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
@ -776,7 +768,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" />
|
||||||
<filter class="solr.CzechStemFilterFactory"/>
|
<filter class="solr.CzechStemFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
@ -786,7 +778,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" />
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Danish"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Danish"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
@ -796,7 +788,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" />
|
||||||
<filter class="solr.GermanNormalizationFilterFactory"/>
|
<filter class="solr.GermanNormalizationFilterFactory"/>
|
||||||
<filter class="solr.GermanLightStemFilterFactory"/>
|
<filter class="solr.GermanLightStemFilterFactory"/>
|
||||||
<!-- less aggressive: <filter class="solr.GermanMinimalStemFilterFactory"/> -->
|
<!-- less aggressive: <filter class="solr.GermanMinimalStemFilterFactory"/> -->
|
||||||
|
@ -810,7 +802,7 @@
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<!-- greek specific lowercase for sigma -->
|
<!-- greek specific lowercase for sigma -->
|
||||||
<filter class="solr.GreekLowerCaseFilterFactory"/>
|
<filter class="solr.GreekLowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" />
|
||||||
<filter class="solr.GreekStemFilterFactory"/>
|
<filter class="solr.GreekStemFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
@ -820,7 +812,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" />
|
||||||
<filter class="solr.SpanishLightStemFilterFactory"/>
|
<filter class="solr.SpanishLightStemFilterFactory"/>
|
||||||
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Spanish"/> -->
|
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Spanish"/> -->
|
||||||
</analyzer>
|
</analyzer>
|
||||||
|
@ -831,7 +823,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" />
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Basque"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Basque"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
@ -845,7 +837,7 @@
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.ArabicNormalizationFilterFactory"/>
|
<filter class="solr.ArabicNormalizationFilterFactory"/>
|
||||||
<filter class="solr.PersianNormalizationFilterFactory"/>
|
<filter class="solr.PersianNormalizationFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" />
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
|
@ -854,7 +846,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" />
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Finnish"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Finnish"/>
|
||||||
<!-- less aggressive: <filter class="solr.FinnishLightStemFilterFactory"/> -->
|
<!-- less aggressive: <filter class="solr.FinnishLightStemFilterFactory"/> -->
|
||||||
</analyzer>
|
</analyzer>
|
||||||
|
@ -867,7 +859,7 @@
|
||||||
<!-- removes l', etc -->
|
<!-- removes l', etc -->
|
||||||
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/>
|
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" />
|
||||||
<filter class="solr.FrenchLightStemFilterFactory"/>
|
<filter class="solr.FrenchLightStemFilterFactory"/>
|
||||||
<!-- less aggressive: <filter class="solr.FrenchMinimalStemFilterFactory"/> -->
|
<!-- less aggressive: <filter class="solr.FrenchMinimalStemFilterFactory"/> -->
|
||||||
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="French"/> -->
|
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="French"/> -->
|
||||||
|
@ -881,9 +873,9 @@
|
||||||
<!-- removes d', etc -->
|
<!-- removes d', etc -->
|
||||||
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/>
|
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/>
|
||||||
<!-- removes n-, etc. position increments is intentionally false! -->
|
<!-- removes n-, etc. enablePositionIncrements="false" is deprecated (LUCENE-4963), so position increments are no longer disabled here -->
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" enablePositionIncrements="false"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt"/>
|
||||||
<filter class="solr.IrishLowerCaseFilterFactory"/>
|
<filter class="solr.IrishLowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt"/>
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Irish"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Irish"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
@ -893,7 +885,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" />
|
||||||
<filter class="solr.GalicianStemFilterFactory"/>
|
<filter class="solr.GalicianStemFilterFactory"/>
|
||||||
<!-- less aggressive: <filter class="solr.GalicianMinimalStemFilterFactory"/> -->
|
<!-- less aggressive: <filter class="solr.GalicianMinimalStemFilterFactory"/> -->
|
||||||
</analyzer>
|
</analyzer>
|
||||||
|
@ -908,7 +900,7 @@
|
||||||
<filter class="solr.IndicNormalizationFilterFactory"/>
|
<filter class="solr.IndicNormalizationFilterFactory"/>
|
||||||
<!-- normalizes variation in spelling -->
|
<!-- normalizes variation in spelling -->
|
||||||
<filter class="solr.HindiNormalizationFilterFactory"/>
|
<filter class="solr.HindiNormalizationFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" />
|
||||||
<filter class="solr.HindiStemFilterFactory"/>
|
<filter class="solr.HindiStemFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
@ -918,7 +910,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" />
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/>
|
||||||
<!-- less aggressive: <filter class="solr.HungarianLightStemFilterFactory"/> -->
|
<!-- less aggressive: <filter class="solr.HungarianLightStemFilterFactory"/> -->
|
||||||
</analyzer>
|
</analyzer>
|
||||||
|
@ -929,7 +921,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" />
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Armenian"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Armenian"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
@ -939,7 +931,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" />
|
||||||
<!-- for a less aggressive approach (only inflectional suffixes), set stemDerivational to false -->
|
<!-- for a less aggressive approach (only inflectional suffixes), set stemDerivational to false -->
|
||||||
<filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
|
<filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
|
@ -952,7 +944,7 @@
|
||||||
<!-- removes l', etc -->
|
<!-- removes l', etc -->
|
||||||
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_it.txt"/>
|
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_it.txt"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" />
|
||||||
<filter class="solr.ItalianLightStemFilterFactory"/>
|
<filter class="solr.ItalianLightStemFilterFactory"/>
|
||||||
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> -->
|
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> -->
|
||||||
</analyzer>
|
</analyzer>
|
||||||
|
@ -999,11 +991,11 @@
|
||||||
<!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
|
<!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
|
||||||
<filter class="solr.JapaneseBaseFormFilterFactory"/>
|
<filter class="solr.JapaneseBaseFormFilterFactory"/>
|
||||||
<!-- Removes tokens with certain part-of-speech tags -->
|
<!-- Removes tokens with certain part-of-speech tags -->
|
||||||
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
|
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" />
|
||||||
<!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
|
<!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
|
||||||
<filter class="solr.CJKWidthFilterFactory"/>
|
<filter class="solr.CJKWidthFilterFactory"/>
|
||||||
<!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
|
<!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" />
|
||||||
<!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
|
<!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
|
||||||
<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
|
<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
|
||||||
<!-- Lower-cases romaji characters -->
|
<!-- Lower-cases romaji characters -->
|
||||||
|
@ -1016,7 +1008,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" />
|
||||||
<filter class="solr.LatvianStemFilterFactory"/>
|
<filter class="solr.LatvianStemFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
@ -1026,7 +1018,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" />
|
||||||
<filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/>
|
<filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/>
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
|
@ -1037,7 +1029,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" />
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
|
||||||
<!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> -->
|
<!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> -->
|
||||||
<!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> -->
|
<!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> -->
|
||||||
|
@ -1049,7 +1041,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" />
|
||||||
<filter class="solr.PortugueseLightStemFilterFactory"/>
|
<filter class="solr.PortugueseLightStemFilterFactory"/>
|
||||||
<!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
|
<!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
|
||||||
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
|
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
|
||||||
|
@ -1062,7 +1054,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" />
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Romanian"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Romanian"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
@ -1072,7 +1064,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" />
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Russian"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Russian"/>
|
||||||
<!-- less aggressive: <filter class="solr.RussianLightStemFilterFactory"/> -->
|
<!-- less aggressive: <filter class="solr.RussianLightStemFilterFactory"/> -->
|
||||||
</analyzer>
|
</analyzer>
|
||||||
|
@ -1083,7 +1075,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" />
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Swedish"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Swedish"/>
|
||||||
<!-- less aggressive: <filter class="solr.SwedishLightStemFilterFactory"/> -->
|
<!-- less aggressive: <filter class="solr.SwedishLightStemFilterFactory"/> -->
|
||||||
</analyzer>
|
</analyzer>
|
||||||
|
@ -1095,7 +1087,7 @@
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.ThaiWordFilterFactory"/>
|
<filter class="solr.ThaiWordFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" />
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
|
@ -1104,7 +1096,7 @@
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
<filter class="solr.TurkishLowerCaseFilterFactory"/>
|
<filter class="solr.TurkishLowerCaseFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" />
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
Loading…
Reference in New Issue