LUCENE-4963: Deprecate broken TokenFilter options.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1479148 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2013-05-04 18:18:08 +00:00
parent ec7317a8d6
commit 8a7f2b6cc4
56 changed files with 480 additions and 372 deletions

View File

@ -59,6 +59,16 @@ Changes in backwards compatibility policy
completely refactored to allow for a better implementation of TimSort.
(Adrien Grand, Uwe Schindler, Dawid Weiss)
* LUCENE-4963: Some TokenFilter options that generate broken TokenStreams have
been deprecated: updateOffsets=true on TrimFilter and
enablePositionIncrements=false on all classes that inherit from
FilteringTokenFilter: JapanesePartOfSpeechStopFilter, KeepWordFilter,
LengthFilter, StopFilter and TypeTokenFilter. (Adrien Grand)
* LUCENE-4963: In order not to take position increments into account in
suggesters, you now need to call setPreservePositionIncrements(false) instead
of configuring the token filters to not increment positions. (Adrien Grand)
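For illustration, a minimal migration sketch (not part of this commit; the method name, reader
and stop set are assumed): position increments stay enabled in the analysis chain, and a
consumer that does not want the resulting holes, such as a suggester built on
TokenStreamToAutomaton, disables them on its side via setPreservePositionIncrements(false).
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.Automaton;
static Automaton suggestAutomaton(Reader reader, CharArraySet stopWords) throws IOException {
  // analysis chain: stop words are removed, position increments are kept (the 4.4 default)
  TokenStream ts = new StandardTokenizer(Version.LUCENE_44, reader);
  ts = new StopFilter(Version.LUCENE_44, ts, stopWords);
  // consumer side: ignore the holes instead of disabling increments on the filter
  TokenStreamToAutomaton t2a = new TokenStreamToAutomaton();
  t2a.setPreservePositionIncrements(false);
  return t2a.toAutomaton(ts);
}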
Bug Fixes
* LUCENE-4935: CustomScoreQuery wrongly applied its query boost twice

View File

@ -57,7 +57,7 @@ public final class StopFilter extends FilteringTokenFilter {
* @see #makeStopSet(Version, java.lang.String...)
*/
public StopFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) {
super(true, in);
super(matchVersion, in);
this.stopWords = stopWords;
}

View File

@ -51,7 +51,7 @@ public class StopFilterFactory extends TokenFilterFactory implements ResourceLoa
stopWordFiles = get(args, "words");
format = get(args, "format");
ignoreCase = getBoolean(args, "ignoreCase", false);
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}

View File

@ -17,12 +17,12 @@ package org.apache.lucene.analysis.core;
* limitations under the License.
*/
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.util.Version;
/**
* Removes tokens whose types appear in a set of blocked types from a token stream.
@ -33,14 +33,41 @@ public final class TypeTokenFilter extends FilteringTokenFilter {
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
private final boolean useWhiteList;
public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
super(enablePositionIncrements, input);
/** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
@Deprecated
public TypeTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
super(version, enablePositionIncrements, input);
this.stopTypes = stopTypes;
this.useWhiteList = useWhiteList;
}
public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) {
this(enablePositionIncrements, input, stopTypes, false);
/** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
@Deprecated
public TypeTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) {
this(version, enablePositionIncrements, input, stopTypes, false);
}
/**
* Create a new {@link TypeTokenFilter}.
* @param version the Lucene match version
* @param input the {@link TokenStream} to consume
* @param stopTypes the types to filter
* @param useWhiteList if true, then tokens whose type is in stopTypes will
* be kept, otherwise they will be filtered out
*/
public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
super(version, input);
this.stopTypes = stopTypes;
this.useWhiteList = useWhiteList;
}
/**
* Create a new {@link TypeTokenFilter} that filters tokens out
* (useWhiteList=false).
* @see #TypeTokenFilter(Version, TokenStream, Set, boolean)
*/
public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes) {
this(version, input, stopTypes, false);
}
/**

View File

@ -35,7 +35,7 @@ import java.util.Set;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.TypeTokenFilterFactory" types="stoptypes.txt"
* enablePositionIncrements="true" useWhitelist="false"/&gt;
* useWhitelist="false"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
@ -49,7 +49,7 @@ public class TypeTokenFilterFactory extends TokenFilterFactory implements Resour
public TypeTokenFilterFactory(Map<String,String> args) {
super(args);
stopTypesFiles = require(args, "types");
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
useWhitelist = getBoolean(args, "useWhitelist", false);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
@ -78,6 +78,8 @@ public class TypeTokenFilterFactory extends TokenFilterFactory implements Resour
@Override
public TokenStream create(TokenStream input) {
return new TypeTokenFilter(enablePositionIncrements, input, stopTypes, useWhitelist);
@SuppressWarnings("deprecation")
final TokenStream filter = new TypeTokenFilter(luceneMatchVersion, enablePositionIncrements, input, stopTypes, useWhitelist);
return filter;
}
}

View File

@ -138,7 +138,9 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
if (!matchVersion.onOrAfter(Version.LUCENE_44)) {
s.setEnablePositionIncrements(false);
}
result = s;
result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new IrishLowerCaseFilter(result);

View File

@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
* A TokenFilter that only keeps tokens with text contained in the
@ -32,10 +33,23 @@ public final class KeepWordFilter extends FilteringTokenFilter {
private final CharArraySet words;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/** The words set passed to this constructor will be directly used by this filter
* and should not be modified, */
public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
super(enablePositionIncrements, in);
/** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
@Deprecated
public KeepWordFilter(Version version, boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
super(version, enablePositionIncrements, in);
this.words = words;
}
/**
* Create a new {@link KeepWordFilter}.
* <p><b>NOTE</b>: The words set passed to this constructor will be directly
* used by this filter and should not be modified.
* @param version the Lucene match version
* @param in the {@link TokenStream} to consume
* @param words the words to keep
*/
public KeepWordFilter(Version version, TokenStream in, CharArraySet words) {
super(version, in);
this.words = words;
}

View File

@ -32,7 +32,7 @@ import java.io.IOException;
* &lt;fieldType name="text_keepword" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false" enablePositionIncrements="false"/&gt;
* &lt;filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
@ -48,7 +48,7 @@ public class KeepWordFilterFactory extends TokenFilterFactory implements Resourc
assureMatchVersion();
wordFiles = get(args, "words");
ignoreCase = getBoolean(args, "ignoreCase", false);
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -76,6 +76,12 @@ public class KeepWordFilterFactory extends TokenFilterFactory implements Resourc
@Override
public TokenStream create(TokenStream input) {
// if the set is null, it means it was empty
return words == null ? input : new KeepWordFilter(enablePositionIncrements, input, words);
if (words == null) {
return input;
} else {
@SuppressWarnings("deprecation")
final TokenStream filter = new KeepWordFilter(luceneMatchVersion, enablePositionIncrements, input, words);
return filter;
}
}
}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
/**
* Removes words that are too long or too short from the stream.
@ -34,12 +35,25 @@ public final class LengthFilter extends FilteringTokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
@Deprecated
public LengthFilter(Version version, boolean enablePositionIncrements, TokenStream in, int min, int max) {
super(version, enablePositionIncrements, in);
this.min = min;
this.max = max;
}
/**
* Build a filter that removes words that are too long or too
* short from the text.
* Create a new {@link LengthFilter}. This will filter out tokens whose
* {@link CharTermAttribute} is either too short ({@link CharTermAttribute#length()}
* &lt; min) or too long ({@link CharTermAttribute#length()} &gt; max).
* @param version the Lucene match version
* @param in the {@link TokenStream} to consume
* @param min the minimum length
* @param max the maximum length
*/
public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
super(enablePositionIncrements, in);
public LengthFilter(Version version, TokenStream in, int min, int max) {
super(version, in);
this.min = min;
this.max = max;
}

View File

@ -17,18 +17,18 @@ package org.apache.lucene.analysis.miscellaneous;
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.util.Map;
/**
* Factory for {@link LengthFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_lngth" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.LengthFilterFactory" min="0" max="1" enablePositionIncrements="false"/&gt;
* &lt;filter class="solr.LengthFilterFactory" min="0" max="1" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
@ -44,7 +44,7 @@ public class LengthFilterFactory extends TokenFilterFactory {
super(args);
min = requireInt(args, MIN_KEY);
max = requireInt(args, MAX_KEY);
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -52,6 +52,8 @@ public class LengthFilterFactory extends TokenFilterFactory {
@Override
public LengthFilter create(TokenStream input) {
return new LengthFilter(enablePositionIncrements, input,min,max);
@SuppressWarnings("deprecation")
final LengthFilter filter = new LengthFilter(luceneMatchVersion, enablePositionIncrements, input,min,max);
return filter;
}
}

View File

@ -21,11 +21,14 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;
import java.io.IOException;
/**
* Trims leading and trailing whitespace from Tokens in the stream.
* <p>As of Lucene 4.4, this filter does not support updateOffsets=true anymore
* as it can lead to broken token streams.
*/
public final class TrimFilter extends TokenFilter {
@ -33,12 +36,27 @@ public final class TrimFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public TrimFilter(TokenStream in, boolean updateOffsets) {
/**
* Create a new {@link TrimFilter}.
* @param version the Lucene match version
* @param in the stream to consume
* @param updateOffsets whether to update offsets
* @deprecated Offset updates are not supported anymore as of Lucene 4.4.
*/
@Deprecated
public TrimFilter(Version version, TokenStream in, boolean updateOffsets) {
super(in);
if (updateOffsets && version.onOrAfter(Version.LUCENE_44)) {
throw new IllegalArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
}
this.updateOffsets = updateOffsets;
}
/** Create a new {@link TrimFilter} on top of <code>in</code>. */
public TrimFilter(Version version, TokenStream in) {
this(version, in, false);
}
@Override
public boolean incrementToken() throws IOException {
if (!input.incrementToken()) return false;
@ -55,11 +73,10 @@ public final class TrimFilter extends TokenFilter {
int endOff = 0;
// eat the first characters
//QUESTION: Should we use Character.isWhitespace() instead?
for (start = 0; start < len && termBuffer[start] <= ' '; start++) {
for (start = 0; start < len && Character.isWhitespace(termBuffer[start]); start++) {
}
// eat the end characters
for (end = len; end >= start && termBuffer[end - 1] <= ' '; end--) {
for (end = len; end >= start && Character.isWhitespace(termBuffer[end - 1]); end--) {
endOff++;
}
if (start > 0 || end < len) {

View File

@ -29,7 +29,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;fieldType name="text_trm" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.NGramTokenizerFactory"/&gt;
* &lt;filter class="solr.TrimFilterFactory" updateOffsets="false"/&gt;
* &lt;filter class="solr.TrimFilterFactory" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
@ -50,6 +50,8 @@ public class TrimFilterFactory extends TokenFilterFactory {
@Override
public TrimFilter create(TokenStream input) {
return new TrimFilter(input, updateOffsets);
@SuppressWarnings("deprecation")
final TrimFilter filter = new TrimFilter(luceneMatchVersion, input, updateOffsets);
return filter;
}
}

View File

@ -73,7 +73,7 @@ public final class NGramTokenFilter extends TokenFilter {
* @param maxGram the largest n-gram to generate
*/
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE));
super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
this.version = version;
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");

View File

@ -22,24 +22,54 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;
/**
* Abstract base class for TokenFilters that may remove tokens.
* You have to implement {@link #accept} and return a boolean if the current
* token should be preserved. {@link #incrementToken} uses this method
* to decide if a token should be passed to the caller.
* <p><a name="version" />As of Lucene 4.4, an {@link IllegalArgumentException}
* is thrown when trying to disable position increments when filtering terms.
*/
public abstract class FilteringTokenFilter extends TokenFilter {
private static void checkPositionIncrement(Version version, boolean enablePositionIncrements) {
if (!enablePositionIncrements && version.onOrAfter(Version.LUCENE_44)) {
throw new IllegalArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams");
}
}
protected final Version version;
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
private boolean first = true; // only used when not preserving gaps
private boolean first = true;
public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
super(input);
/**
* Create a new {@link FilteringTokenFilter}.
* @param version the Lucene match <a href="#version">version</a>
* @param enablePositionIncrements whether to increment position increments when filtering out terms
* @param input the input to consume
* @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4
*/
@Deprecated
public FilteringTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input){
this(version, input);
checkPositionIncrement(version, enablePositionIncrements);
this.enablePositionIncrements = enablePositionIncrements;
}
/**
* Create a new {@link FilteringTokenFilter}.
* @param version the Lucene match version
* @param in the {@link TokenStream} to consume
*/
public FilteringTokenFilter(Version version, TokenStream in) {
super(in);
this.version = version;
this.enablePositionIncrements = true;
}
/** Override this method and return if the current input token should be returned by {@link #incrementToken}. */
protected abstract boolean accept() throws IOException;
@ -102,8 +132,11 @@ public abstract class FilteringTokenFilter extends TokenFilter {
* <p> <b>NOTE</b>: be sure to also
* set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if
* you use QueryParser to create queries.
* @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4
*/
@Deprecated
public void setEnablePositionIncrements(boolean enable) {
checkPositionIncrement(version, enable);
this.enablePositionIncrements = enable;
}
}
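As an illustration of the new API (a hypothetical filter, not part of this change), a subclass
only needs to pass the match version and implement accept(); the position increments of removed
tokens are preserved by the base class:
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.util.Version;
/** Hypothetical example: drops single-character tokens. */
public final class SingleCharDropFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  public SingleCharDropFilter(Version version, TokenStream in) {
    super(version, in); // as of 4.4, skipped positions are always accumulated onto the next token
  }
  @Override
  protected boolean accept() throws IOException {
    return termAtt.length() > 1;
  }
}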

View File

@ -161,8 +161,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
// startOffset thats > its endOffset
// (see LUCENE-3738 for a list of other offenders here)
// broken!
Lucene43NGramTokenizer.class,
// broken!
EdgeNGramTokenizer.class,
// broken!
EdgeNGramTokenFilter.class,
@ -182,55 +180,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<Constructor<?>, Predicate<Object[]>>();
static {
try {
brokenOffsetsConstructors.put(
TrimFilter.class.getConstructor(TokenStream.class, boolean.class),
new Predicate<Object[]>() {
@Override
public boolean apply(Object[] args) {
assert args.length == 2;
return (Boolean) args[1]; // args are broken if updateOffsets is true
}
});
brokenOffsetsConstructors.put(
TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class, boolean.class),
new Predicate<Object[]>() {
@Override
public boolean apply(Object[] args) {
assert args.length == 4;
// LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
return !(Boolean) args[0];
}
});
brokenOffsetsConstructors.put(
TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class),
new Predicate<Object[]>() {
@Override
public boolean apply(Object[] args) {
assert args.length == 3;
// LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
return !(Boolean) args[0];
}
});
brokenOffsetsConstructors.put(
LengthFilter.class.getConstructor(boolean.class, TokenStream.class, int.class, int.class),
new Predicate<Object[]>() {
@Override
public boolean apply(Object[] args) {
assert args.length == 4;
// LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
return !(Boolean) args[0];
}
});
brokenOffsetsConstructors.put(
KeepWordFilter.class.getConstructor(boolean.class, TokenStream.class, CharArraySet.class),
new Predicate<Object[]>() {
@Override
public boolean apply(Object[] args) {
assert args.length == 3;
// LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
return !(Boolean) args[0];
}
});
for (Class<?> c : Arrays.<Class<?>>asList(
ReversePathHierarchyTokenizer.class,
PathHierarchyTokenizer.class,

View File

@ -75,7 +75,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
doTestStopPositons(stpf,true);
// without increments
reader = new StringReader(sb.toString());
stpf = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
stpf = new StopFilter(Version.LUCENE_43, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
doTestStopPositons(stpf,false);
// with increments, concatenating two stop filters
ArrayList<String> a0 = new ArrayList<String>();
@ -166,7 +166,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filter = new MockSynonymFilter(tokenizer);
StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
StopFilter stopfilter = new StopFilter(Version.LUCENE_43, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
stopfilter.setEnablePositionIncrements(false);
return new TokenStreamComponents(tokenizer, stopfilter);
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.English;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.StringReader;
@ -36,7 +37,7 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
public void testTypeFilter() throws IOException {
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
Set<String> stopTypes = asSet("<NUM>");
TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes);
TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes);
assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"});
}
@ -59,12 +60,12 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
// with increments
StringReader reader = new StringReader(sb.toString());
TypeTokenFilter typeTokenFilter = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
TypeTokenFilter typeTokenFilter = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
testPositons(typeTokenFilter);
// without increments
reader = new StringReader(sb.toString());
typeTokenFilter = new TypeTokenFilter(false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
typeTokenFilter = new TypeTokenFilter(Version.LUCENE_43, false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
testPositons(typeTokenFilter);
}
@ -87,7 +88,7 @@ public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
public void testTypeFilterWhitelist() throws IOException {
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
Set<String> stopTypes = Collections.singleton("<NUM>");
TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes, true);
TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes, true);
assertTokenStreamContents(stream, new String[]{"121", "123"});
}

View File

@ -50,7 +50,7 @@ public class TestTypeTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testCreationWithBlackList() throws Exception {
TokenFilterFactory factory = tokenFilterFactory("Type",
"types", "stoptypes-1.txt, stoptypes-2.txt",
"enablePositionIncrements", "false");
"enablePositionIncrements", "true");
NumericTokenStream input = new NumericTokenStream();
input.setIntValue(123);
factory.create(input);
@ -59,7 +59,7 @@ public class TestTypeTokenFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testCreationWithWhiteList() throws Exception {
TokenFilterFactory factory = tokenFilterFactory("Type",
"types", "stoptypes-1.txt, stoptypes-2.txt",
"enablePositionIncrements", "false",
"enablePositionIncrements", "true",
"useWhitelist", "true");
NumericTokenStream input = new NumericTokenStream();
input.setIntValue(123);

View File

@ -61,7 +61,7 @@ public class TestIrishAnalyzer extends BaseTokenStreamTestCase {
Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "n-athair",
new String[] { "athair" },
new int[] { 1 });
new int[] { 2 });
}
/** blast some random strings through the analyzer */

View File

@ -28,6 +28,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/** Test {@link KeepWordFilter} */
public class TestKeepWordFilter extends BaseTokenStreamTestCase {
@ -42,22 +43,22 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
// Test Stopwords
TokenStream stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
stream = new KeepWordFilter(TEST_VERSION_CURRENT, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
// Now force case
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
stream = new KeepWordFilter(TEST_VERSION_CURRENT, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
// Test Stopwords
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
stream = new KeepWordFilter(Version.LUCENE_43, false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
// Now force case
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
stream = new KeepWordFilter(Version.LUCENE_43, false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
}
@ -72,7 +73,7 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream stream = new KeepWordFilter(true, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true));
TokenStream stream = new KeepWordFilter(TEST_VERSION_CURRENT, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true));
return new TokenStreamComponents(tokenizer, stream);
}
};

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
@ -29,7 +30,7 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
public void testFilterNoPosIncr() throws Exception {
TokenStream stream = new MockTokenizer(
new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
LengthFilter filter = new LengthFilter(false, stream, 2, 6);
LengthFilter filter = new LengthFilter(Version.LUCENE_43, false, stream, 2, 6);
assertTokenStreamContents(filter,
new String[]{"short", "ab", "foo"},
new int[]{1, 1, 1}
@ -39,7 +40,7 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
public void testFilterWithPosIncr() throws Exception {
TokenStream stream = new MockTokenizer(
new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
LengthFilter filter = new LengthFilter(true, stream, 2, 6);
LengthFilter filter = new LengthFilter(TEST_VERSION_CURRENT, stream, 2, 6);
assertTokenStreamContents(filter,
new String[]{"short", "ab", "foo"},
new int[]{1, 4, 2}
@ -51,7 +52,7 @@ public class TestLengthFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer, new LengthFilter(true, tokenizer, 0, 5));
return new TokenStreamComponents(tokenizer, new LengthFilter(TEST_VERSION_CURRENT, tokenizer, 0, 5));
}
};
checkOneTermReuse(a, "", "");

View File

@ -22,6 +22,8 @@ import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.util.Version;
public class TestLengthFilterFactory extends BaseTokenStreamFactoryTestCase {
@ -29,8 +31,10 @@ public class TestLengthFilterFactory extends BaseTokenStreamFactoryTestCase {
Reader reader = new StringReader("foo foobar super-duper-trooper");
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
stream = tokenFilterFactory("Length",
Version.LUCENE_43, new ClasspathResourceLoader(getClass()),
"min", "4",
"max", "10").create(stream);
"max", "10",
"enablePositionIncrements", "false").create(stream);
assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 });
}

View File

@ -29,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.Version;
/**
*/
@ -46,7 +47,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
new Token(ccc, 0, ccc.length, 11, 15),
new Token(whitespace, 0, whitespace.length, 16, 20),
new Token(empty, 0, empty.length, 21, 21));
ts = new TrimFilter(ts, false);
ts = new TrimFilter(TEST_VERSION_CURRENT, ts, false);
assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
@ -59,7 +60,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
new Token(b, 0, b.length, 0, 2),
new Token(ccc, 0, ccc.length, 0, 3),
new Token(whitespace, 0, whitespace.length, 0, 3));
ts = new TrimFilter(ts, true);
ts = new TrimFilter(Version.LUCENE_43, ts, true);
assertTokenStreamContents(ts,
new String[] { "a", "b", "c", "" },
@ -120,7 +121,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, false));
return new TokenStreamComponents(tokenizer, new TrimFilter(Version.LUCENE_43, tokenizer, true));
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
@ -130,7 +131,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, true));
return new TokenStreamComponents(tokenizer, new TrimFilter(TEST_VERSION_CURRENT, tokenizer, false));
}
};
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
@ -141,7 +142,9 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KeywordTokenizer(reader);
return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, random().nextBoolean()));
final boolean updateOffsets = random().nextBoolean();
final Version version = updateOffsets ? Version.LUCENE_43 : TEST_VERSION_CURRENT;
return new TokenStreamComponents(tokenizer, new TrimFilter(version, tokenizer, updateOffsets));
}
};
checkOneTermReuse(a, "", "");

View File

@ -306,7 +306,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
tokenizer, StandardAnalyzer.STOP_WORDS_SET);
filter.setEnablePositionIncrements(true);
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
}
};

View File

@ -89,7 +89,7 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new JapaneseTokenizer(reader, userDict, true, mode);
TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
stream = new JapanesePartOfSpeechStopFilter(matchVersion, stream, stoptags);
stream = new CJKWidthFilter(stream);
stream = new StopFilter(matchVersion, stream, stopwords);
stream = new JapaneseKatakanaStemFilter(stream);

View File

@ -22,6 +22,7 @@ import java.util.Set;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
* Removes tokens that match a set of part-of-speech tags.
@ -30,8 +31,21 @@ public final class JapanesePartOfSpeechStopFilter extends FilteringTokenFilter {
private final Set<String> stopTags;
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
public JapanesePartOfSpeechStopFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
super(enablePositionIncrements, input);
/** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
@Deprecated
public JapanesePartOfSpeechStopFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
super(version, enablePositionIncrements, input);
this.stopTags = stopTags;
}
/**
* Create a new {@link JapanesePartOfSpeechStopFilter}.
* @param version the Lucene match version
* @param input the {@link TokenStream} to consume
* @param stopTags the part-of-speech tags that should be removed
*/
public JapanesePartOfSpeechStopFilter(Version version, TokenStream input, Set<String> stopTags) {
super(version, input);
this.stopTags = stopTags;
}

View File

@ -50,7 +50,7 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) {
super(args);
stopTagFiles = get(args, "tags");
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -72,6 +72,12 @@ public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory im
@Override
public TokenStream create(TokenStream stream) {
// if stoptags is null, it means the file is empty
return stopTags == null ? stream : new JapanesePartOfSpeechStopFilter(enablePositionIncrements, stream, stopTags);
if (stopTags != null) {
@SuppressWarnings("deprecation")
final TokenStream filter = new JapanesePartOfSpeechStopFilter(luceneMatchVersion, enablePositionIncrements, stream, stopTags);
return filter;
} else {
return stream;
}
}
}

View File

@ -17,12 +17,8 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
@ -43,8 +39,16 @@ import org.apache.lucene.util.automaton.Transition;
* @lucene.experimental */
public class TokenStreamToAutomaton {
private boolean preservePositionIncrements;
/** Sole constructor. */
public TokenStreamToAutomaton() {
this.preservePositionIncrements = true;
}
/** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */
public void setPreservePositionIncrements(boolean enablePositionIncrements) {
this.preservePositionIncrements = enablePositionIncrements;
}
private static class Position implements RollingBuffer.Resettable {
@ -108,6 +112,9 @@ public class TokenStreamToAutomaton {
int maxOffset = 0;
while (in.incrementToken()) {
int posInc = posIncAtt.getPositionIncrement();
if (!preservePositionIncrements && posInc > 1) {
posInc = 1;
}
assert pos > -1 || posInc > 0;
if (posInc > 0) {

View File

@ -282,18 +282,18 @@ and proximity searches (though sentence identification is not provided by Lucene
<p>
If the selected analyzer filters the stop words "is" and "the", then for a document
containing the string "blue is the sky", only the tokens "blue", "sky" are indexed,
with position("sky") = 1 + position("blue"). Now, a phrase query "blue is the sky"
with position("sky") = 3 + position("blue"). Now, a phrase query "blue is the sky"
would find that document, because the same analyzer filters the same stop words from
that query. But also the phrase query "blue sky" would find that document.
that query. But the phrase query "blue sky" would not find that document, because in the
query the position increment between "blue" and "sky" is only 1, whereas it is 3 in the
indexed document.
</p>
<p>
If this behavior does not fit the application needs, a modified analyzer can
be used, that would increment further the positions of tokens following a
removed stop word, using
{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
This can be done with something like the following (note, however, that
StopFilter natively includes this capability by subclassing
FilteringTokenFilter}:
If this behavior does not fit the application needs, the query parser needs to be
configured to not take position increments into account when generating phrase queries.
</p>
<p>
Note that a StopFilter MUST increment the position increment in order not to generate corrupt
tokenstream graphs. Here is the logic used by StopFilter to increment positions when filtering out tokens:
</p>
<PRE class="prettyprint">
public TokenStream tokenStream(final String fieldName, Reader reader) {
@ -308,7 +308,7 @@ and proximity searches (though sentence identification is not provided by Lucene
boolean hasNext = ts.incrementToken();
if (hasNext) {
if (stopWords.contains(termAtt.toString())) {
extraIncrement++; // filter this word
extraIncrement += posIncrAtt.getPositionIncrement(); // filter this word
continue;
}
if (extraIncrement>0) {
@ -322,11 +322,6 @@ and proximity searches (though sentence identification is not provided by Lucene
return res;
}
</PRE>
<p>
Now, with this modified analyzer, the phrase query "blue sky" would find that document.
But note that this is yet not a perfect solution, because any phrase query "blue w1 w2 sky"
where both w1 and w2 are stop words would match that document.
</p>
<p>
A few more use cases for modifying position increments are:
</p>
@ -338,6 +333,72 @@ and proximity searches (though sentence identification is not provided by Lucene
As result, all synonyms of a token would be considered to appear in exactly the
same position as that token, and so would they be seen by phrase and proximity searches.</li>
</ol>
<h3>Token Position Length</h3>
<p>
By default, all tokens created by Analyzers and Tokenizers have a
{@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute#getPositionLength() position length} of one.
This means that the token occupies a single position. This attribute is not indexed
and thus not taken into account for positional queries, but is used by e.g. suggesters.
</p>
<p>
The main use case for position lengths is multi-word synonyms. With single-word
synonyms, setting the position increment to 0 is enough to denote the fact that two
words are synonyms, for example:
</p>
<table>
<tr><td>Term</td><td>red</td><td>magenta</td></tr>
<tr><td>Position increment</td><td>1</td><td>0</td></tr>
</table>
<p>
Given that position(magenta) = 0 + position(red), they are at the same position, so anything
working with analyzers will return the exact same result if you replace "magenta" with "red"
in the input. However, multi-word synonyms are more tricky. Let's say that you want to build
a TokenStream where "IBM" is a synonym of "International Business Machines". Position increments
are not enough anymore:
</p>
<table>
<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
</table>
<p>
The problem with this token stream is that "IBM" is at the same position as "International"
although it is a synonym of "International Business Machines" as a whole. Setting
the position increment of "Business" and "Machines" to 0 wouldn't help as it would mean
that "International" is a synonym of "Business". The only way to solve this issue is to
make "IBM" span across 3 positions; this is where position lengths come to the rescue.
</p>
<table>
<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
<tr><td>Position length</td><td>3</td><td>1</td><td>1</td><td>1</td></tr>
</table>
<p>
This new attribute makes clear that "IBM" and "International Business Machines" start and end
at the same positions.
</p>
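<p>
As a hypothetical illustration, a synonym-injecting TokenFilter would produce the stream above
by setting the attributes of the emitted "IBM" token roughly as follows; the attribute instances
are assumed to have been obtained via addAttribute, and the offset variables to come from the
buffered original tokens:
</p>
<pre class="prettyprint">
// "IBM" is emitted first and spans the three positions of "International Business Machines"
termAtt.setEmpty().append("IBM");
posIncAtt.setPositionIncrement(1);  // advances to a new position, like any regular token
posLenAtt.setPositionLength(3);     // ...but occupies three positions
offsetAtt.setOffset(startOfInternational, endOfMachines);
// "International" is then emitted with a position increment of 0,
// and "Business" and "Machines" follow with their usual increment of 1
</pre>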
<a name="corrupt" />
<h3>How to not write corrupt token streams</h3>
<p>
There are a few rules to observe when writing custom Tokenizers and TokenFilters:
</p>
<ul>
<li>The first position increment must be &gt; 0.</li>
<li>Positions must not go backward.</li>
<li>Tokens that have the same start position must have the same start offset.</li>
<li>Tokens that have the same end position (taking into account the position length) must have the same end offset.</li>
</ul>
<p>
Although these rules might seem easy to follow, problems can quickly happen when chaining
badly implemented filters that play with positions and offsets, such as synonym or n-gram
filters. Here are good practices for writing correct filters:
</p>
<ul>
<li>Token filters should not modify offsets. If you feel that your filter would need to modify offsets, then it should probably be implemented as a tokenizer.</li>
<li>Token filters should not insert positions. If a filter needs to add tokens, then they should all have a position increment of 0.</li>
<li>When they remove tokens, token filters should increment the position increment of the following token.</li>
<li>Token filters should preserve position lengths.</li>
</ul>
<h2>TokenStream API</h2>
<p>
"Flexible Indexing" summarizes the effort of making the Lucene indexer
@ -382,6 +443,10 @@ and proximity searches (though sentence identification is not provided by Lucene
<td>{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute}</td>
<td>See above for detailed information about position increment.</td>
</tr>
<tr>
<td>{@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute}</td>
<td>The number of positions occupied by a token.</td>
</tr>
<tr>
<td>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}</td>
<td>The payload that a Token can optionally have.</td>
@ -532,20 +597,26 @@ public final class LengthFilter extends FilteringTokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
* Build a filter that removes words that are too long or too
* short from the text.
* Create a new LengthFilter. This will filter out tokens whose
* CharTermAttribute is either too short
* (&lt; min) or too long (&gt; max).
* @param version the Lucene match version
* @param in the TokenStream to consume
* @param min the minimum length
* @param max the maximum length
*/
public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
super(enablePositionIncrements, in);
public LengthFilter(Version version, TokenStream in, int min, int max) {
super(version, in);
this.min = min;
this.max = max;
}
{@literal @Override}
public boolean accept() throws IOException {
public boolean accept() {
final int len = termAtt.length();
return (len >= min && len <= max);
return (len &gt;= min &amp;&amp; len &lt;= max);
}
}
</pre>
<p>
@ -573,19 +644,20 @@ public final class LengthFilter extends FilteringTokenFilter {
public abstract class FilteringTokenFilter extends TokenFilter {
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
super(input);
this.enablePositionIncrements = enablePositionIncrements;
/**
* Create a new FilteringTokenFilter.
* @param in the TokenStream to consume
*/
public FilteringTokenFilter(Version version, TokenStream in) {
super(in);
}
/** Override this method and return if the current input token should be returned by {@literal {@link #incrementToken}}. */
/** Override this method and return if the current input token should be returned by incrementToken. */
protected abstract boolean accept() throws IOException;
{@literal @Override}
public final boolean incrementToken() throws IOException {
if (enablePositionIncrements) {
int skippedPositions = 0;
while (input.incrementToken()) {
if (accept()) {
@ -596,43 +668,15 @@ public abstract class FilteringTokenFilter extends TokenFilter {
}
skippedPositions += posIncrAtt.getPositionIncrement();
}
} else {
while (input.incrementToken()) {
if (accept()) {
return true;
}
}
}
// reached EOS -- return false
return false;
}
/**
* {@literal @see #setEnablePositionIncrements(boolean)}
*/
public boolean getEnablePositionIncrements() {
return enablePositionIncrements;
{@literal @Override}
public void reset() throws IOException {
super.reset();
}
/**
* If <code>true</code>, this TokenFilter will preserve
* positions of the incoming tokens (ie, accumulate and
* set position increments of the removed tokens).
* Generally, <code>true</code> is best as it does not
* lose information (positions of the original tokens)
* during indexing.
*
* <p> When set, when a token is stopped
* (omitted), the position increment of the following
* token is incremented.
*
* <p> <b>NOTE</b>: be sure to also
* set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if
* you use QueryParser to create queries.
*/
public void setEnablePositionIncrements(boolean enable) {
this.enablePositionIncrements = enable;
}
}
</pre>

View File

@ -64,16 +64,10 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
/** Test a configuration that behaves a lot like StopAnalyzer */
public void testStop() throws Exception {
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
assertAnalyzesTo(a, "the quick brown a fox",
new String[] { "quick", "brown", "fox" },
new int[] { 2, 1, 2 });
// disable positions
a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, false);
assertAnalyzesTo(a, "the quick brown a fox",
new String[] { "quick", "brown", "fox" },
new int[] { 1, 1, 1 });
}
/** Test a configuration that behaves a lot like KeepWordFilter */
@ -83,7 +77,7 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
BasicOperations.complement(
Automaton.union(
Arrays.asList(BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar")))));
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords, true);
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
new String[] { "foo", "bar", "bar", "foo" },
new int[] { 2, 2, 1, 2 });
@ -92,7 +86,7 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
/** Test a configuration that behaves a lot like LengthFilter */
public void testLength() throws Exception {
CharacterRunAutomaton length5 = new CharacterRunAutomaton(new RegExp(".{5,}").toAutomaton());
Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, length5, true);
Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, length5);
assertAnalyzesTo(a, "ok toolong fine notfine",
new String[] { "ok", "fine" },
new int[] { 1, 2 });

View File

@ -213,7 +213,7 @@ public class TestTermVectorsWriter extends LuceneTestCase {
public void testEndOffsetPositionStopFilter() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
Document doc = new Document();
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);

View File

@ -222,7 +222,7 @@ public class TestPhraseQuery extends LuceneTestCase {
public void testPhraseQueryWithStopAnalyzer() throws Exception {
Directory directory = newDirectory();
Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, false);
Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
newIndexWriterConfig( Version.LUCENE_40, stopAnalyzer));
Document doc = new Document();
@ -241,16 +241,6 @@ public class TestPhraseQuery extends LuceneTestCase {
assertEquals(1, hits.length);
QueryUtils.check(random(), query,searcher);
// StopAnalyzer as of 2.4 does not leave "holes", so this matches.
query = new PhraseQuery();
query.add(new Term("field", "words"));
query.add(new Term("field", "here"));
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length);
QueryUtils.check(random(), query,searcher);
reader.close();
directory.close();
}

View File

@ -37,7 +37,7 @@ public class TestSpanFirstQuery extends LuceneTestCase {
// mimic StopAnalyzer
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true);
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
Document doc = new Document();

View File

@ -60,7 +60,7 @@ public class TestSpansAdvanced extends LuceneTestCase {
mDirectory = newDirectory();
final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))
new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))
.setMergePolicy(newLogMergePolicy()).setSimilarity(new DefaultSimilarity()));
addDocument(writer, "1", "I think it should work.");
addDocument(writer, "2", "I think it should work.");

View File

@ -49,7 +49,7 @@ public class TestSpansAdvanced2 extends TestSpansAdvanced {
// create test index
final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(),
MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))
MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))
.setOpenMode(OpenMode.APPEND).setMergePolicy(newLogMergePolicy())
.setSimilarity(new DefaultSimilarity()));
addDocument(writer, "A", "Should we, could we, would we?");

View File

@ -247,7 +247,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
*/
private String highlightField(Query query, String fieldName, String text)
throws IOException, InvalidTokenOffsetsException {
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)
.tokenStream(fieldName, new StringReader(text));
// Assuming "<B>", "</B>" used to highlight
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
@ -1308,7 +1308,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
}
public void testMaxSizeHighlight() throws Exception {
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
// we disable MockTokenizer checks because we will forcefully limit the
// tokenstream and call end() before incrementToken() returns false.
analyzer.setEnableChecks(false);
@ -1343,7 +1343,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("stoppedtoken"));
// we disable MockTokenizer checks because we will forcefully limit the
// tokenstream and call end() before incrementToken() returns false.
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true);
final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
analyzer.setEnableChecks(false);
TermQuery query = new TermQuery(new Term("data", goodWord));
@ -1394,7 +1394,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
Highlighter hg = getHighlighter(query, "text", fm);
hg.setTextFragmenter(new NullFragmenter());
hg.setMaxDocCharsToAnalyze(36);
String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true), "text", text);
String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords), "text", text);
assertTrue(
"Matched text should contain remainder of text after highlighted query ",
match.endsWith("in it"));
@ -1411,7 +1411,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
numHighlights = 0;
// test to show how rewritten query can still be used
searcher = newSearcher(reader);
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
BooleanQuery query = new BooleanQuery();
query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD);
@ -1875,11 +1875,11 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
super.setUp();
a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
dir = newDirectory();
ramDir = newDirectory();
IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
for (String text : texts) {
addDoc(writer, text);
}

View File

@ -89,7 +89,7 @@ public class HighlightCustomQueryTest extends LuceneTestCase {
private String highlightField(Query query, String fieldName,
String text) throws IOException, InvalidTokenOffsetsException {
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE,
true, MockTokenFilter.ENGLISH_STOPSET, true).tokenStream(fieldName,
true, MockTokenFilter.ENGLISH_STOPSET).tokenStream(fieldName,
new StringReader(text));
// Assuming "<B>", "</B>" used to highlight
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();

View File

@ -247,7 +247,7 @@ public class FastVectorHighlighterTest extends LuceneTestCase {
public void testCommonTermsQueryHighlightTest() throws IOException {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(true);
type.setStoreTermVectorPositions(true);

View File

@ -259,7 +259,7 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
private Analyzer randomAnalyzer() {
switch(random().nextInt(4)) {
case 0: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
case 2: return new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {

View File

@ -546,7 +546,7 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
public void testBoost() throws Exception {
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true);
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
PrecedenceQueryParser qp = new PrecedenceQueryParser();
qp.setAnalyzer(oneStopAnalyzer);
@ -561,7 +561,7 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
q = qp.parse("\"on\"^1.0", "field");
assertNotNull(q);
q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)).parse("the^3",
q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)).parse("the^3",
"field");
assertNotNull(q);
}

View File

@ -946,7 +946,7 @@ public class TestQPHelper extends LuceneTestCase {
public void testBoost() throws Exception {
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true);
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
StandardQueryParser qp = new StandardQueryParser();
qp.setAnalyzer(oneStopAnalyzer);
@ -962,7 +962,7 @@ public class TestQPHelper extends LuceneTestCase {
assertNotNull(q);
StandardQueryParser qp2 = new StandardQueryParser();
qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true));
qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
q = qp2.parse("the^3", "field");
// "the" is a stop word so the result is an empty query:
@ -1179,7 +1179,7 @@ public class TestQPHelper extends LuceneTestCase {
public void testStopwords() throws Exception {
StandardQueryParser qp = new StandardQueryParser();
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true));
qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
Query result = qp.parse("a:the OR a:foo", "a");
assertNotNull("result is null and it shouldn't be", result);
@ -1203,7 +1203,7 @@ public class TestQPHelper extends LuceneTestCase {
public void testPositionIncrement() throws Exception {
StandardQueryParser qp = new StandardQueryParser();
qp.setAnalyzer(
new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true));
new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
qp.setEnablePositionIncrements(true);

View File

@ -852,7 +852,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
public void testBoost()
throws Exception {
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true);
Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);
Query q = getQuery("on^1.0",qp);
assertNotNull(q);
@ -865,7 +865,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
q = getQuery("\"on\"^1.0",qp);
assertNotNull(q);
Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
CommonQueryParserConfiguration qp2 = getParserConfig(a2);
q = getQuery("the^3", qp2);
// "the" is a stop word so the result is an empty query:
@ -1007,7 +1007,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
public void testStopwords() throws Exception {
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true));
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
Query result = getQuery("field:the OR field:foo",qp);
assertNotNull("result is null and it shouldn't be", result);
assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
@ -1023,7 +1023,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
}
public void testPositionIncrement() throws Exception {
CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true));
CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
qp.setEnablePositionIncrements(true);
String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
// 0 2 5 7 8
@ -1070,7 +1070,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
// "match"
public void testPositionIncrements() throws Exception {
Directory dir = newDirectory();
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, a));
Document doc = new Document();
doc.add(newTextField("field", "the wizard of ozzy", Field.Store.NO));
@ -1185,7 +1185,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
}
public void testPhraseQueryToString() throws Exception {
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
CommonQueryParserConfiguration qp = getParserConfig(analyzer);
qp.setEnablePositionIncrements(true);
PhraseQuery q = (PhraseQuery)getQuery("\"this hi this is a test is\"", qp);
@ -1235,26 +1235,13 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
CharacterRunAutomaton stopStopList =
new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList, false));
PhraseQuery phraseQuery = new PhraseQuery();
phraseQuery.add(new Term("field", "1"));
phraseQuery.add(new Term("field", "2"));
assertEquals(phraseQuery, getQuery("\"1 2\"",qp));
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
qp.setEnablePositionIncrements(true);
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
qp.setEnablePositionIncrements(false);
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
qp = getParserConfig(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList, true));
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
qp.setEnablePositionIncrements(true);
phraseQuery = new PhraseQuery();
PhraseQuery phraseQuery = new PhraseQuery();
phraseQuery.add(new Term("field", "1"));
phraseQuery.add(new Term("field", "2"), 2);
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));

View File

@ -58,7 +58,7 @@ public class TestParser extends LuceneTestCase {
@BeforeClass
public static void beforeClass() throws Exception {
// TODO: rewrite test (this needs to set QueryParser.enablePositionIncrements, too, to work with CURRENT):
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
//initialize the parser
builder = new CorePlusExtensionsParser("contents", analyzer);

View File

@ -75,9 +75,9 @@ import org.apache.lucene.util.fst.Util;
* example, if you use an analyzer removing stop words,
* then the partial text "ghost chr..." could see the
* suggestion "The Ghost of Christmas Past". Note that
* your {@code StopFilter} instance must NOT preserve
* position increments for this example to work, so you should call
* {@code setEnablePositionIncrements(false)} on it.
* position increments MUST NOT be preserved for this example
* to work, so you should call
* {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}.
*
* <p>
* If SynonymFilter is used to map wifi and wireless network to
@ -185,6 +185,9 @@ public class AnalyzingSuggester extends Lookup {
private static final int PAYLOAD_SEP = '\u001f';
/** Whether position holes should appear in the automaton. */
private boolean preservePositionIncrements;
/**
* Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
@ -241,6 +244,13 @@ public class AnalyzingSuggester extends Lookup {
throw new IllegalArgumentException("maxGraphExpansions must be -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
}
this.maxGraphExpansions = maxGraphExpansions;
preservePositionIncrements = true;
}
/** Whether to take position holes (position increment > 1) into account when
* building the automaton, <code>true</code> by default. */
public void setPreservePositionIncrements(boolean preservePositionIncrements) {
this.preservePositionIncrements = preservePositionIncrements;
}
/** Returns byte size of the underlying FST. */
@ -327,13 +337,16 @@ public class AnalyzingSuggester extends Lookup {
}
TokenStreamToAutomaton getTokenStreamToAutomaton() {
final TokenStreamToAutomaton tsta;
if (preserveSep) {
return new EscapingTokenStreamToAutomaton();
tsta = new EscapingTokenStreamToAutomaton();
} else {
// When we're not preserving sep, we don't steal 0xff
// byte, so we don't need to do any escaping:
return new TokenStreamToAutomaton();
tsta = new TokenStreamToAutomaton();
}
tsta.setPreservePositionIncrements(preservePositionIncrements);
return tsta;
}
private static class AnalyzingComparator implements Comparator<BytesRef> {

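A minimal usage sketch (not part of this commit) of the behaviour described in the AnalyzingSuggester javadoc above. It mirrors the updated AnalyzingSuggesterTest further below and assumes the same test-framework classes (MockAnalyzer, MockTokenizer, MockTokenFilter, TermFreq, TermFreqArrayIterator) plus a LuceneTestCase context that provides random(); instead of disabling position increments on a StopFilter, the suggester itself is now told not to preserve them:

// analyzer that removes English stop words ("the", "of", ...)
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer);
// replaces the former enablePositionIncrements=false on the stop filter
suggester.setPreservePositionIncrements(false);
suggester.build(new TermFreqArrayIterator(new TermFreq[] {
    new TermFreq("the ghost of christmas past", 50)
}));
// the partial text still reaches the suggestion because no position holes
// were recorded in the automaton
List<LookupResult> results = suggester.lookup("the ghost of chris", false, 1);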
View File

@ -164,8 +164,9 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
new TermFreq("the ghost of christmas past", 50),
};
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
suggester.setPreservePositionIncrements(false);
suggester.build(new TermFreqArrayIterator(keys));
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
@ -187,7 +188,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
}
public void testEmpty() throws Exception {
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
suggester.build(new TermFreqArrayIterator(new TermFreq[0]));

View File

@ -153,8 +153,9 @@ public class FuzzySuggesterTest extends LuceneTestCase {
new TermFreq("the ghost of christmas past", 50),
};
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
FuzzySuggester suggester = new FuzzySuggester(standard);
suggester.setPreservePositionIncrements(false);
suggester.build(new TermFreqArrayIterator(keys));
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);

View File

@ -17,7 +17,6 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
@ -46,7 +45,6 @@ public final class MockAnalyzer extends Analyzer {
private final CharacterRunAutomaton runAutomaton;
private final boolean lowerCase;
private final CharacterRunAutomaton filter;
private final boolean enablePositionIncrements;
private int positionIncrementGap;
private final Random random;
private Map<String,Integer> previousMappings = new HashMap<String,Integer>();
@ -60,30 +58,28 @@ public final class MockAnalyzer extends Analyzer {
* @param runAutomaton DFA describing how tokenization should happen (e.g. [a-zA-Z]+)
* @param lowerCase true if the tokenizer should lowercase terms
* @param filter DFA describing how terms should be filtered (set of stopwords, etc)
* @param enablePositionIncrements true if position increments should reflect filtered terms.
*/
public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter) {
super(new PerFieldReuseStrategy());
// TODO: this should be solved in a different way; Random should not be shared (!).
this.random = new Random(random.nextLong());
this.runAutomaton = runAutomaton;
this.lowerCase = lowerCase;
this.filter = filter;
this.enablePositionIncrements = enablePositionIncrements;
}
/**
* Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean)
* Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton)
* MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET}).
*/
public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true);
this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET);
}
/**
* Create a Whitespace-lowercasing analyzer with no stopwords removal.
* <p>
* Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean)
* Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton)
* MockAnalyzer(random, MockTokenizer.WHITESPACE, true, MockTokenFilter.EMPTY_STOPSET}).
*/
public MockAnalyzer(Random random) {
@ -95,7 +91,6 @@ public final class MockAnalyzer extends Analyzer {
MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength);
tokenizer.setEnableChecks(enableChecks);
MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
filt.setEnablePositionIncrements(enablePositionIncrements);
return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
}

View File

@ -55,7 +55,6 @@ public final class MockTokenFilter extends TokenFilter {
makeString("with"))));
private final CharacterRunAutomaton filter;
private boolean enablePositionIncrements = true;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@ -80,9 +79,7 @@ public final class MockTokenFilter extends TokenFilter {
int skippedPositions = 0;
while (input.incrementToken()) {
if (!filter.run(termAtt.buffer(), 0, termAtt.length())) {
if (enablePositionIncrements) {
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
return true;
}
skippedPositions += posIncrAtt.getPositionIncrement();
@ -90,20 +87,4 @@ public final class MockTokenFilter extends TokenFilter {
// reached EOS -- return false
return false;
}
/**
* @see #setEnablePositionIncrements(boolean)
*/
public boolean getEnablePositionIncrements() {
return enablePositionIncrements;
}
/**
* If <code>true</code>, this Filter will preserve
* positions of the incoming tokens (ie, accumulate and
* set position increments of the removed stop tokens).
*/
public void setEnablePositionIncrements(boolean enable) {
this.enablePositionIncrements = enable;
}
}

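For reference, a small sketch (not from the commit) of how MockTokenFilter behaves now that the enablePositionIncrements switch is gone: the increments of removed tokens are always accumulated onto the next kept token. It assumes the test-framework classes shown above plus java.io.StringReader, and that the MockTokenizer(Reader, CharacterRunAutomaton, boolean) constructor is used:

// "the" is in ENGLISH_STOPSET, so the first kept token carries its increment
Tokenizer tokenizer = new MockTokenizer(new StringReader("the quick brown fox"), MockTokenizer.WHITESPACE, true);
TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
stream.reset();
while (stream.incrementToken()) {
  System.out.println(term + " +" + posIncr.getPositionIncrement()); // quick +2, brown +1, fox +1
}
stream.end();
stream.close();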
View File

@ -59,7 +59,7 @@ public abstract class SearchEquivalenceTestBase extends LuceneTestCase {
directory = newDirectory();
stopword = "" + randomChar();
CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.makeString(stopword));
analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset, true);
analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset);
RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer);
Document doc = new Document();
Field id = new StringField("id", "", Field.Store.NO);

View File

@ -87,8 +87,8 @@ public class DisMaxRequestHandlerTest extends SolrTestCaseJ4 {
req("cool stuff")
,"//*[@numFound='3']"
,"//result/doc[1]/int[@name='id'][.='42']"
,"//result/doc[2]/int[@name='id'][.='666']"
,"//result/doc[3]/int[@name='id'][.='8675309']"
,"//result/doc[2]/int[@name='id'][.='8675309']"
,"//result/doc[3]/int[@name='id'][.='666']"
);
assertQ("multi qf",

View File

@ -323,16 +323,16 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
tokenList = valueResult.get("org.apache.lucene.analysis.core.StopFilter");
assertNotNull("Expecting the 'StopFilter' to be applied on the index for the 'text' field", tokenList);
assertEquals("Expecting 4 tokens after stop word removal", 4, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, new int[]{2,2,2,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 2, new int[]{3,3,3,2}, null, false));
assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, new int[]{4,4,4,3}, null, false));
assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 4, new int[]{6,6,6,4}, null, false));
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2}, null, false));
assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3,3}, null, false));
assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4,4}, null, false));
assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6,6}, null, false));
tokenList = valueResult.get("org.apache.lucene.analysis.en.PorterStemFilter");
assertNotNull("Expecting the 'PorterStemFilter' to be applied on the index for the 'text' field", tokenList);
assertEquals("Expecting 4 tokens", 4, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, new int[]{2,2,2,1,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 2, new int[]{3,3,3,2,2}, null, true));
assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, new int[]{4,4,4,3,3}, null, false));
assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 4, new int[]{6,6,6,4,4}, null, false));
assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2,2}, null, false));
assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3,3,3}, null, true));
assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4,4,4}, null, false));
assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6,6,6}, null, false));
}
}

View File

@ -178,25 +178,25 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
tokenList = indexPart.get("org.apache.lucene.analysis.core.StopFilter");
assertNotNull("Expecting StopFilter analysis breakdown", tokenList);
assertEquals(tokenList.size(), 8);
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 1, new int[]{2,2,2,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 2, new int[]{3,3,3,2}, null, false));
assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 3, new int[]{4,4,4,3}, null, true));
assertToken(tokenList.get(3), new TokenInfo("jumped", null, "<ALPHANUM>", 18, 24, 4, new int[]{5,5,5,4}, null, false));
assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 5, new int[]{6,6,6,5}, null, false));
assertToken(tokenList.get(5), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 6, new int[]{8,8,8,6}, null, false));
assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 7, new int[]{9,9,9,7}, null, true));
assertToken(tokenList.get(7), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 8, new int[]{10,10,10,8}, null, false));
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2}, null, false));
assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 3, new int[]{3,3,3,3}, null, false));
assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 4, new int[]{4,4,4,4}, null, true));
assertToken(tokenList.get(3), new TokenInfo("jumped", null, "<ALPHANUM>", 18, 24, 5, new int[]{5,5,5,5}, null, false));
assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 6, new int[]{6,6,6,6}, null, false));
assertToken(tokenList.get(5), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8,8}, null, false));
assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9,9}, null, true));
assertToken(tokenList.get(7), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10,10}, null, false));
tokenList = indexPart.get("org.apache.lucene.analysis.en.PorterStemFilter");
assertNotNull("Expecting PorterStemFilter analysis breakdown", tokenList);
assertEquals(tokenList.size(), 8);
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 1, new int[]{2,2,2,1,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 2, new int[]{3,3,3,2,2}, null, false));
assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 3, new int[]{4,4,4,3,3}, null, true));
assertToken(tokenList.get(3), new TokenInfo("jump", null, "<ALPHANUM>", 18, 24, 4, new int[]{5,5,5,4,4}, null, false));
assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 5, new int[]{6,6,6,5,5}, null, false));
assertToken(tokenList.get(5), new TokenInfo("lazi", null, "<ALPHANUM>", 34, 38, 6, new int[]{8,8,8,6,6}, null, false));
assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 7, new int[]{9,9,9,7,7}, null, true));
assertToken(tokenList.get(7), new TokenInfo("dog", null, "<ALPHANUM>", 45, 49, 8, new int[]{10,10,10,8,8}, null, false));
assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2,2}, null, false));
assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 3, new int[]{3,3,3,3,3}, null, false));
assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 4, new int[]{4,4,4,4,4}, null, true));
assertToken(tokenList.get(3), new TokenInfo("jump", null, "<ALPHANUM>", 18, 24, 5, new int[]{5,5,5,5,5}, null, false));
assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 6, new int[]{6,6,6,6,6}, null, false));
assertToken(tokenList.get(5), new TokenInfo("lazi", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8,8,8}, null, false));
assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9,9,9}, null, true));
assertToken(tokenList.get(7), new TokenInfo("dog", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10,10,10}, null, false));
NamedList<List<NamedList>> queryPart = textType.get("query");
assertNotNull("expecting a query token analysis for field type 'text'", queryPart);

View File

@ -201,12 +201,12 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
public void testOptions() throws Exception {
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
, TermVectorParams.TF, "true", TermVectorParams.DF, "true", TermVectorParams.OFFSETS, "true", TermVectorParams.POSITIONS, "true", TermVectorParams.TF_IDF, "true")
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}"
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
);
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
, TermVectorParams.ALL, "true")
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}"
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
);
// test each combination at random
@ -214,7 +214,7 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
list.addAll(Arrays.asList("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"));
String[][] options = new String[][] { { TermVectorParams.TF, "'tf':1" },
{ TermVectorParams.OFFSETS, "'offsets':{'start':20, 'end':27}" },
{ TermVectorParams.POSITIONS, "'positions':{'position':1}" },
{ TermVectorParams.POSITIONS, "'positions':{'position':5}" },
{ TermVectorParams.DF, "'df':2" },
{ TermVectorParams.TF_IDF, "'tf-idf':0.5" } };
StringBuilder expected = new StringBuilder("/termVectors/0/test_posofftv/anoth=={");
@ -249,7 +249,7 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
,"f.test_basictv." + TermVectorParams.TF_IDF, "false"
)
,"/termVectors/0/test_basictv=={'anoth':{},'titl':{}}"
,"/termVectors/0/test_postv/anoth=={'tf':1, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}"
,"/termVectors/0/test_postv/anoth=={'tf':1, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
,"/termVectors/0/test_offtv/anoth=={'tf':1, 'df':2, 'tf-idf':0.5}"
,"/termVectors/warnings=={ 'noTermVectors':['test_notv'], 'noPositions':['test_basictv', 'test_offtv'], 'noOffsets':['test_basictv', 'test_postv']}"
);

View File

@ -53,7 +53,7 @@ public class TestSuggestSpellingConverter extends BaseTokenStreamTestCase {
TokenStream filter = new PatternReplaceFilter(tokenizer,
Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true);
filter = new LowerCaseFilter(TEST_VERSION_CURRENT, filter);
filter = new TrimFilter(filter, false);
filter = new TrimFilter(TEST_VERSION_CURRENT, filter, false);
return new TokenStreamComponents(tokenizer, filter);
}
});

View File

@ -202,13 +202,10 @@
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<!-- Case insensitive stop word removal.
add enablePositionIncrements=true in both the index and query
analyzers to leave a 'gap' for more accurate phrase queries.
-->
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
@ -222,7 +219,6 @@
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>

View File

@ -440,7 +440,7 @@
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
@ -448,7 +448,7 @@
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
@ -466,13 +466,10 @@
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<!-- Case insensitive stop word removal.
add enablePositionIncrements=true in both the index and query
analyzers to leave a 'gap' for more accurate phrase queries.
-->
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
enablePositionIncrements="true"
/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPossessiveFilterFactory"/>
@ -488,7 +485,6 @@
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
enablePositionIncrements="true"
/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPossessiveFilterFactory"/>
@ -516,13 +512,10 @@
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<!-- Case insensitive stop word removal.
add enablePositionIncrements=true in both the index and query
analyzers to leave a 'gap' for more accurate phrase queries.
-->
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
@ -535,7 +528,6 @@
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
@ -566,7 +558,7 @@
<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
@ -574,7 +566,7 @@
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
@ -730,7 +722,7 @@
<tokenizer class="solr.StandardTokenizerFactory"/>
<!-- for any non-arabic -->
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" />
<!-- normalizes ﻯ to ﻱ, etc -->
<filter class="solr.ArabicNormalizationFilterFactory"/>
<filter class="solr.ArabicStemFilterFactory"/>
@ -742,7 +734,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" />
<filter class="solr.BulgarianStemFilterFactory"/>
</analyzer>
</fieldType>
@ -754,7 +746,7 @@
<!-- removes l', etc -->
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ca.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" />
<filter class="solr.SnowballPorterFilterFactory" language="Catalan"/>
</analyzer>
</fieldType>
@ -776,7 +768,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" />
<filter class="solr.CzechStemFilterFactory"/>
</analyzer>
</fieldType>
@ -786,7 +778,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" />
<filter class="solr.SnowballPorterFilterFactory" language="Danish"/>
</analyzer>
</fieldType>
@ -796,7 +788,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" />
<filter class="solr.GermanNormalizationFilterFactory"/>
<filter class="solr.GermanLightStemFilterFactory"/>
<!-- less aggressive: <filter class="solr.GermanMinimalStemFilterFactory"/> -->
@ -810,7 +802,7 @@
<tokenizer class="solr.StandardTokenizerFactory"/>
<!-- greek specific lowercase for sigma -->
<filter class="solr.GreekLowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" />
<filter class="solr.GreekStemFilterFactory"/>
</analyzer>
</fieldType>
@ -820,7 +812,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" />
<filter class="solr.SpanishLightStemFilterFactory"/>
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Spanish"/> -->
</analyzer>
@ -831,7 +823,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" />
<filter class="solr.SnowballPorterFilterFactory" language="Basque"/>
</analyzer>
</fieldType>
@ -845,7 +837,7 @@
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ArabicNormalizationFilterFactory"/>
<filter class="solr.PersianNormalizationFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" />
</analyzer>
</fieldType>
@ -854,7 +846,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" />
<filter class="solr.SnowballPorterFilterFactory" language="Finnish"/>
<!-- less aggressive: <filter class="solr.FinnishLightStemFilterFactory"/> -->
</analyzer>
@ -867,7 +859,7 @@
<!-- removes l', etc -->
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" />
<filter class="solr.FrenchLightStemFilterFactory"/>
<!-- less aggressive: <filter class="solr.FrenchMinimalStemFilterFactory"/> -->
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="French"/> -->
@ -881,9 +873,9 @@
<!-- removes d', etc -->
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/>
<!-- removes n-, etc. -->
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" enablePositionIncrements="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt"/>
<filter class="solr.IrishLowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt"/>
<filter class="solr.SnowballPorterFilterFactory" language="Irish"/>
</analyzer>
</fieldType>
@ -893,7 +885,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" />
<filter class="solr.GalicianStemFilterFactory"/>
<!-- less aggressive: <filter class="solr.GalicianMinimalStemFilterFactory"/> -->
</analyzer>
@ -908,7 +900,7 @@
<filter class="solr.IndicNormalizationFilterFactory"/>
<!-- normalizes variation in spelling -->
<filter class="solr.HindiNormalizationFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" />
<filter class="solr.HindiStemFilterFactory"/>
</analyzer>
</fieldType>
@ -918,7 +910,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" />
<filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/>
<!-- less aggressive: <filter class="solr.HungarianLightStemFilterFactory"/> -->
</analyzer>
@ -929,7 +921,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" />
<filter class="solr.SnowballPorterFilterFactory" language="Armenian"/>
</analyzer>
</fieldType>
@ -939,7 +931,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" />
<!-- for a less aggressive approach (only inflectional suffixes), set stemDerivational to false -->
<filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
</analyzer>
@ -952,7 +944,7 @@
<!-- removes l', etc -->
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_it.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" />
<filter class="solr.ItalianLightStemFilterFactory"/>
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> -->
</analyzer>
@ -999,11 +991,11 @@
<!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
<filter class="solr.JapaneseBaseFormFilterFactory"/>
<!-- Removes tokens with certain part-of-speech tags -->
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" />
<!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
<filter class="solr.CJKWidthFilterFactory"/>
<!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" />
<!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
<!-- Lower-cases romaji characters -->
@ -1016,7 +1008,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" />
<filter class="solr.LatvianStemFilterFactory"/>
</analyzer>
</fieldType>
@ -1026,7 +1018,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" />
<filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/>
<filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>
</analyzer>
@ -1037,7 +1029,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" />
<filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
<!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> -->
<!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> -->
@ -1049,7 +1041,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" />
<filter class="solr.PortugueseLightStemFilterFactory"/>
<!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
@ -1062,7 +1054,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" />
<filter class="solr.SnowballPorterFilterFactory" language="Romanian"/>
</analyzer>
</fieldType>
@ -1072,7 +1064,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" />
<filter class="solr.SnowballPorterFilterFactory" language="Russian"/>
<!-- less aggressive: <filter class="solr.RussianLightStemFilterFactory"/> -->
</analyzer>
@ -1083,7 +1075,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" />
<filter class="solr.SnowballPorterFilterFactory" language="Swedish"/>
<!-- less aggressive: <filter class="solr.SwedishLightStemFilterFactory"/> -->
</analyzer>
@ -1095,7 +1087,7 @@
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ThaiWordFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" />
</analyzer>
</fieldType>
@ -1104,7 +1096,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.TurkishLowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" enablePositionIncrements="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" />
<filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>
</analyzer>
</fieldType>