LUCENE-1253: LengthFilter (and Solr's KeepWordTokenFilter) now require up front specification of enablePositionIncrement. Together with StopFilter they have a common base class (FilteringTokenFilter) that handles the position increments automatically. Implementors only need to override an accept() method that filters tokens

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1065343 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2011-01-30 18:30:34 +00:00
parent 277dfa0e88
commit e7088279f7
10 changed files with 186 additions and 101 deletions
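The new contract is small: subclass FilteringTokenFilter, pass the enablePositionIncrements flag up to its constructor, and implement accept(). A minimal sketch against the API introduced in this commit follows; the class OddLengthFilter and its keep-odd-lengths rule are made up for illustration, not part of the commit:

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;

/** Keeps only tokens with an odd number of chars (illustrative only). */
public final class OddLengthFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public OddLengthFilter(boolean enablePositionIncrements, TokenStream in) {
    super(enablePositionIncrements, in); // base class handles position increments
  }

  @Override
  protected boolean accept() throws IOException {
    return (termAtt.length() & 1) == 1; // keep the token, or drop it silently
  }
}

The base class, shown in full later in this diff, pulls tokens from the input until accept() returns true and, when enabled, folds the increments of the skipped tokens into the emitted token.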

CHANGES.txt

@@ -643,6 +643,12 @@ API Changes
   deletes remain buffered so that the next time you open an NRT reader
   and pass true, all deletes will be a applied.  (Mike McCandless)
 
+* LUCENE-1253: LengthFilter (and Solr's KeepWordTokenFilter) now
+  require up front specification of enablePositionIncrement. Together with
+  StopFilter they have a common base class (FilteringTokenFilter) that handles
+  the position increments automatically. Implementors only need to override an
+  accept() method that filters tokens.  (Uwe Schindler, Robert Muir)
+
 Bug fixes
 
 * LUCENE-2249: ParallelMultiSearcher should shut down thread pool on

StopFilter.java

@@ -22,10 +22,9 @@ import java.util.Arrays;
 import java.util.List;
 import java.util.Set;
 
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.util.Version;

@@ -42,14 +41,10 @@ import org.apache.lucene.util.Version;
  * increments are preserved
  * </ul>
  */
-public final class StopFilter extends TokenFilter {
+public final class StopFilter extends FilteringTokenFilter {
 
   private final CharArraySet stopWords;
-  private boolean enablePositionIncrements = true;
-
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
   /**
    * Construct a token stream filtering the given input. If

@@ -75,7 +70,7 @@ public final class StopFilter extends TokenFilter {
    */
   public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
   {
-    super(input);
+    super(true, input);
     this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet) stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
   }

@@ -157,48 +152,8 @@ public final class StopFilter extends TokenFilter {
    * Returns the next input Token whose term() is not a stop word.
    */
   @Override
-  public final boolean incrementToken() throws IOException {
-    // return the first non-stop word found
-    int skippedPositions = 0;
-    while (input.incrementToken()) {
-      if (!stopWords.contains(termAtt.buffer(), 0, termAtt.length())) {
-        if (enablePositionIncrements) {
-          posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
-        }
-        return true;
-      }
-      skippedPositions += posIncrAtt.getPositionIncrement();
-    }
-    // reached EOS -- return false
-    return false;
+  protected boolean accept() throws IOException {
+    return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
   }
-
-  /**
-   * @see #setEnablePositionIncrements(boolean)
-   */
-  public boolean getEnablePositionIncrements() {
-    return enablePositionIncrements;
-  }
-
-  /**
-   * If <code>true</code>, this StopFilter will preserve
-   * positions of the incoming tokens (ie, accumulate and
-   * set position increments of the removed stop tokens).
-   * Generally, <code>true</code> is best as it does not
-   * lose information (positions of the original tokens)
-   * during indexing.
-   *
-   * Default is true.
-   *
-   * <p> When set, when a token is stopped
-   * (omitted), the position increment of the following
-   * token is incremented.
-   *
-   * <p> <b>NOTE</b>: be sure to also
-   * set {@link QueryParser#setEnablePositionIncrements} if
-   * you use QueryParser to create queries.
-   */
-  public void setEnablePositionIncrements(boolean enable) {
-    this.enablePositionIncrements = enable;
-  }
 }
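Note that StopFilter now hardwires super(true, input), so position increments are preserved by default, while the setter inherited from FilteringTokenFilter can still disable them. A usage fragment, where the Version constant, StopAnalyzer's bundled stop set, and the import paths are assumptions about this trunk rather than part of the diff:

// Sketch only: Version.LUCENE_40 and StopAnalyzer.ENGLISH_STOP_WORDS_SET
// are assumed to exist on this trunk; adjust to your version.
TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_40,
    new StringReader("fox and the hound"));
StopFilter stops = new StopFilter(Version.LUCENE_40, ts,
    StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
stops.setEnablePositionIncrements(false); // inherited from FilteringTokenFilter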

KeepWordFilter.java

@@ -21,6 +21,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;

@@ -30,22 +31,19 @@ import org.apache.lucene.analysis.util.CharArraySet;
  *
  * @since solr 1.3
  */
-public final class KeepWordFilter extends TokenFilter {
+public final class KeepWordFilter extends FilteringTokenFilter {
   private final CharArraySet words;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
   /** The words set passed to this constructor will be directly used by this filter
    * and should not be modified, */
-  public KeepWordFilter(TokenStream in, CharArraySet words) {
-    super(in);
+  public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
+    super(enablePositionIncrements, in);
     this.words = words;
   }
 
   @Override
-  public boolean incrementToken() throws IOException {
-    while (input.incrementToken()) {
-      if (words.contains(termAtt.buffer(), 0, termAtt.length())) return true;
-    }
-    return false;
+  public boolean accept() throws IOException {
+    return words.contains(termAtt.buffer(), 0, termAtt.length());
   }
 }

LengthFilter.java

@@ -21,6 +21,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

@@ -29,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  * Note: Length is calculated as the number of UTF-16 code units.
  * </p>
  */
-public final class LengthFilter extends TokenFilter {
+public final class LengthFilter extends FilteringTokenFilter {
 
   private final int min;
   private final int max;

@@ -40,27 +41,15 @@ public final class LengthFilter extends TokenFilter {
    * Build a filter that removes words that are too long or too
    * short from the text.
    */
-  public LengthFilter(TokenStream in, int min, int max)
-  {
-    super(in);
+  public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
+    super(enablePositionIncrements, in);
     this.min = min;
     this.max = max;
   }
 
-  /**
-   * Returns the next input Token whose term() is the right len
-   */
   @Override
-  public final boolean incrementToken() throws IOException {
-    // return the first non-stop word found
-    while (input.incrementToken()) {
-      int len = termAtt.length();
-      if (len >= min && len <= max) {
-        return true;
-      }
-      // note: else we ignore it but should we index each part of it?
-    }
-    // reached EOS -- return false
-    return false;
+  public boolean accept() throws IOException {
+    final int len = termAtt.length();
+    return (len >= min && len <= max);
   }
 }

FilteringTokenFilter.java (new file)

@@ -0,0 +1,96 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.queryParser.QueryParser; // for javadoc
+
+/**
+ * Abstract base class for TokenFilters that may remove tokens.
+ * You have to implement {@link #accept} and return a boolean if the current
+ * token should be preserved. {@link #incrementToken} uses this method
+ * to decide if a token should be passed to the caller.
+ */
+public abstract class FilteringTokenFilter extends TokenFilter {
+
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
+
+  public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
+    super(input);
+    this.enablePositionIncrements = enablePositionIncrements;
+  }
+
+  /** Override this method and return if the current input token should be returned by {@link #incrementToken}. */
+  protected abstract boolean accept() throws IOException;
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (enablePositionIncrements) {
+      int skippedPositions = 0;
+      while (input.incrementToken()) {
+        if (accept()) {
+          if (skippedPositions != 0) {
+            posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+          }
+          return true;
+        }
+        skippedPositions += posIncrAtt.getPositionIncrement();
+      }
+    } else {
+      while (input.incrementToken()) {
+        if (accept()) {
+          return true;
+        }
+      }
+    }
+    // reached EOS -- return false
+    return false;
+  }
+
+  /**
+   * @see #setEnablePositionIncrements(boolean)
+   */
+  public boolean getEnablePositionIncrements() {
+    return enablePositionIncrements;
+  }
+
+  /**
+   * If <code>true</code>, this TokenFilter will preserve
+   * positions of the incoming tokens (ie, accumulate and
+   * set position increments of the removed tokens).
+   * Generally, <code>true</code> is best as it does not
+   * lose information (positions of the original tokens)
+   * during indexing.
+   *
+   * <p> When set, when a token is stopped
+   * (omitted), the position increment of the following
+   * token is incremented.
+   *
+   * <p> <b>NOTE</b>: be sure to also
+   * set {@link QueryParser#setEnablePositionIncrements} if
+   * you use QueryParser to create queries.
+   */
+  public void setEnablePositionIncrements(boolean enable) {
+    this.enablePositionIncrements = enable;
+  }
+}
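To see what the accumulation buys you, here is a quick trace in the style of the updated tests below; it reuses only APIs from this commit, with an input string chosen for illustration:

TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
    new StringReader("aa b cc d ee"));
// min=max=2 drops the single-character tokens "b" and "d"
ts = new LengthFilter(true, ts, 2, 2);
// "cc" and "ee" each follow one removed token, so their increments grow to 2;
// with enablePositionIncrements=false all three would report 1
assertTokenStreamContents(ts, new String[] { "aa", "cc", "ee" }, new int[] { 1, 2, 2 });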

TestKeepWordFilter.java

@@ -35,16 +35,26 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
     words.add( "aaa" );
     words.add( "bbb" );
 
-    String input = "aaa BBB ccc ddd EEE";
+    String input = "xxx yyy aaa zzz BBB ccc ddd EEE";
 
     // Test Stopwords
     TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
-    stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
-    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
+    stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
 
     // Now force case
     stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
-    stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
-    assertTokenStreamContents(stream, new String[] { "aaa" });
+    stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
+    assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
+
+    // Test Stopwords
+    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
+
+    // Now force case
+    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
+    assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
   }
 }

TestLengthFilter.java

@@ -24,19 +24,24 @@ import java.io.StringReader;
 
 public class TestLengthFilter extends BaseTokenStreamTestCase {
 
-  public void testFilter() throws Exception {
+  public void testFilterNoPosIncr() throws Exception {
     TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
         new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
-    LengthFilter filter = new LengthFilter(stream, 2, 6);
-    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+    LengthFilter filter = new LengthFilter(false, stream, 2, 6);
+    assertTokenStreamContents(filter,
+      new String[]{"short", "ab", "foo"},
+      new int[]{1, 1, 1}
+    );
+  }
 
-    assertTrue(filter.incrementToken());
-    assertEquals("short", termAtt.toString());
-    assertTrue(filter.incrementToken());
-    assertEquals("ab", termAtt.toString());
-    assertTrue(filter.incrementToken());
-    assertEquals("foo", termAtt.toString());
-    assertFalse(filter.incrementToken());
+  public void testFilterWithPosIncr() throws Exception {
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
+    LengthFilter filter = new LengthFilter(true, stream, 2, 6);
+    assertTokenStreamContents(filter,
+      new String[]{"short", "ab", "foo"},
+      new int[]{1, 4, 2}
+    );
   }
 }
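The expected increments in testFilterWithPosIncr fall directly out of the accumulation rule; a worked trace:

// input:  short toolong evenmuchlongertext a ab toolong foo   (min=2, max=6)
// "short" accepted, nothing skipped before it -> increment 1
// "ab"    accepted after 3 rejected tokens    -> increment 1 + 3 = 4
// "foo"   accepted after 1 rejected token     -> increment 1 + 1 = 2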

KeepWordFilterFactory.java

@@ -23,21 +23,26 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 
+import java.util.Map;
 import java.util.Set;
 import java.io.IOException;
 
 /**
  * @version $Id$
+ * @since solr 1.3
  */
 public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
 
-  private CharArraySet words;
-  private boolean ignoreCase;
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    assureMatchVersion();
+  }
 
   public void inform(ResourceLoader loader) {
     String wordFiles = args.get("words");
     ignoreCase = getBoolean("ignoreCase", false);
+    enablePositionIncrements = getBoolean("enablePositionIncrements",false);
     if (wordFiles != null) {
       try {
         words = getWordSet(loader, wordFiles, ignoreCase);

@@ -47,6 +52,10 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
     }
   }
 
+  private CharArraySet words;
+  private boolean ignoreCase;
+  private boolean enablePositionIncrements;
+
   /**
    * Set the keep word list.
    * NOTE: if ignoreCase==true, the words are expected to be lowercase
 
@@ -62,15 +71,19 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
     this.ignoreCase = ignoreCase;
   }
 
-  public KeepWordFilter create(TokenStream input) {
-    return new KeepWordFilter(input, words);
+  public boolean isEnablePositionIncrements() {
+    return enablePositionIncrements;
+  }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
   }
 
   public CharArraySet getWords() {
     return words;
   }
 
-  public boolean isIgnoreCase() {
-    return ignoreCase;
+  public KeepWordFilter create(TokenStream input) {
+    return new KeepWordFilter(enablePositionIncrements, input, words);
   }
 }
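This factory has no accompanying test in this commit, so here is a hedged fragment of its lifecycle. The "luceneMatchVersion" arg key, the word-file name, and the loader and tokenizer variables are all assumptions about the surrounding Solr plumbing, which normally builds the args map from the <filter .../> attributes in schema.xml:

// Sketch only; loader (a ResourceLoader) and tokenizer (a TokenStream)
// come from the enclosing Solr context.
Map<String, String> args = new HashMap<String, String>();
args.put("luceneMatchVersion", "LUCENE_40");  // assumed key, required by assureMatchVersion()
args.put("words", "keepwords.txt");           // illustrative file name
args.put("ignoreCase", "true");
args.put("enablePositionIncrements", "true"); // omitted -> false (back-compat)

KeepWordFilterFactory factory = new KeepWordFilterFactory();
factory.init(args);     // triggers assureMatchVersion() per the diff above
factory.inform(loader); // resolves the word file via the ResourceLoader
TokenStream filtered = factory.create(tokenizer);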

LengthFilterFactory.java

@@ -27,6 +27,7 @@ import java.util.Map;
  */
 public class LengthFilterFactory extends BaseTokenFilterFactory {
   int min,max;
+  boolean enablePositionIncrements;
   public static final String MIN_KEY = "min";
   public static final String MAX_KEY = "max";

@@ -35,8 +36,10 @@ public class LengthFilterFactory extends BaseTokenFilterFactory {
     super.init(args);
     min=Integer.parseInt(args.get(MIN_KEY));
     max=Integer.parseInt(args.get(MAX_KEY));
+    enablePositionIncrements = getBoolean("enablePositionIncrements",false);
   }
 
   public LengthFilter create(TokenStream input) {
-    return new LengthFilter(input,min,max);
+    return new LengthFilter(enablePositionIncrements, input,min,max);
   }
 }

LengthFilterTest.java

@@ -31,9 +31,19 @@ public class LengthFilterTest extends BaseTokenTestCase {
     Map<String, String> args = new HashMap<String, String>();
     args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
     args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
+    // default: args.put("enablePositionIncrements", "false");
     factory.init(args);
     String test = "foo foobar super-duper-trooper";
     TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
-    assertTokenStreamContents(stream, new String[] { "foobar" });
+    assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 });
+
+    factory = new LengthFilterFactory();
+    args = new HashMap<String, String>();
+    args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
+    args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
+    args.put("enablePositionIncrements", "true");
+    factory.init(args);
+    stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
+    assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 });
   }
 }