LUCENE-1253: LengthFilter (and Solr's KeepWordFilter) now require up-front specification of enablePositionIncrements. Together with StopFilter they have a common base class (FilteringTokenFilter) that handles the position increments automatically. Implementors only need to override an accept() method that filters tokens.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1065343 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2011-01-30 18:30:34 +00:00
parent 277dfa0e88
commit e7088279f7
10 changed files with 186 additions and 101 deletions
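The new contract at a glance: subclasses pass the enablePositionIncrements flag up to FilteringTokenFilter and implement only accept(). A minimal sketch of an implementor's filter; the DigitStartFilter name and its keep-rule are hypothetical, not part of this commit:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.FilteringTokenFilter;

    /** Hypothetical example: removes tokens whose first character is a digit. */
    public final class DigitStartFilter extends FilteringTokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

      public DigitStartFilter(boolean enablePositionIncrements, TokenStream in) {
        super(enablePositionIncrements, in); // the base class does all position-increment bookkeeping
      }

      @Override
      protected boolean accept() throws IOException {
        // keep the token unless it starts with a digit
        return termAtt.length() == 0 || !Character.isDigit(termAtt.charAt(0));
      }
    }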

View File

@@ -643,6 +643,12 @@ API Changes
deletes remain buffered so that the next time you open an NRT reader
and pass true, all deletes will be applied. (Mike McCandless)
* LUCENE-1253: LengthFilter (and Solr's KeepWordFilter) now
require up-front specification of enablePositionIncrements. Together with
StopFilter they have a common base class (FilteringTokenFilter) that handles
the position increments automatically. Implementors only need to override an
accept() method that filters tokens. (Uwe Schindler, Robert Muir)
Bug fixes
* LUCENE-2249: ParallelMultiSearcher should shut down thread pool on

View File

@@ -22,10 +22,9 @@ import java.util.Arrays;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.Version;
@@ -42,14 +41,10 @@ import org.apache.lucene.util.Version;
* increments are preserved
* </ul>
*/
public final class StopFilter extends TokenFilter {
public final class StopFilter extends FilteringTokenFilter {
private final CharArraySet stopWords;
private boolean enablePositionIncrements = true;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Construct a token stream filtering the given input. If
@@ -75,7 +70,7 @@ public final class StopFilter extends TokenFilter {
*/
public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
{
super(input);
super(true, input);
this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet) stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
}
@@ -157,48 +152,8 @@ public final class StopFilter extends TokenFilter {
* Returns the next input Token whose term() is not a stop word.
*/
@Override
public final boolean incrementToken() throws IOException {
// return the first non-stop word found
int skippedPositions = 0;
while (input.incrementToken()) {
if (!stopWords.contains(termAtt.buffer(), 0, termAtt.length())) {
if (enablePositionIncrements) {
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
return true;
}
skippedPositions += posIncrAtt.getPositionIncrement();
}
// reached EOS -- return false
return false;
protected boolean accept() throws IOException {
return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
}
/**
* @see #setEnablePositionIncrements(boolean)
*/
public boolean getEnablePositionIncrements() {
return enablePositionIncrements;
}
/**
* If <code>true</code>, this StopFilter will preserve
* positions of the incoming tokens (ie, accumulate and
* set position increments of the removed stop tokens).
* Generally, <code>true</code> is best as it does not
* lose information (positions of the original tokens)
* during indexing.
*
* Default is true.
*
* <p> When set, when a token is stopped
* (omitted), the position increment of the following
* token is incremented.
*
* <p> <b>NOTE</b>: be sure to also
* set {@link QueryParser#setEnablePositionIncrements} if
* you use QueryParser to create queries.
*/
public void setEnablePositionIncrements(boolean enable) {
this.enablePositionIncrements = enable;
}
}
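Note that StopFilter now hard-wires true into the base constructor, so stopped positions are preserved by default, while the getter/setter pair removed above lives on in FilteringTokenFilter. A fragment assumed to sit inside an Analyzer's tokenStream(String, Reader) override (reader is the method's Reader argument; LUCENE_CURRENT stands in for a concrete match version):

    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
    StopFilter stop = new StopFilter(Version.LUCENE_CURRENT, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET, false);
    stop.setEnablePositionIncrements(false); // inherited from FilteringTokenFilter; on by default here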

View File

@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
@@ -30,22 +31,19 @@ import org.apache.lucene.analysis.util.CharArraySet;
*
* @since solr 1.3
*/
public final class KeepWordFilter extends TokenFilter {
public final class KeepWordFilter extends FilteringTokenFilter {
private final CharArraySet words;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/** The words set passed to this constructor will be directly used by this filter
* and should not be modified. */
public KeepWordFilter(TokenStream in, CharArraySet words) {
super(in);
public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
super(enablePositionIncrements, in);
this.words = words;
}
@Override
public boolean incrementToken() throws IOException {
while (input.incrementToken()) {
if (words.contains(termAtt.buffer(), 0, termAtt.length())) return true;
}
return false;
public boolean accept() throws IOException {
return words.contains(termAtt.buffer(), 0, termAtt.length());
}
}
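The flag is now the first constructor argument. A fragment in the style of the tests further down (words is an assumed Set<String> of terms to keep; TEST_VERSION_CURRENT comes from the test base class):

    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("aaa BBB ccc"));
    ts = new KeepWordFilter(true, ts, new CharArraySet(TEST_VERSION_CURRENT, words, true)); // true = preserve positions of dropped tokens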

View File

@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
@@ -29,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* Note: Length is calculated as the number of UTF-16 code units.
* </p>
*/
public final class LengthFilter extends TokenFilter {
public final class LengthFilter extends FilteringTokenFilter {
private final int min;
private final int max;
@@ -40,27 +41,15 @@ public final class LengthFilter extends TokenFilter {
* Build a filter that removes words that are too long or too
* short from the text.
*/
public LengthFilter(TokenStream in, int min, int max)
{
super(in);
public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
super(enablePositionIncrements, in);
this.min = min;
this.max = max;
}
/**
* Returns the next input Token whose term() is the right len
*/
@Override
public final boolean incrementToken() throws IOException {
// return the first non-stop word found
while (input.incrementToken()) {
int len = termAtt.length();
if (len >= min && len <= max) {
return true;
}
// note: else we ignore it but should we index each part of it?
}
// reached EOS -- return false
return false;
public boolean accept() throws IOException {
final int len = termAtt.length();
return (len >= min && len <= max);
}
}
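Same pattern here; with the flag set, the first kept token after a gap absorbs the increments of the dropped ones. A fragment under the same assumptions as above:

    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("a toolong ok"));
    ts = new LengthFilter(true, ts, 2, 6); // keeps terms of 2..6 UTF-16 code units; "ok" is emitted with increment 3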

View File

@@ -0,0 +1,96 @@
package org.apache.lucene.analysis.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.queryParser.QueryParser; // for javadoc
/**
* Abstract base class for TokenFilters that may remove tokens.
* You have to implement {@link #accept} and return true if the current
* token should be preserved. {@link #incrementToken} uses this method
* to decide if a token should be passed to the caller.
*/
public abstract class FilteringTokenFilter extends TokenFilter {
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
super(input);
this.enablePositionIncrements = enablePositionIncrements;
}
/** Override this method and return true if the current input token should be returned by {@link #incrementToken}. */
protected abstract boolean accept() throws IOException;
@Override
public final boolean incrementToken() throws IOException {
if (enablePositionIncrements) {
int skippedPositions = 0;
while (input.incrementToken()) {
if (accept()) {
if (skippedPositions != 0) {
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
return true;
}
skippedPositions += posIncrAtt.getPositionIncrement();
}
} else {
while (input.incrementToken()) {
if (accept()) {
return true;
}
}
}
// reached EOS -- return false
return false;
}
/**
* @see #setEnablePositionIncrements(boolean)
*/
public boolean getEnablePositionIncrements() {
return enablePositionIncrements;
}
/**
* If <code>true</code>, this TokenFilter will preserve
* positions of the incoming tokens (ie, accumulate and
* set position increments of the removed tokens).
* Generally, <code>true</code> is best as it does not
* lose information (positions of the original tokens)
* during indexing.
*
* <p> When set, when a token is stopped
* (omitted), the position increment of the following
* token is incremented.
*
* <p> <b>NOTE</b>: be sure to also
* set {@link QueryParser#setEnablePositionIncrements} if
* you use QueryParser to create queries.
*/
public void setEnablePositionIncrements(boolean enable) {
this.enablePositionIncrements = enable;
}
}
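Because incrementToken() is final, every subclass shares this skipped-position bookkeeping, and the behaviour can still be flipped after construction. A two-line sketch (input is an assumed TokenStream):

    LengthFilter f = new LengthFilter(true, input, 2, 6);
    f.setEnablePositionIncrements(false); // kept tokens now report only their own increment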

View File

@@ -35,16 +35,26 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
words.add( "aaa" );
words.add( "bbb" );
String input = "aaa BBB ccc ddd EEE";
String input = "xxx yyy aaa zzz BBB ccc ddd EEE";
// Test Stopwords
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
// Now force case
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
assertTokenStreamContents(stream, new String[] { "aaa" });
stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
// Test Stopwords
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
// Now force case
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
}
}
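To spell out the expected increments: with input "xxx yyy aaa zzz BBB ccc ddd EEE" and enablePositionIncrements=true, "aaa" reports 3 (its own 1 plus the dropped "xxx" and "yyy") and "BBB" reports 2 (1 plus the dropped "zzz"); with false, every kept token reports 1 and the gaps are lost.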

View File

@@ -24,19 +24,24 @@ import java.io.StringReader;
public class TestLengthFilter extends BaseTokenStreamTestCase {
public void testFilter() throws Exception {
public void testFilterNoPosIncr() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
LengthFilter filter = new LengthFilter(stream, 2, 6);
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
LengthFilter filter = new LengthFilter(false, stream, 2, 6);
assertTokenStreamContents(filter,
new String[]{"short", "ab", "foo"},
new int[]{1, 1, 1}
);
}
assertTrue(filter.incrementToken());
assertEquals("short", termAtt.toString());
assertTrue(filter.incrementToken());
assertEquals("ab", termAtt.toString());
assertTrue(filter.incrementToken());
assertEquals("foo", termAtt.toString());
assertFalse(filter.incrementToken());
public void testFilterWithPosIncr() throws Exception {
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
LengthFilter filter = new LengthFilter(true, stream, 2, 6);
assertTokenStreamContents(filter,
new String[]{"short", "ab", "foo"},
new int[]{1, 4, 2}
);
}
}
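The increments in the second test follow the same rule: "short" reports 1, "ab" reports 4 (1 plus the dropped "toolong", "evenmuchlongertext" and "a"), and "foo" reports 2 (1 plus the dropped second "toolong").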

View File

@@ -23,21 +23,26 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import java.util.Map;
import java.util.Set;
import java.io.IOException;
/**
* @version $Id$
* @since solr 1.3
*/
public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
private CharArraySet words;
private boolean ignoreCase;
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public void inform(ResourceLoader loader) {
String wordFiles = args.get("words");
ignoreCase = getBoolean("ignoreCase", false);
enablePositionIncrements = getBoolean("enablePositionIncrements",false);
if (wordFiles != null) {
try {
words = getWordSet(loader, wordFiles, ignoreCase);
@@ -47,6 +52,10 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware
}
}
private CharArraySet words;
private boolean ignoreCase;
private boolean enablePositionIncrements;
/**
* Set the keep word list.
* NOTE: if ignoreCase==true, the words are expected to be lowercase
@@ -62,15 +71,19 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware
this.ignoreCase = ignoreCase;
}
public KeepWordFilter create(TokenStream input) {
return new KeepWordFilter(input, words);
public boolean isEnablePositionIncrements() {
return enablePositionIncrements;
}
public boolean isIgnoreCase() {
return ignoreCase;
}
public CharArraySet getWords() {
return words;
}
public boolean isIgnoreCase() {
return ignoreCase;
public KeepWordFilter create(TokenStream input) {
return new KeepWordFilter(enablePositionIncrements, input, words);
}
}
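For Solr users the new argument surfaces as a filter attribute in schema.xml, presumably in the usual form <filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="true" enablePositionIncrements="true"/>; per the getBoolean call above it defaults to false when omitted.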

View File

@@ -27,6 +27,7 @@ import java.util.Map;
*/
public class LengthFilterFactory extends BaseTokenFilterFactory {
int min,max;
boolean enablePositionIncrements;
public static final String MIN_KEY = "min";
public static final String MAX_KEY = "max";
@@ -35,8 +36,10 @@ public class LengthFilterFactory extends BaseTokenFilterFactory {
super.init(args);
min=Integer.parseInt(args.get(MIN_KEY));
max=Integer.parseInt(args.get(MAX_KEY));
enablePositionIncrements = getBoolean("enablePositionIncrements",false);
}
public LengthFilter create(TokenStream input) {
return new LengthFilter(input,min,max);
return new LengthFilter(enablePositionIncrements, input,min,max);
}
}
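The schema.xml counterpart here would be <filter class="solr.LengthFilterFactory" min="4" max="10" enablePositionIncrements="true"/>, again defaulting to false when the attribute is omitted; the test below exercises both settings.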

View File

@@ -31,9 +31,19 @@ public class LengthFilterTest extends BaseTokenTestCase {
Map<String, String> args = new HashMap<String, String>();
args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
// default: args.put("enablePositionIncrements", "false");
factory.init(args);
String test = "foo foobar super-duper-trooper";
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
assertTokenStreamContents(stream, new String[] { "foobar" });
assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 });
factory = new LengthFilterFactory();
args = new HashMap<String, String>();
args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
args.put("enablePositionIncrements", "true");
factory.init(args);
stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 });
}
}