mirror of https://github.com/apache/lucene.git
LUCENE-1253: LengthFilter (and Solr's KeepWordTokenFilter) now require up front specification of enablePositionIncrement. Together with StopFilter they have a common base class (FilteringTokenFilter) that handles the position increments automatically. Implementors only need to override an accept() method that filters tokens
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1065343 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
277dfa0e88
commit
e7088279f7
|
@ -643,6 +643,12 @@ API Changes
|
|||
deletes remain buffered so that the next time you open an NRT reader
|
||||
and pass true, all deletes will be applied. (Mike McCandless)
|
||||
|
||||
* LUCENE-1253: LengthFilter (and Solr's KeepWordTokenFilter) now
|
||||
require up front specification of enablePositionIncrement. Together with
|
||||
StopFilter they have a common base class (FilteringTokenFilter) that handles
|
||||
the position increments automatically. Implementors only need to override an
|
||||
accept() method that filters tokens. (Uwe Schindler, Robert Muir)
|
||||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-2249: ParallelMultiSearcher should shut down thread pool on
|
||||
|
|
|
@ -22,10 +22,9 @@ import java.util.Arrays;
|
|||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.queryParser.QueryParser;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
@ -42,14 +41,10 @@ import org.apache.lucene.util.Version;
|
|||
* increments are preserved
|
||||
* </ul>
|
||||
*/
|
||||
public final class StopFilter extends TokenFilter {
|
||||
public final class StopFilter extends FilteringTokenFilter {
|
||||
|
||||
private final CharArraySet stopWords;
|
||||
private boolean enablePositionIncrements = true;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
|
||||
/**
|
||||
* Construct a token stream filtering the given input. If
|
||||
|
@ -75,7 +70,7 @@ public final class StopFilter extends TokenFilter {
|
|||
*/
|
||||
public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
|
||||
{
|
||||
super(input);
|
||||
super(true, input);
|
||||
this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet) stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
|
||||
}
|
||||
|
||||
|
@ -157,48 +152,8 @@ public final class StopFilter extends TokenFilter {
|
|||
* Returns the next input Token whose term() is not a stop word.
|
||||
*/
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
// return the first non-stop word found
|
||||
int skippedPositions = 0;
|
||||
while (input.incrementToken()) {
|
||||
if (!stopWords.contains(termAtt.buffer(), 0, termAtt.length())) {
|
||||
if (enablePositionIncrements) {
|
||||
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
skippedPositions += posIncrAtt.getPositionIncrement();
|
||||
}
|
||||
// reached EOS -- return false
|
||||
return false;
|
||||
protected boolean accept() throws IOException {
|
||||
return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #setEnablePositionIncrements(boolean)
|
||||
*/
|
||||
public boolean getEnablePositionIncrements() {
|
||||
return enablePositionIncrements;
|
||||
}
|
||||
|
||||
/**
|
||||
* If <code>true</code>, this StopFilter will preserve
|
||||
* positions of the incoming tokens (ie, accumulate and
|
||||
* set position increments of the removed stop tokens).
|
||||
* Generally, <code>true</code> is best as it does not
|
||||
* lose information (positions of the original tokens)
|
||||
* during indexing.
|
||||
*
|
||||
* Default is true.
|
||||
*
|
||||
* <p> When set, when a token is stopped
|
||||
* (omitted), the position increment of the following
|
||||
* token is incremented.
|
||||
*
|
||||
* <p> <b>NOTE</b>: be sure to also
|
||||
* set {@link QueryParser#setEnablePositionIncrements} if
|
||||
* you use QueryParser to create queries.
|
||||
*/
|
||||
public void setEnablePositionIncrements(boolean enable) {
|
||||
this.enablePositionIncrements = enable;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
|
@ -30,22 +31,19 @@ import org.apache.lucene.analysis.util.CharArraySet;
|
|||
*
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public final class KeepWordFilter extends TokenFilter {
|
||||
public final class KeepWordFilter extends FilteringTokenFilter {
|
||||
private final CharArraySet words;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
/** The words set passed to this constructor will be directly used by this filter
|
||||
* and should not be modified, */
|
||||
public KeepWordFilter(TokenStream in, CharArraySet words) {
|
||||
super(in);
|
||||
public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
|
||||
super(enablePositionIncrements, in);
|
||||
this.words = words;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (input.incrementToken()) {
|
||||
if (words.contains(termAtt.buffer(), 0, termAtt.length())) return true;
|
||||
}
|
||||
return false;
|
||||
public boolean accept() throws IOException {
|
||||
return words.contains(termAtt.buffer(), 0, termAtt.length());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.util.FilteringTokenFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/**
|
||||
|
@ -29,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
* Note: Length is calculated as the number of UTF-16 code units.
|
||||
* </p>
|
||||
*/
|
||||
public final class LengthFilter extends TokenFilter {
|
||||
public final class LengthFilter extends FilteringTokenFilter {
|
||||
|
||||
private final int min;
|
||||
private final int max;
|
||||
|
@ -40,27 +41,15 @@ public final class LengthFilter extends TokenFilter {
|
|||
* Build a filter that removes words that are too long or too
|
||||
* short from the text.
|
||||
*/
|
||||
public LengthFilter(TokenStream in, int min, int max)
|
||||
{
|
||||
super(in);
|
||||
public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
|
||||
super(enablePositionIncrements, in);
|
||||
this.min = min;
|
||||
this.max = max;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next input Token whose term() is the right len
|
||||
*/
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
// return the first non-stop word found
|
||||
while (input.incrementToken()) {
|
||||
int len = termAtt.length();
|
||||
if (len >= min && len <= max) {
|
||||
return true;
|
||||
}
|
||||
// note: else we ignore it but should we index each part of it?
|
||||
}
|
||||
// reached EOS -- return false
|
||||
return false;
|
||||
public boolean accept() throws IOException {
|
||||
final int len = termAtt.length();
|
||||
return (len >= min && len <= max);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
package org.apache.lucene.analysis.util;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.queryParser.QueryParser; // for javadoc
|
||||
|
||||
/**
|
||||
* Abstract base class for TokenFilters that may remove tokens.
|
||||
* You have to implement {@link #accept} and return a boolean if the current
|
||||
* token should be preserved. {@link #incrementToken} uses this method
|
||||
* to decide if a token should be passed to the caller.
|
||||
*/
|
||||
public abstract class FilteringTokenFilter extends TokenFilter {
|
||||
|
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
|
||||
|
||||
public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
|
||||
super(input);
|
||||
this.enablePositionIncrements = enablePositionIncrements;
|
||||
}
|
||||
|
||||
/** Override this method and return if the current input token should be returned by {@link #incrementToken}. */
|
||||
protected abstract boolean accept() throws IOException;
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (enablePositionIncrements) {
|
||||
int skippedPositions = 0;
|
||||
while (input.incrementToken()) {
|
||||
if (accept()) {
|
||||
if (skippedPositions != 0) {
|
||||
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
skippedPositions += posIncrAtt.getPositionIncrement();
|
||||
}
|
||||
} else {
|
||||
while (input.incrementToken()) {
|
||||
if (accept()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// reached EOS -- return false
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #setEnablePositionIncrements(boolean)
|
||||
*/
|
||||
public boolean getEnablePositionIncrements() {
|
||||
return enablePositionIncrements;
|
||||
}
|
||||
|
||||
/**
|
||||
* If <code>true</code>, this TokenFilter will preserve
|
||||
* positions of the incoming tokens (ie, accumulate and
|
||||
* set position increments of the removed tokens).
|
||||
* Generally, <code>true</code> is best as it does not
|
||||
* lose information (positions of the original tokens)
|
||||
* during indexing.
|
||||
*
|
||||
* <p> When set, when a token is stopped
|
||||
* (omitted), the position increment of the following
|
||||
* token is incremented.
|
||||
*
|
||||
* <p> <b>NOTE</b>: be sure to also
|
||||
* set {@link QueryParser#setEnablePositionIncrements} if
|
||||
* you use QueryParser to create queries.
|
||||
*/
|
||||
public void setEnablePositionIncrements(boolean enable) {
|
||||
this.enablePositionIncrements = enable;
|
||||
}
|
||||
}
|
|
@ -35,16 +35,26 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
|
|||
words.add( "aaa" );
|
||||
words.add( "bbb" );
|
||||
|
||||
String input = "aaa BBB ccc ddd EEE";
|
||||
String input = "xxx yyy aaa zzz BBB ccc ddd EEE";
|
||||
|
||||
// Test Stopwords
|
||||
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
|
||||
stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
|
||||
|
||||
// Now force case
|
||||
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa" });
|
||||
stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
|
||||
|
||||
// Test Stopwords
|
||||
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
|
||||
|
||||
// Now force case
|
||||
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
|
||||
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
|
||||
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,19 +24,24 @@ import java.io.StringReader;
|
|||
|
||||
public class TestLengthFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testFilter() throws Exception {
|
||||
public void testFilterNoPosIncr() throws Exception {
|
||||
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
|
||||
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
|
||||
LengthFilter filter = new LengthFilter(stream, 2, 6);
|
||||
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
|
||||
LengthFilter filter = new LengthFilter(false, stream, 2, 6);
|
||||
assertTokenStreamContents(filter,
|
||||
new String[]{"short", "ab", "foo"},
|
||||
new int[]{1, 1, 1}
|
||||
);
|
||||
}
|
||||
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("short", termAtt.toString());
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("ab", termAtt.toString());
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("foo", termAtt.toString());
|
||||
assertFalse(filter.incrementToken());
|
||||
public void testFilterWithPosIncr() throws Exception {
|
||||
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
|
||||
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
|
||||
LengthFilter filter = new LengthFilter(true, stream, 2, 6);
|
||||
assertTokenStreamContents(filter,
|
||||
new String[]{"short", "ab", "foo"},
|
||||
new int[]{1, 4, 2}
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -23,22 +23,27 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* @version $Id$
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
|
||||
private CharArraySet words;
|
||||
private boolean ignoreCase;
|
||||
@Override
|
||||
public void init(Map<String,String> args) {
|
||||
super.init(args);
|
||||
assureMatchVersion();
|
||||
}
|
||||
|
||||
public void inform(ResourceLoader loader) {
|
||||
String wordFiles = args.get("words");
|
||||
ignoreCase = getBoolean("ignoreCase", false);
|
||||
if (wordFiles != null) {
|
||||
enablePositionIncrements = getBoolean("enablePositionIncrements",false);
|
||||
|
||||
if (wordFiles != null) {
|
||||
try {
|
||||
words = getWordSet(loader, wordFiles, ignoreCase);
|
||||
} catch (IOException e) {
|
||||
|
@ -47,6 +52,10 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
|
|||
}
|
||||
}
|
||||
|
||||
private CharArraySet words;
|
||||
private boolean ignoreCase;
|
||||
private boolean enablePositionIncrements;
|
||||
|
||||
/**
|
||||
* Set the keep word list.
|
||||
* NOTE: if ignoreCase==true, the words are expected to be lowercase
|
||||
|
@ -62,15 +71,19 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements Res
|
|||
this.ignoreCase = ignoreCase;
|
||||
}
|
||||
|
||||
public KeepWordFilter create(TokenStream input) {
|
||||
return new KeepWordFilter(input, words);
|
||||
public boolean isEnablePositionIncrements() {
|
||||
return enablePositionIncrements;
|
||||
}
|
||||
|
||||
public boolean isIgnoreCase() {
|
||||
return ignoreCase;
|
||||
}
|
||||
|
||||
public CharArraySet getWords() {
|
||||
return words;
|
||||
}
|
||||
|
||||
public boolean isIgnoreCase() {
|
||||
return ignoreCase;
|
||||
public KeepWordFilter create(TokenStream input) {
|
||||
return new KeepWordFilter(enablePositionIncrements, input, words);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,6 +27,7 @@ import java.util.Map;
|
|||
*/
|
||||
public class LengthFilterFactory extends BaseTokenFilterFactory {
|
||||
int min,max;
|
||||
boolean enablePositionIncrements;
|
||||
public static final String MIN_KEY = "min";
|
||||
public static final String MAX_KEY = "max";
|
||||
|
||||
|
@ -35,8 +36,10 @@ public class LengthFilterFactory extends BaseTokenFilterFactory {
|
|||
super.init(args);
|
||||
min=Integer.parseInt(args.get(MIN_KEY));
|
||||
max=Integer.parseInt(args.get(MAX_KEY));
|
||||
enablePositionIncrements = getBoolean("enablePositionIncrements",false);
|
||||
}
|
||||
|
||||
public LengthFilter create(TokenStream input) {
|
||||
return new LengthFilter(input,min,max);
|
||||
return new LengthFilter(enablePositionIncrements, input,min,max);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,9 +31,19 @@ public class LengthFilterTest extends BaseTokenTestCase {
|
|||
Map<String, String> args = new HashMap<String, String>();
|
||||
args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
|
||||
args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
|
||||
// default: args.put("enablePositionIncrements", "false");
|
||||
factory.init(args);
|
||||
String test = "foo foobar super-duper-trooper";
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
|
||||
assertTokenStreamContents(stream, new String[] { "foobar" });
|
||||
assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 });
|
||||
|
||||
factory = new LengthFilterFactory();
|
||||
args = new HashMap<String, String>();
|
||||
args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
|
||||
args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
|
||||
args.put("enablePositionIncrements", "true");
|
||||
factory.init(args);
|
||||
stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
|
||||
assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 });
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue