LUCENE-4843: Add LimitTokenPositionFilter: don't emit tokens with positions that exceed the configured limit

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457572 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2013-03-17 22:09:44 +00:00
parent 4ba86e84ae
commit 42dd280c97
6 changed files with 328 additions and 0 deletions

View File

@ -86,6 +86,9 @@ New Features
per-IndexReader-open to compute its ordinal map, but it requires no per-IndexReader-open to compute its ordinal map, but it requires no
taxonomy index and it tie-breaks facet labels in an understandable taxonomy index and it tie-breaks facet labels in an understandable
(by Unicode sort order) way. (Robert Muir, Mike McCandless) (by Unicode sort order) way. (Robert Muir, Mike McCandless)
* LUCENE-4843: Add LimitTokenPositionFilter: don't emit tokens with
positions that exceed the configured limit. (Steve Rowe)
Optimizations Optimizations

View File

@ -0,0 +1,100 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import java.io.IOException;
/**
* This TokenFilter limits its emitted tokens to those with positions that
* are not greater than the configured limit.
* <p>
* By default, this filter ignores any tokens in the wrapped {@code TokenStream}
* once the limit has been exceeded, which can result in {@code reset()} being
* called prior to {@code incrementToken()} returning {@code false}. For most
* {@code TokenStream} implementations this should be acceptable, and faster
* then consuming the full stream. If you are wrapping a {@code TokenStream}
* which requires that the full stream of tokens be exhausted in order to
* function properly, use the
* {@link #LimitTokenPositionFilter(TokenStream,int,boolean) consumeAllTokens}
* option.
*/
public final class LimitTokenPositionFilter extends TokenFilter {
private final int maxTokenPosition;
private final boolean consumeAllTokens;
private int tokenPosition = 0;
private boolean exhausted = false;
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Build a filter that only accepts tokens up to and including the given maximum position.
* This filter will not consume any tokens with position greater than the maxTokenPosition limit.
* @param in the stream to wrap
* @param maxTokenPosition max position of tokens to produce (1st token always has position 1)
*
* @see #LimitTokenPositionFilter(TokenStream,int,boolean)
*/
public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition) {
this(in, maxTokenPosition, false);
}
/**
* Build a filter that limits the maximum position of tokens to emit.
*
* @param in the stream to wrap
* @param maxTokenPosition max position of tokens to produce (1st token always has position 1)
* @param consumeAllTokens whether all tokens from the wrapped input stream must be consumed
* even if maxTokenPosition is exceeded.
*/
public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition, boolean consumeAllTokens) {
super(in);
this.maxTokenPosition = maxTokenPosition;
this.consumeAllTokens = consumeAllTokens;
}
@Override
public boolean incrementToken() throws IOException {
if (exhausted) {
return false;
}
if (input.incrementToken()) {
tokenPosition += posIncAtt.getPositionIncrement();
if (tokenPosition <= maxTokenPosition) {
return true;
} else {
while (consumeAllTokens && input.incrementToken()) { /* NOOP */ }
exhausted = true;
return false;
}
} else {
exhausted = true;
return false;
}
}
@Override
public void reset() throws IOException {
super.reset();
tokenPosition = 0;
exhausted = false;
}
}

View File

@ -0,0 +1,56 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link LimitTokenPositionFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_limit_pos" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.LimitTokenPositionFilterFactory" maxTokenPosition="3" consumeAllTokens="false" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* <p>
* The {@code consumeAllTokens} property is optional and defaults to {@code false}.
* See {@link LimitTokenPositionFilter} for an explanation of its use.
*/
public class LimitTokenPositionFilterFactory extends TokenFilterFactory {
public static final String MAX_TOKEN_POSITION_KEY = "maxTokenPosition";
public static final String CONSUME_ALL_TOKENS_KEY = "consumeAllTokens";
int maxTokenPosition;
boolean consumeAllTokens;
@Override
public void init(Map<String,String> args) {
super.init(args);
maxTokenPosition = getInt(MAX_TOKEN_POSITION_KEY);
consumeAllTokens = getBoolean(CONSUME_ALL_TOKENS_KEY, false);
}
@Override
public TokenStream create(TokenStream input) {
return new LimitTokenPositionFilter(input, maxTokenPosition, consumeAllTokens);
}
}

View File

@ -61,6 +61,7 @@ org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
org.apache.lucene.analysis.miscellaneous.LengthFilterFactory org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory org.apache.lucene.analysis.miscellaneous.TrimFilterFactory

View File

@ -0,0 +1,84 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
public class TestLimitTokenPositionFilter extends BaseTokenStreamTestCase {
public void testMaxPosition2() throws IOException {
for (final boolean consumeAll : new boolean[] { true, false }) {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
// if we are consuming all tokens, we can use the checks, otherwise we can't
tokenizer.setEnableChecks(consumeAll);
return new TokenStreamComponents(tokenizer, new LimitTokenPositionFilter(tokenizer, 2, consumeAll));
}
};
// dont use assertAnalyzesTo here, as the end offset is not the end of the string (unless consumeAll is true, in which case its correct)!
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")),
new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 16 : null);
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")),
new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, consumeAll ? 9 : null);
// less than the limit, ensure we behave correctly
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 ")),
new String[] { "1" }, new int[] { 0 }, new int[] { 1 }, consumeAll ? 3 : null);
// equal to limit
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 ")),
new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 6 : null);
}
}
public void testMaxPosition3WithSynomyms() throws IOException {
MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false);
tokenizer.setEnableChecks(false); // LimitTokenPositionFilter doesn't consume the entire stream that it wraps
SynonymMap.Builder builder = new SynonymMap.Builder(true);
builder.add(new CharsRef("one"), new CharsRef("first"), true);
builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
CharsRef multiWordCharsRef = new CharsRef();
SynonymMap.Builder.join(new String[] { "and", "indubitably", "single", "only" }, multiWordCharsRef);
builder.add(new CharsRef("one"), multiWordCharsRef, true);
SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
builder.add(new CharsRef("two"), multiWordCharsRef, true);
SynonymMap synonymMap = builder.build();
TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
stream = new LimitTokenPositionFilter(stream, 3); // consumeAllTokens defaults to false
// "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
assertTokenStreamContents(stream,
new String[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" },
new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 });
}
}

View File

@ -0,0 +1,84 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
public class TestLimitTokenPositionFilterFactory extends BaseTokenStreamTestCase {
public void testMaxPosition1() throws IOException {
LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
Map<String, String> args = new HashMap<String, String>();
args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1");
factory.init(args);
String test = "A1 B2 C3 D4 E5 F6";
MockTokenizer tok = new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false);
// LimitTokenPositionFilter doesn't consume the entire stream that it wraps
tok.setEnableChecks(false);
TokenStream stream = factory.create(tok);
assertTokenStreamContents(stream, new String[] { "A1" });
}
public void testMissingParam() {
LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
Map<String, String> args = new HashMap<String, String>();
IllegalArgumentException iae = null;
try {
factory.init(args);
} catch (IllegalArgumentException e) {
assertTrue("exception doesn't mention param: " + e.getMessage(),
0 < e.getMessage().indexOf(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY));
iae = e;
}
assertNotNull("no exception thrown", iae);
}
public void testMaxPosition1WithShingles() throws IOException {
LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
Map<String, String> args = new HashMap<String, String>();
args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1");
factory.init(args);
String input = "one two three four five";
MockTokenizer tok = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
// LimitTokenPositionFilter doesn't consume the entire stream that it wraps
tok.setEnableChecks(false);
ShingleFilter shingleFilter = new ShingleFilter(tok, 2, 3);
shingleFilter.setOutputUnigrams(true);
TokenStream stream = factory.create(shingleFilter);
assertTokenStreamContents(stream, new String[] { "one", "one two", "one two three" });
}
public void testConsumeAllTokens() throws IOException {
LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
Map<String, String> args = new HashMap<String, String>();
args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "3");
args.put(LimitTokenPositionFilterFactory.CONSUME_ALL_TOKENS_KEY, "true");
factory.init(args);
String test = "A1 B2 C3 D4 E5 F6";
MockTokenizer tok = new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false);
TokenStream stream = factory.create(tok);
assertTokenStreamContents(stream, new String[] { "A1", "B2", "C3" });
}
}