mirror of https://github.com/apache/lucene.git
LUCENE-4843: Add LimitTokenPositionFilter: don't emit tokens with positions that exceed the configured limit
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457572 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4ba86e84ae
commit
42dd280c97
|
@ -87,6 +87,9 @@ New Features
|
||||||
taxonomy index and it tie-breaks facet labels in an understandable
|
taxonomy index and it tie-breaks facet labels in an understandable
|
||||||
(by Unicode sort order) way. (Robert Muir, Mike McCandless)
|
(by Unicode sort order) way. (Robert Muir, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-4843: Add LimitTokenPositionFilter: don't emit tokens with
|
||||||
|
positions that exceed the configured limit. (Steve Rowe)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the
|
* LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the
|
||||||
|
|
|
@ -0,0 +1,100 @@
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This TokenFilter limits its emitted tokens to those with positions that
|
||||||
|
* are not greater than the configured limit.
|
||||||
|
* <p>
|
||||||
|
* By default, this filter ignores any tokens in the wrapped {@code TokenStream}
|
||||||
|
* once the limit has been exceeded, which can result in {@code reset()} being
|
||||||
|
* called prior to {@code incrementToken()} returning {@code false}. For most
|
||||||
|
* {@code TokenStream} implementations this should be acceptable, and faster
|
||||||
|
* then consuming the full stream. If you are wrapping a {@code TokenStream}
|
||||||
|
* which requires that the full stream of tokens be exhausted in order to
|
||||||
|
* function properly, use the
|
||||||
|
* {@link #LimitTokenPositionFilter(TokenStream,int,boolean) consumeAllTokens}
|
||||||
|
* option.
|
||||||
|
*/
|
||||||
|
public final class LimitTokenPositionFilter extends TokenFilter {
|
||||||
|
|
||||||
|
private final int maxTokenPosition;
|
||||||
|
private final boolean consumeAllTokens;
|
||||||
|
private int tokenPosition = 0;
|
||||||
|
private boolean exhausted = false;
|
||||||
|
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Build a filter that only accepts tokens up to and including the given maximum position.
|
||||||
|
* This filter will not consume any tokens with position greater than the maxTokenPosition limit.
|
||||||
|
|
||||||
|
* @param in the stream to wrap
|
||||||
|
* @param maxTokenPosition max position of tokens to produce (1st token always has position 1)
|
||||||
|
*
|
||||||
|
* @see #LimitTokenPositionFilter(TokenStream,int,boolean)
|
||||||
|
*/
|
||||||
|
public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition) {
|
||||||
|
this(in, maxTokenPosition, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Build a filter that limits the maximum position of tokens to emit.
|
||||||
|
*
|
||||||
|
* @param in the stream to wrap
|
||||||
|
* @param maxTokenPosition max position of tokens to produce (1st token always has position 1)
|
||||||
|
* @param consumeAllTokens whether all tokens from the wrapped input stream must be consumed
|
||||||
|
* even if maxTokenPosition is exceeded.
|
||||||
|
*/
|
||||||
|
public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition, boolean consumeAllTokens) {
|
||||||
|
super(in);
|
||||||
|
this.maxTokenPosition = maxTokenPosition;
|
||||||
|
this.consumeAllTokens = consumeAllTokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (exhausted) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
tokenPosition += posIncAtt.getPositionIncrement();
|
||||||
|
if (tokenPosition <= maxTokenPosition) {
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
while (consumeAllTokens && input.incrementToken()) { /* NOOP */ }
|
||||||
|
exhausted = true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
exhausted = true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reset() throws IOException {
|
||||||
|
super.reset();
|
||||||
|
tokenPosition = 0;
|
||||||
|
exhausted = false;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,56 @@
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link LimitTokenPositionFilter}.
|
||||||
|
* <pre class="prettyprint" >
|
||||||
|
* <fieldType name="text_limit_pos" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
* <analyzer>
|
||||||
|
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
* <filter class="solr.LimitTokenPositionFilterFactory" maxTokenPosition="3" consumeAllTokens="false" />
|
||||||
|
* </analyzer>
|
||||||
|
* </fieldType></pre>
|
||||||
|
* <p>
|
||||||
|
* The {@code consumeAllTokens} property is optional and defaults to {@code false}.
|
||||||
|
* See {@link LimitTokenPositionFilter} for an explanation of its use.
|
||||||
|
*/
|
||||||
|
public class LimitTokenPositionFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
|
public static final String MAX_TOKEN_POSITION_KEY = "maxTokenPosition";
|
||||||
|
public static final String CONSUME_ALL_TOKENS_KEY = "consumeAllTokens";
|
||||||
|
int maxTokenPosition;
|
||||||
|
boolean consumeAllTokens;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void init(Map<String,String> args) {
|
||||||
|
super.init(args);
|
||||||
|
maxTokenPosition = getInt(MAX_TOKEN_POSITION_KEY);
|
||||||
|
consumeAllTokens = getBoolean(CONSUME_ALL_TOKENS_KEY, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new LimitTokenPositionFilter(input, maxTokenPosition, consumeAllTokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -61,6 +61,7 @@ org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
|
||||||
org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
|
org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
|
||||||
org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
|
org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
|
||||||
org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
|
org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
|
||||||
|
org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
|
||||||
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
|
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
|
||||||
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
|
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
|
||||||
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
|
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
|
||||||
|
|
|
@ -0,0 +1,84 @@
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.synonym.SynonymFilter;
|
||||||
|
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||||
|
import org.apache.lucene.util.CharsRef;
|
||||||
|
|
||||||
|
public class TestLimitTokenPositionFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
public void testMaxPosition2() throws IOException {
|
||||||
|
for (final boolean consumeAll : new boolean[] { true, false }) {
|
||||||
|
Analyzer a = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||||
|
// if we are consuming all tokens, we can use the checks, otherwise we can't
|
||||||
|
tokenizer.setEnableChecks(consumeAll);
|
||||||
|
return new TokenStreamComponents(tokenizer, new LimitTokenPositionFilter(tokenizer, 2, consumeAll));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// dont use assertAnalyzesTo here, as the end offset is not the end of the string (unless consumeAll is true, in which case its correct)!
|
||||||
|
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")),
|
||||||
|
new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 16 : null);
|
||||||
|
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")),
|
||||||
|
new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, consumeAll ? 9 : null);
|
||||||
|
|
||||||
|
// less than the limit, ensure we behave correctly
|
||||||
|
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 ")),
|
||||||
|
new String[] { "1" }, new int[] { 0 }, new int[] { 1 }, consumeAll ? 3 : null);
|
||||||
|
|
||||||
|
// equal to limit
|
||||||
|
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 ")),
|
||||||
|
new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 6 : null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMaxPosition3WithSynomyms() throws IOException {
|
||||||
|
MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false);
|
||||||
|
tokenizer.setEnableChecks(false); // LimitTokenPositionFilter doesn't consume the entire stream that it wraps
|
||||||
|
|
||||||
|
SynonymMap.Builder builder = new SynonymMap.Builder(true);
|
||||||
|
builder.add(new CharsRef("one"), new CharsRef("first"), true);
|
||||||
|
builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
|
||||||
|
builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
|
||||||
|
CharsRef multiWordCharsRef = new CharsRef();
|
||||||
|
SynonymMap.Builder.join(new String[] { "and", "indubitably", "single", "only" }, multiWordCharsRef);
|
||||||
|
builder.add(new CharsRef("one"), multiWordCharsRef, true);
|
||||||
|
SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
|
||||||
|
builder.add(new CharsRef("two"), multiWordCharsRef, true);
|
||||||
|
SynonymMap synonymMap = builder.build();
|
||||||
|
TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
|
||||||
|
stream = new LimitTokenPositionFilter(stream, 3); // consumeAllTokens defaults to false
|
||||||
|
|
||||||
|
// "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
|
||||||
|
assertTokenStreamContents(stream,
|
||||||
|
new String[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" },
|
||||||
|
new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 });
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,84 @@
|
||||||
|
package org.apache.lucene.analysis.miscellaneous;
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||||
|
|
||||||
|
public class TestLimitTokenPositionFilterFactory extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
public void testMaxPosition1() throws IOException {
|
||||||
|
LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
|
||||||
|
Map<String, String> args = new HashMap<String, String>();
|
||||||
|
args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1");
|
||||||
|
factory.init(args);
|
||||||
|
String test = "A1 B2 C3 D4 E5 F6";
|
||||||
|
MockTokenizer tok = new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false);
|
||||||
|
// LimitTokenPositionFilter doesn't consume the entire stream that it wraps
|
||||||
|
tok.setEnableChecks(false);
|
||||||
|
TokenStream stream = factory.create(tok);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "A1" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMissingParam() {
|
||||||
|
LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
|
||||||
|
Map<String, String> args = new HashMap<String, String>();
|
||||||
|
IllegalArgumentException iae = null;
|
||||||
|
try {
|
||||||
|
factory.init(args);
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
assertTrue("exception doesn't mention param: " + e.getMessage(),
|
||||||
|
0 < e.getMessage().indexOf(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY));
|
||||||
|
iae = e;
|
||||||
|
}
|
||||||
|
assertNotNull("no exception thrown", iae);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMaxPosition1WithShingles() throws IOException {
|
||||||
|
LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
|
||||||
|
Map<String, String> args = new HashMap<String, String>();
|
||||||
|
args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1");
|
||||||
|
factory.init(args);
|
||||||
|
String input = "one two three four five";
|
||||||
|
MockTokenizer tok = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
||||||
|
// LimitTokenPositionFilter doesn't consume the entire stream that it wraps
|
||||||
|
tok.setEnableChecks(false);
|
||||||
|
ShingleFilter shingleFilter = new ShingleFilter(tok, 2, 3);
|
||||||
|
shingleFilter.setOutputUnigrams(true);
|
||||||
|
TokenStream stream = factory.create(shingleFilter);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "one", "one two", "one two three" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testConsumeAllTokens() throws IOException {
|
||||||
|
LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
|
||||||
|
Map<String, String> args = new HashMap<String, String>();
|
||||||
|
args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "3");
|
||||||
|
args.put(LimitTokenPositionFilterFactory.CONSUME_ALL_TOKENS_KEY, "true");
|
||||||
|
factory.init(args);
|
||||||
|
String test = "A1 B2 C3 D4 E5 F6";
|
||||||
|
MockTokenizer tok = new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false);
|
||||||
|
TokenStream stream = factory.create(tok);
|
||||||
|
assertTokenStreamContents(stream, new String[] { "A1", "B2", "C3" });
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue