mirror of https://github.com/apache/lucene.git
LUCENE-4843: Add LimitTokenPositionFilter: don't emit tokens with positions that exceed the configured limit
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457572 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4ba86e84ae
commit
42dd280c97
|
@ -86,6 +86,9 @@ New Features
|
|||
per-IndexReader-open to compute its ordinal map, but it requires no
|
||||
taxonomy index and it tie-breaks facet labels in an understandable
|
||||
(by Unicode sort order) way. (Robert Muir, Mike McCandless)
|
||||
|
||||
* LUCENE-4843: Add LimitTokenPositionFilter: don't emit tokens with
|
||||
positions that exceed the configured limit. (Steve Rowe)
|
||||
|
||||
Optimizations
|
||||
|
||||
|
|
|
@ -0,0 +1,100 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* This TokenFilter limits its emitted tokens to those with positions that
|
||||
* are not greater than the configured limit.
|
||||
* <p>
|
||||
* By default, this filter ignores any tokens in the wrapped {@code TokenStream}
|
||||
* once the limit has been exceeded, which can result in {@code reset()} being
|
||||
* called prior to {@code incrementToken()} returning {@code false}. For most
|
||||
* {@code TokenStream} implementations this should be acceptable, and faster
|
||||
* then consuming the full stream. If you are wrapping a {@code TokenStream}
|
||||
* which requires that the full stream of tokens be exhausted in order to
|
||||
* function properly, use the
|
||||
* {@link #LimitTokenPositionFilter(TokenStream,int,boolean) consumeAllTokens}
|
||||
* option.
|
||||
*/
|
||||
public final class LimitTokenPositionFilter extends TokenFilter {
|
||||
|
||||
private final int maxTokenPosition;
|
||||
private final boolean consumeAllTokens;
|
||||
private int tokenPosition = 0;
|
||||
private boolean exhausted = false;
|
||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
/**
|
||||
* Build a filter that only accepts tokens up to and including the given maximum position.
|
||||
* This filter will not consume any tokens with position greater than the maxTokenPosition limit.
|
||||
|
||||
* @param in the stream to wrap
|
||||
* @param maxTokenPosition max position of tokens to produce (1st token always has position 1)
|
||||
*
|
||||
* @see #LimitTokenPositionFilter(TokenStream,int,boolean)
|
||||
*/
|
||||
public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition) {
|
||||
this(in, maxTokenPosition, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Build a filter that limits the maximum position of tokens to emit.
|
||||
*
|
||||
* @param in the stream to wrap
|
||||
* @param maxTokenPosition max position of tokens to produce (1st token always has position 1)
|
||||
* @param consumeAllTokens whether all tokens from the wrapped input stream must be consumed
|
||||
* even if maxTokenPosition is exceeded.
|
||||
*/
|
||||
public LimitTokenPositionFilter(TokenStream in, int maxTokenPosition, boolean consumeAllTokens) {
|
||||
super(in);
|
||||
this.maxTokenPosition = maxTokenPosition;
|
||||
this.consumeAllTokens = consumeAllTokens;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (exhausted) {
|
||||
return false;
|
||||
}
|
||||
if (input.incrementToken()) {
|
||||
tokenPosition += posIncAtt.getPositionIncrement();
|
||||
if (tokenPosition <= maxTokenPosition) {
|
||||
return true;
|
||||
} else {
|
||||
while (consumeAllTokens && input.incrementToken()) { /* NOOP */ }
|
||||
exhausted = true;
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
exhausted = true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
tokenPosition = 0;
|
||||
exhausted = false;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link LimitTokenPositionFilter}.
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_limit_pos" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.LimitTokenPositionFilterFactory" maxTokenPosition="3" consumeAllTokens="false" />
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
* <p>
|
||||
* The {@code consumeAllTokens} property is optional and defaults to {@code false}.
|
||||
* See {@link LimitTokenPositionFilter} for an explanation of its use.
|
||||
*/
|
||||
public class LimitTokenPositionFilterFactory extends TokenFilterFactory {
|
||||
|
||||
public static final String MAX_TOKEN_POSITION_KEY = "maxTokenPosition";
|
||||
public static final String CONSUME_ALL_TOKENS_KEY = "consumeAllTokens";
|
||||
int maxTokenPosition;
|
||||
boolean consumeAllTokens;
|
||||
|
||||
@Override
|
||||
public void init(Map<String,String> args) {
|
||||
super.init(args);
|
||||
maxTokenPosition = getInt(MAX_TOKEN_POSITION_KEY);
|
||||
consumeAllTokens = getBoolean(CONSUME_ALL_TOKENS_KEY, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new LimitTokenPositionFilter(input, maxTokenPosition, consumeAllTokens);
|
||||
}
|
||||
|
||||
}
|
|
@ -61,6 +61,7 @@ org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
|
|||
org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
|
||||
|
|
|
@ -0,0 +1,84 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.synonym.SynonymFilter;
|
||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
|
||||
public class TestLimitTokenPositionFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testMaxPosition2() throws IOException {
|
||||
for (final boolean consumeAll : new boolean[] { true, false }) {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
// if we are consuming all tokens, we can use the checks, otherwise we can't
|
||||
tokenizer.setEnableChecks(consumeAll);
|
||||
return new TokenStreamComponents(tokenizer, new LimitTokenPositionFilter(tokenizer, 2, consumeAll));
|
||||
}
|
||||
};
|
||||
|
||||
// dont use assertAnalyzesTo here, as the end offset is not the end of the string (unless consumeAll is true, in which case its correct)!
|
||||
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")),
|
||||
new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 16 : null);
|
||||
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")),
|
||||
new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, consumeAll ? 9 : null);
|
||||
|
||||
// less than the limit, ensure we behave correctly
|
||||
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 ")),
|
||||
new String[] { "1" }, new int[] { 0 }, new int[] { 1 }, consumeAll ? 3 : null);
|
||||
|
||||
// equal to limit
|
||||
assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 ")),
|
||||
new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, consumeAll ? 6 : null);
|
||||
}
|
||||
}
|
||||
|
||||
public void testMaxPosition3WithSynomyms() throws IOException {
|
||||
MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false);
|
||||
tokenizer.setEnableChecks(false); // LimitTokenPositionFilter doesn't consume the entire stream that it wraps
|
||||
|
||||
SynonymMap.Builder builder = new SynonymMap.Builder(true);
|
||||
builder.add(new CharsRef("one"), new CharsRef("first"), true);
|
||||
builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
|
||||
builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
|
||||
CharsRef multiWordCharsRef = new CharsRef();
|
||||
SynonymMap.Builder.join(new String[] { "and", "indubitably", "single", "only" }, multiWordCharsRef);
|
||||
builder.add(new CharsRef("one"), multiWordCharsRef, true);
|
||||
SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
|
||||
builder.add(new CharsRef("two"), multiWordCharsRef, true);
|
||||
SynonymMap synonymMap = builder.build();
|
||||
TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
|
||||
stream = new LimitTokenPositionFilter(stream, 3); // consumeAllTokens defaults to false
|
||||
|
||||
// "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" },
|
||||
new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 });
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||
|
||||
public class TestLimitTokenPositionFilterFactory extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testMaxPosition1() throws IOException {
|
||||
LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
|
||||
Map<String, String> args = new HashMap<String, String>();
|
||||
args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1");
|
||||
factory.init(args);
|
||||
String test = "A1 B2 C3 D4 E5 F6";
|
||||
MockTokenizer tok = new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false);
|
||||
// LimitTokenPositionFilter doesn't consume the entire stream that it wraps
|
||||
tok.setEnableChecks(false);
|
||||
TokenStream stream = factory.create(tok);
|
||||
assertTokenStreamContents(stream, new String[] { "A1" });
|
||||
}
|
||||
|
||||
public void testMissingParam() {
|
||||
LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
|
||||
Map<String, String> args = new HashMap<String, String>();
|
||||
IllegalArgumentException iae = null;
|
||||
try {
|
||||
factory.init(args);
|
||||
} catch (IllegalArgumentException e) {
|
||||
assertTrue("exception doesn't mention param: " + e.getMessage(),
|
||||
0 < e.getMessage().indexOf(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY));
|
||||
iae = e;
|
||||
}
|
||||
assertNotNull("no exception thrown", iae);
|
||||
}
|
||||
|
||||
public void testMaxPosition1WithShingles() throws IOException {
|
||||
LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
|
||||
Map<String, String> args = new HashMap<String, String>();
|
||||
args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1");
|
||||
factory.init(args);
|
||||
String input = "one two three four five";
|
||||
MockTokenizer tok = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
|
||||
// LimitTokenPositionFilter doesn't consume the entire stream that it wraps
|
||||
tok.setEnableChecks(false);
|
||||
ShingleFilter shingleFilter = new ShingleFilter(tok, 2, 3);
|
||||
shingleFilter.setOutputUnigrams(true);
|
||||
TokenStream stream = factory.create(shingleFilter);
|
||||
assertTokenStreamContents(stream, new String[] { "one", "one two", "one two three" });
|
||||
}
|
||||
|
||||
public void testConsumeAllTokens() throws IOException {
|
||||
LimitTokenPositionFilterFactory factory = new LimitTokenPositionFilterFactory();
|
||||
Map<String, String> args = new HashMap<String, String>();
|
||||
args.put(LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "3");
|
||||
args.put(LimitTokenPositionFilterFactory.CONSUME_ALL_TOKENS_KEY, "true");
|
||||
factory.init(args);
|
||||
String test = "A1 B2 C3 D4 E5 F6";
|
||||
MockTokenizer tok = new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false);
|
||||
TokenStream stream = factory.create(tok);
|
||||
assertTokenStreamContents(stream, new String[] { "A1", "B2", "C3" });
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue