LUCENE-6423: New LimitTokenOffsetFilter to limit tokens <= a maxStartOffset

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675473 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
David Wayne Smiley 2015-04-22 19:38:51 +00:00
parent dd68aaf7e0
commit af6fe4d174
6 changed files with 253 additions and 0 deletions

View File

@ -70,6 +70,9 @@ New Features
* LUCENE-6389: Added ScoreMode.Min that aggregates the lowest child score
to the parent hit. (Martijn van Groningen, Adrien Grand)
* LUCENE-6423: New LimitTokenOffsetFilter that limits tokens to those starting
at or before a configured maximum start offset. (David Smiley)
Optimizations
* LUCENE-6379: IndexWriter.deleteDocuments(Query...) now detects if

View File

@ -0,0 +1,82 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
 * Lets tokens pass through as long as their start offset is &lt;= a configured
 * limit. The first token whose start offset is greater than the limit is not
 * emitted and ends the stream. This can be useful to limit highlighting, for
 * example.
 * <p>
 * By default, this filter ignores any tokens in the wrapped {@code TokenStream}
 * once the limit has been exceeded, which can result in {@code reset()} being
 * called prior to {@code incrementToken()} returning {@code false}. For most
 * {@code TokenStream} implementations this should be acceptable, and faster
 * than consuming the full stream. If you are wrapping a {@code TokenStream}
 * which requires that the full stream of tokens be exhausted in order to
 * function properly, use the
 * {@link #LimitTokenOffsetFilter(TokenStream, int, boolean)} option.
 */
public final class LimitTokenOffsetFilter extends TokenFilter {

  private final OffsetAttribute offsetAttrib = addAttribute(OffsetAttribute.class);

  /** Largest start offset (inclusive) a token may have and still be emitted. */
  private final int maxStartOffset;

  /** Whether the wrapped stream is drained after the limit has been exceeded. */
  private final boolean consumeAllTokens;

  // some day we may limit by end offset too but no need right now

  /**
   * Lets tokens pass through while their start offset is &lt;= {@code maxStartOffset};
   * the first token beyond that ends the stream. It won't consume any tokens afterwards.
   *
   * @param input the stream to wrap
   * @param maxStartOffset the maximum start offset allowed (inclusive)
   * @throws IllegalArgumentException if {@code maxStartOffset} is negative
   */
  public LimitTokenOffsetFilter(TokenStream input, int maxStartOffset) {
    this(input, maxStartOffset, false);
  }

  /**
   * Lets tokens pass through while their start offset is &lt;= {@code maxStartOffset};
   * the first token beyond that ends the stream.
   *
   * @param input the stream to wrap
   * @param maxStartOffset the maximum start offset allowed (inclusive)
   * @param consumeAllTokens if {@code true}, the remaining tokens of {@code input} are
   *        read and discarded once the limit has been exceeded; if {@code false},
   *        {@code input} is left partially consumed
   * @throws IllegalArgumentException if {@code maxStartOffset} is negative
   */
  public LimitTokenOffsetFilter(TokenStream input, int maxStartOffset, boolean consumeAllTokens) {
    super(input);
    if (maxStartOffset < 0) {
      throw new IllegalArgumentException("maxStartOffset must be >= zero");
    }
    this.maxStartOffset = maxStartOffset;
    this.consumeAllTokens = consumeAllTokens;
  }

  /**
   * Advances the wrapped stream by one token and reports whether that token is
   * within the configured start-offset limit.
   *
   * @return {@code true} if a token with start offset &lt;= the limit is available;
   *         {@code false} if the wrapped stream is exhausted or the limit was exceeded
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (offsetAttrib.startOffset() <= maxStartOffset) {
      return true;
    }
    // Limit exceeded: optionally drain the wrapped stream for implementations
    // that need to be read to the end, then signal end-of-stream.
    if (consumeAllTokens) {
      while (input.incrementToken()) {
        // no-op
      }
    }
    return false;
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
 * Factory for {@link LimitTokenOffsetFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_limit_offset" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 *     &lt;filter class="solr.LimitTokenOffsetFilterFactory" maxStartOffset="100000" consumeAllTokens="false" /&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 * <p>
 * The {@code maxStartOffset} property is required.
 * The {@code consumeAllTokens} property is optional and defaults to {@code false}.
 */
public class LimitTokenOffsetFilterFactory extends TokenFilterFactory {

  /** Name of the required argument holding the maximum start offset. */
  public static final String MAX_START_OFFSET = "maxStartOffset";

  /** Name of the optional boolean argument controlling whether the input is drained. */
  public static final String CONSUME_ALL_TOKENS_KEY = "consumeAllTokens";

  private final int maxStartOffset;
  private final boolean consumeAllTokens;

  /**
   * Creates a factory from the supplied configuration arguments.
   *
   * @param args configuration arguments; recognised entries are read here and any
   *        entry still left in the map afterwards is rejected
   * @throws IllegalArgumentException if unrecognised arguments remain in {@code args}
   */
  public LimitTokenOffsetFilterFactory(Map<String, String> args) {
    super(args);
    maxStartOffset = requireInt(args, MAX_START_OFFSET);
    consumeAllTokens = getBoolean(args, CONSUME_ALL_TOKENS_KEY, false);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /**
   * Wraps {@code input} in a {@link LimitTokenOffsetFilter} configured with this
   * factory's settings.
   */
  @Override
  public TokenStream create(TokenStream input) {
    return new LimitTokenOffsetFilter(input, maxStartOffset, consumeAllTokens);
  }
}

View File

@ -66,6 +66,7 @@ org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory

View File

@ -0,0 +1,41 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;
/** Tests for {@link LimitTokenOffsetFilter}. */
public class TestLimitTokenOffsetFilter extends BaseTokenStreamTestCase {

  /** Runs the limit check once with the input being drained and once without. */
  public void test() throws Exception {
    checkLimitAtOffsetThree(true);
    checkLimitAtOffsetThree(false);
  }

  /**
   * Filters six whitespace-separated tokens with a maximum start offset of 3 and
   * expects only the first two to be emitted.
   */
  private void checkLimitAtOffsetThree(boolean consumeAll) throws Exception {
    MockTokenizer source = whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6");
    // The tokenizer's checks are only enabled when the filter drains its input.
    source.setEnableChecks(consumeAll);
    // A limit of 3 is deliberate: the expectation below would not hold if the
    // filter mistakenly compared endOffset instead of startOffset.
    TokenStream limited = new LimitTokenOffsetFilter(source, 3, consumeAll);
    String[] expectedTerms = {"A1", "B2"};
    assertTokenStreamContents(limited, expectedTerms);
  }

  /** A negative maximum start offset must be rejected by the constructor. */
  @Test(expected = IllegalArgumentException.class)
  public void testIllegalArguments() throws Exception {
    MockTokenizer source = whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6");
    new LimitTokenOffsetFilter(source, -1);
  }
}

View File

@ -0,0 +1,68 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
/** Tests for {@link LimitTokenOffsetFilterFactory}. */
public class TestLimitTokenOffsetFilterFactory extends BaseTokenStreamFactoryTestCase {

  /**
   * Builds the filter through the factory with a maximum start offset of 3 and
   * checks that only the first two tokens are emitted, both with and without
   * draining the input.
   */
  public void test() throws Exception {
    for (final boolean consumeAll : new boolean[]{true, false}) {
      Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      tokenizer.setReader(reader);
      // Checks are only enabled when the filter drains its input; presumably the
      // tokenizer would otherwise object to being abandoned early -- TODO confirm.
      tokenizer.setEnableChecks(consumeAll);
      TokenStream stream = tokenizer;
      stream = tokenFilterFactory("LimitTokenOffset",
          LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "3",
          LimitTokenOffsetFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
      ).create(stream);
      assertTokenStreamContents(stream, new String[]{"A1", "B2"});
    }
  }

  /** Omitting the maximum start offset must fail with a message naming the parameter. */
  public void testRequired() throws Exception {
    // param is required
    try {
      tokenFilterFactory("LimitTokenOffset");
      fail();
    } catch (IllegalArgumentException e) {
      // Use contains() so a message that begins with the parameter name (index 0)
      // is accepted as well.
      assertTrue("exception doesn't mention param: " + e.getMessage(),
          e.getMessage().contains(LimitTokenOffsetFilterFactory.MAX_START_OFFSET));
    }
  }

  /**
   * Test that bogus arguments result in exception
   */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("LimitTokenOffset",
          LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "3",
          "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}