mirror of https://github.com/apache/lucene.git
LUCENE-6423: New LimitTokenOffsetFilter to limit tokens <= a maxStartOffset
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675473 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent dd68aaf7e0
commit af6fe4d174
@@ -70,6 +70,9 @@ New Features

* LUCENE-6389: Added ScoreMode.Min that aggregates the lowest child score
  to the parent hit. (Martijn van Groningen, Adrien Grand)

* LUCENE-6423: New LimitTokenOffsetFilter that limits tokens to those before
  a configured maximum start offset. (David Smiley)

Optimizations

* LUCENE-6379: IndexWriter.deleteDocuments(Query...) now detects if
@@ -0,0 +1,82 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/**
 * Lets all tokens pass through until it sees one with a start offset greater than
 * the configured limit; that token won't pass, and it ends the stream. This can be
 * useful to limit highlighting, for example.
 * <p>
 * By default, this filter ignores any tokens in the wrapped {@code TokenStream}
 * once the limit has been exceeded, which can result in {@code reset()} being
 * called prior to {@code incrementToken()} returning {@code false}. For most
 * {@code TokenStream} implementations this should be acceptable, and faster
 * than consuming the full stream. If you are wrapping a {@code TokenStream}
 * which requires that the full stream of tokens be exhausted in order to
 * function properly, use the
 * {@link #LimitTokenOffsetFilter(TokenStream, int, boolean)} option.
 */
public final class LimitTokenOffsetFilter extends TokenFilter {

  private final OffsetAttribute offsetAttrib = addAttribute(OffsetAttribute.class);
  private int maxStartOffset;
  private final boolean consumeAllTokens;

  // some day we may limit by end offset too but no need right now

  /**
   * Lets all tokens pass through until it sees one with a start offset greater than
   * {@code maxStartOffset}; that token won't pass, and it ends the stream. It won't
   * consume any tokens afterwards.
   *
   * @param maxStartOffset the maximum start offset allowed
   */
  public LimitTokenOffsetFilter(TokenStream input, int maxStartOffset) {
    this(input, maxStartOffset, false);
  }

  public LimitTokenOffsetFilter(TokenStream input, int maxStartOffset, boolean consumeAllTokens) {
    super(input);
    if (maxStartOffset < 0) {
      throw new IllegalArgumentException("maxStartOffset must be >= zero");
    }
    this.maxStartOffset = maxStartOffset;
    this.consumeAllTokens = consumeAllTokens;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (offsetAttrib.startOffset() <= maxStartOffset) {
      return true;
    }
    if (consumeAllTokens) {
      // exhaust the wrapped stream so implementations that must see every token still work
      while (input.incrementToken()) {
        // no-op
      }
    }
    return false;
  }
}
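For context, a minimal usage sketch, not part of this commit: it wires the new filter into a custom Analyzer. The class name OffsetLimitedAnalyzer, the WhitespaceTokenizer, and the 50-character cutoff are arbitrary example choices.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;

/** Hypothetical usage sketch, not part of this commit. */
public class OffsetLimitedAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    // Pass only tokens whose start offset is <= 50; do not consume the rest of the
    // stream once the limit is exceeded (consumeAllTokens = false, the default).
    TokenStream sink = new LimitTokenOffsetFilter(source, 50, false);
    return new TokenStreamComponents(source, sink);
  }
}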
@@ -0,0 +1,58 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link LimitTokenOffsetFilter}.
 * <pre class="prettyprint">
 * <fieldType name="text_limit_offset" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *     <filter class="solr.LimitTokenOffsetFilterFactory" maxStartOffset="100000" consumeAllTokens="false" />
 *   </analyzer>
 * </fieldType></pre>
 * <p>
 * The {@code consumeAllTokens} property is optional and defaults to {@code false}.
 */
public class LimitTokenOffsetFilterFactory extends TokenFilterFactory {

  public static final String MAX_START_OFFSET = "maxStartOffset";
  public static final String CONSUME_ALL_TOKENS_KEY = "consumeAllTokens";

  private int maxStartOffset;
  private boolean consumeAllTokens;

  public LimitTokenOffsetFilterFactory(Map<String, String> args) {
    super(args);
    maxStartOffset = requireInt(args, MAX_START_OFFSET);
    consumeAllTokens = getBoolean(args, CONSUME_ALL_TOKENS_KEY, false);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new LimitTokenOffsetFilter(input, maxStartOffset, consumeAllTokens);
  }
}
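As a rough illustration (not part of this commit; the class name, method, and parameter values below are assumptions), the factory can also be resolved programmatically through its SPI short name rather than via Solr configuration:

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/** Hypothetical sketch, not part of this commit. */
public class FactoryLookupExample {
  public static TokenStream limited(TokenStream in) {
    Map<String, String> args = new HashMap<>();
    args.put(LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "100000");      // required
    args.put(LimitTokenOffsetFilterFactory.CONSUME_ALL_TOKENS_KEY, "false"); // optional
    // "LimitTokenOffset" is the short SPI name under which the factory is registered.
    TokenFilterFactory factory = TokenFilterFactory.forName("LimitTokenOffset", args);
    return factory.create(in);
  }
}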
@@ -66,6 +66,7 @@ org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
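For illustration only, a sketch of building an analysis chain through the short names made available by this SPI registration; CustomAnalyzer and the "whitespace"/"LimitTokenOffset" names are assumptions based on Lucene's usual factory naming, not something defined by this commit.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

/** Hypothetical sketch, not part of this commit. */
public class ChainExample {
  public static Analyzer build() throws IOException {
    // The SPI entry above is what lets "LimitTokenOffset" resolve to the new factory.
    return CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("LimitTokenOffset", "maxStartOffset", "100000")
        .build();
  }
}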
@@ -0,0 +1,41 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;

public class TestLimitTokenOffsetFilter extends BaseTokenStreamTestCase {

  public void test() throws Exception {
    for (final boolean consumeAll : new boolean[]{true, false}) {
      MockTokenizer tokenizer = whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6");
      // MockTokenizer's end-of-stream checks only hold when the filter consumes all tokens
      tokenizer.setEnableChecks(consumeAll);
      // note: with a limit of 3, this test would fail if the filter erroneously used endOffset instead of startOffset
      TokenStream stream = new LimitTokenOffsetFilter(tokenizer, 3, consumeAll);
      assertTokenStreamContents(stream, new String[]{"A1", "B2"});
    }
  }

  @Test(expected = IllegalArgumentException.class)
  public void testIllegalArguments() throws Exception {
    new LimitTokenOffsetFilter(whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6"), -1);
  }
}
@@ -0,0 +1,68 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

public class TestLimitTokenOffsetFilterFactory extends BaseTokenStreamFactoryTestCase {

  public void test() throws Exception {
    for (final boolean consumeAll : new boolean[]{true, false}) {
      Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      tokenizer.setReader(reader);
      tokenizer.setEnableChecks(consumeAll);
      TokenStream stream = tokenizer;
      stream = tokenFilterFactory("LimitTokenOffset",
          LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "3",
          LimitTokenOffsetFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
      ).create(stream);
      assertTokenStreamContents(stream, new String[]{"A1", "B2"});
    }
  }

  public void testRequired() throws Exception {
    // maxStartOffset is a required parameter
    try {
      tokenFilterFactory("LimitTokenOffset");
      fail();
    } catch (IllegalArgumentException e) {
      assertTrue("exception doesn't mention param: " + e.getMessage(),
          0 < e.getMessage().indexOf(LimitTokenOffsetFilterFactory.MAX_START_OFFSET));
    }
  }

  /** Test that bogus arguments result in an exception. */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("LimitTokenOffset",
          LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "3",
          "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}