mirror of https://github.com/apache/lucene.git
LUCENE-6423: New LimitTokenOffsetFilter to limit tokens <= a maxStartOffset
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675473 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
dd68aaf7e0
commit
af6fe4d174
|
@ -70,6 +70,9 @@ New Features
|
|||
* LUCENE-6389: Added ScoreMode.Min that aggregates the lowest child score
|
||||
to the parent hit. (Martijn van Groningen, Adrien Grand)
|
||||
|
||||
* LUCENE-6423: New LimitTokenOffsetFilter that limits tokens to those before
|
||||
a configured maximum start offset. (David Smiley)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-6379: IndexWriter.deleteDocuments(Query...) now detects if
|
||||
|
|
|
@ -0,0 +1,82 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
|
||||
/**
|
||||
* Lets all tokens pass through until it sees one with a start offset <= a
|
||||
* configured limit, which won't pass and ends the stream. This can be useful to
|
||||
* limit highlighting, for example.
|
||||
* <p>
|
||||
* By default, this filter ignores any tokens in the wrapped {@code TokenStream}
|
||||
* once the limit has been exceeded, which can result in {@code reset()} being
|
||||
* called prior to {@code incrementToken()} returning {@code false}. For most
|
||||
* {@code TokenStream} implementations this should be acceptable, and faster
|
||||
* then consuming the full stream. If you are wrapping a {@code TokenStream}
|
||||
* which requires that the full stream of tokens be exhausted in order to
|
||||
* function properly, use the
|
||||
* {@link #LimitTokenOffsetFilter(TokenStream, int, boolean)} option.
|
||||
*/
|
||||
public final class LimitTokenOffsetFilter extends TokenFilter {
|
||||
|
||||
private final OffsetAttribute offsetAttrib = addAttribute(OffsetAttribute.class);
|
||||
private int maxStartOffset;
|
||||
private final boolean consumeAllTokens;
|
||||
|
||||
// some day we may limit by end offset too but no need right now
|
||||
|
||||
/**
|
||||
* Lets all tokens pass through until it sees one with a start offset <= {@code maxStartOffset}
|
||||
* which won't pass and ends the stream. It won't consume any tokens afterwards.
|
||||
*
|
||||
* @param maxStartOffset the maximum start offset allowed
|
||||
*/
|
||||
public LimitTokenOffsetFilter(TokenStream input, int maxStartOffset) {
|
||||
this(input, maxStartOffset, false);
|
||||
}
|
||||
|
||||
public LimitTokenOffsetFilter(TokenStream input, int maxStartOffset, boolean consumeAllTokens) {
|
||||
super(input);
|
||||
if (maxStartOffset < 0) {
|
||||
throw new IllegalArgumentException("maxStartOffset must be >= zero");
|
||||
}
|
||||
this.maxStartOffset = maxStartOffset;
|
||||
this.consumeAllTokens = consumeAllTokens;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (!input.incrementToken()) {
|
||||
return false;
|
||||
}
|
||||
if (offsetAttrib.startOffset() <= maxStartOffset) {
|
||||
return true;
|
||||
}
|
||||
if (consumeAllTokens) {
|
||||
while (input.incrementToken()) {
|
||||
// no-op
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link LimitTokenOffsetFilter}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_limit_pos" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.LimitTokenOffsetFilter" maxStartOffset="100000" consumeAllTokens="false" />
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
* <p>
|
||||
* The {@code consumeAllTokens} property is optional and defaults to {@code false}.
|
||||
*/
|
||||
public class LimitTokenOffsetFilterFactory extends TokenFilterFactory {
|
||||
|
||||
public static final String MAX_START_OFFSET = "maxStartOffset";
|
||||
public static final String CONSUME_ALL_TOKENS_KEY = "consumeAllTokens";
|
||||
|
||||
private int maxStartOffset;
|
||||
private boolean consumeAllTokens;
|
||||
|
||||
public LimitTokenOffsetFilterFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
maxStartOffset = requireInt(args, MAX_START_OFFSET);
|
||||
consumeAllTokens = getBoolean(args, CONSUME_ALL_TOKENS_KEY, false);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new LimitTokenOffsetFilter(input, maxStartOffset, consumeAllTokens);
|
||||
}
|
||||
}
|
|
@ -66,6 +66,7 @@ org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
|
|||
org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
|
||||
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestLimitTokenOffsetFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void test() throws Exception {
|
||||
for (final boolean consumeAll : new boolean[]{true, false}) {
|
||||
MockTokenizer tokenizer = whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6");
|
||||
tokenizer.setEnableChecks(consumeAll);
|
||||
//note with '3', this test would fail if erroneously the filter used endOffset instead
|
||||
TokenStream stream = new LimitTokenOffsetFilter(tokenizer, 3, consumeAll);
|
||||
assertTokenStreamContents(stream, new String[]{"A1", "B2"});
|
||||
}
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testIllegalArguments() throws Exception {
|
||||
new LimitTokenOffsetFilter(whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6"), -1);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,68 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
|
||||
|
||||
public class TestLimitTokenOffsetFilterFactory extends BaseTokenStreamFactoryTestCase {
|
||||
|
||||
public void test() throws Exception {
|
||||
for (final boolean consumeAll : new boolean[]{true, false}) {
|
||||
Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
|
||||
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||
tokenizer.setReader(reader);
|
||||
tokenizer.setEnableChecks(consumeAll);
|
||||
TokenStream stream = tokenizer;
|
||||
stream = tokenFilterFactory("LimitTokenOffset",
|
||||
LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "3",
|
||||
LimitTokenOffsetFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
|
||||
).create(stream);
|
||||
assertTokenStreamContents(stream, new String[]{"A1", "B2"});
|
||||
}
|
||||
}
|
||||
|
||||
public void testRequired() throws Exception {
|
||||
// param is required
|
||||
try {
|
||||
tokenFilterFactory("LimitTokenOffset");
|
||||
fail();
|
||||
} catch (IllegalArgumentException e) {
|
||||
assertTrue("exception doesn't mention param: " + e.getMessage(),
|
||||
0 < e.getMessage().indexOf(LimitTokenOffsetFilterFactory.MAX_START_OFFSET));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that bogus arguments result in exception
|
||||
*/
|
||||
public void testBogusArguments() throws Exception {
|
||||
try {
|
||||
tokenFilterFactory("LimitTokenOffset",
|
||||
LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "3",
|
||||
"bogusArg", "bogusValue");
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue