mirror of https://github.com/apache/lucene.git
LUCENE-6423: New LimitTokenOffsetFilter to limit tokens <= a maxStartOffset
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675473 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent dd68aaf7e0
commit af6fe4d174
@@ -70,6 +70,9 @@ New Features

* LUCENE-6389: Added ScoreMode.Min that aggregates the lowest child score
  to the parent hit. (Martijn van Groningen, Adrien Grand)

* LUCENE-6423: New LimitTokenOffsetFilter that limits tokens to those before
  a configured maximum start offset. (David Smiley)

Optimizations

* LUCENE-6379: IndexWriter.deleteDocuments(Query...) now detects if
@@ -0,0 +1,82 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/**
 * Lets all tokens pass through until it sees one with a start offset greater than
 * the configured limit; that token won't pass, and it ends the stream. This can be
 * useful to limit highlighting, for example.
 * <p>
 * By default, this filter ignores any tokens in the wrapped {@code TokenStream}
 * once the limit has been exceeded, which can result in {@code reset()} being
 * called prior to {@code incrementToken()} returning {@code false}. For most
 * {@code TokenStream} implementations this should be acceptable, and faster
 * than consuming the full stream. If you are wrapping a {@code TokenStream}
 * which requires that the full stream of tokens be exhausted in order to
 * function properly, use the
 * {@link #LimitTokenOffsetFilter(TokenStream, int, boolean)} option.
 */
public final class LimitTokenOffsetFilter extends TokenFilter {

  private final OffsetAttribute offsetAttrib = addAttribute(OffsetAttribute.class);
  private int maxStartOffset;
  private final boolean consumeAllTokens;

  // some day we may limit by end offset too but no need right now

  /**
   * Lets all tokens pass through until it sees one with a start offset greater than
   * {@code maxStartOffset}; that token won't pass, and it ends the stream. It won't
   * consume any tokens afterwards.
   *
   * @param maxStartOffset the maximum start offset allowed
   */
  public LimitTokenOffsetFilter(TokenStream input, int maxStartOffset) {
    this(input, maxStartOffset, false);
  }

  public LimitTokenOffsetFilter(TokenStream input, int maxStartOffset, boolean consumeAllTokens) {
    super(input);
    if (maxStartOffset < 0) {
      throw new IllegalArgumentException("maxStartOffset must be >= zero");
    }
    this.maxStartOffset = maxStartOffset;
    this.consumeAllTokens = consumeAllTokens;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (offsetAttrib.startOffset() <= maxStartOffset) {
      return true;
    }
    if (consumeAllTokens) {
      // exhaust the wrapped stream so implementations that must see every token still work
      while (input.incrementToken()) {
        // no-op
      }
    }
    return false;
  }
}
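For context, a minimal usage sketch, not part of this commit: it wires the new filter into a custom Analyzer. The class name OffsetLimitedAnalyzer, the WhitespaceTokenizer, and the 50-character cutoff are arbitrary example choices.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;

/** Hypothetical usage sketch, not part of this commit. */
public class OffsetLimitedAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    // Pass only tokens whose start offset is <= 50; do not consume the rest of the
    // stream once the limit is exceeded (consumeAllTokens = false, the default).
    TokenStream sink = new LimitTokenOffsetFilter(source, 50, false);
    return new TokenStreamComponents(source, sink);
  }
}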
@@ -0,0 +1,58 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link LimitTokenOffsetFilter}.
 * <pre class="prettyprint">
 * <fieldType name="text_limit_offset" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *     <filter class="solr.LimitTokenOffsetFilterFactory" maxStartOffset="100000" consumeAllTokens="false" />
 *   </analyzer>
 * </fieldType></pre>
 * <p>
 * The {@code consumeAllTokens} property is optional and defaults to {@code false}.
 */
public class LimitTokenOffsetFilterFactory extends TokenFilterFactory {

  public static final String MAX_START_OFFSET = "maxStartOffset";
  public static final String CONSUME_ALL_TOKENS_KEY = "consumeAllTokens";

  private int maxStartOffset;
  private boolean consumeAllTokens;

  public LimitTokenOffsetFilterFactory(Map<String, String> args) {
    super(args);
    maxStartOffset = requireInt(args, MAX_START_OFFSET);
    consumeAllTokens = getBoolean(args, CONSUME_ALL_TOKENS_KEY, false);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new LimitTokenOffsetFilter(input, maxStartOffset, consumeAllTokens);
  }
}
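As a rough illustration (not part of this commit; the class name, method, and parameter values below are assumptions), the factory can also be resolved programmatically through its SPI short name rather than via Solr configuration:

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/** Hypothetical sketch, not part of this commit. */
public class FactoryLookupExample {
  public static TokenStream limited(TokenStream in) {
    Map<String, String> args = new HashMap<>();
    args.put(LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "100000");      // required
    args.put(LimitTokenOffsetFilterFactory.CONSUME_ALL_TOKENS_KEY, "false"); // optional
    // "LimitTokenOffset" is the short SPI name under which the factory is registered.
    TokenFilterFactory factory = TokenFilterFactory.forName("LimitTokenOffset", args);
    return factory.create(in);
  }
}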
@@ -66,6 +66,7 @@ org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
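For illustration only, a sketch of building an analysis chain through the short names made available by this SPI registration; CustomAnalyzer and the "whitespace"/"LimitTokenOffset" names are assumptions based on Lucene's usual factory naming, not something defined by this commit.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

/** Hypothetical sketch, not part of this commit. */
public class ChainExample {
  public static Analyzer build() throws IOException {
    // The SPI entry above is what lets "LimitTokenOffset" resolve to the new factory.
    return CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("LimitTokenOffset", "maxStartOffset", "100000")
        .build();
  }
}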
@@ -0,0 +1,41 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;

public class TestLimitTokenOffsetFilter extends BaseTokenStreamTestCase {

  public void test() throws Exception {
    for (final boolean consumeAll : new boolean[]{true, false}) {
      MockTokenizer tokenizer = whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6");
      // MockTokenizer's end-of-stream checks only hold when the filter consumes all tokens
      tokenizer.setEnableChecks(consumeAll);
      // note: with a limit of 3, this test would fail if the filter erroneously used endOffset instead of startOffset
      TokenStream stream = new LimitTokenOffsetFilter(tokenizer, 3, consumeAll);
      assertTokenStreamContents(stream, new String[]{"A1", "B2"});
    }
  }

  @Test(expected = IllegalArgumentException.class)
  public void testIllegalArguments() throws Exception {
    new LimitTokenOffsetFilter(whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6"), -1);
  }
}
@@ -0,0 +1,68 @@
package org.apache.lucene.analysis.miscellaneous;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

public class TestLimitTokenOffsetFilterFactory extends BaseTokenStreamFactoryTestCase {

  public void test() throws Exception {
    for (final boolean consumeAll : new boolean[]{true, false}) {
      Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      tokenizer.setReader(reader);
      tokenizer.setEnableChecks(consumeAll);
      TokenStream stream = tokenizer;
      stream = tokenFilterFactory("LimitTokenOffset",
          LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "3",
          LimitTokenOffsetFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
      ).create(stream);
      assertTokenStreamContents(stream, new String[]{"A1", "B2"});
    }
  }

  public void testRequired() throws Exception {
    // maxStartOffset is a required parameter
    try {
      tokenFilterFactory("LimitTokenOffset");
      fail();
    } catch (IllegalArgumentException e) {
      assertTrue("exception doesn't mention param: " + e.getMessage(),
          0 < e.getMessage().indexOf(LimitTokenOffsetFilterFactory.MAX_START_OFFSET));
    }
  }

  /** Test that bogus arguments result in an exception. */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("LimitTokenOffset",
          LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "3",
          "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}