LUCENE-6423: New LimitTokenOffsetFilter to limit tokens <= a maxStartOffset

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675473 13f79535-47bb-0310-9956-ffa450edef68
David Wayne Smiley 2015-04-22 19:38:51 +00:00
parent dd68aaf7e0
commit af6fe4d174
6 changed files with 253 additions and 0 deletions


@@ -70,6 +70,9 @@ New Features
* LUCENE-6389: Added ScoreMode.Min that aggregates the lowest child score
  to the parent hit. (Martijn van Groningen, Adrien Grand)

* LUCENE-6423: New LimitTokenOffsetFilter that limits tokens to those before
  a configured maximum start offset. (David Smiley)

Optimizations
* LUCENE-6379: IndexWriter.deleteDocuments(Query...) now detects if


@@ -0,0 +1,82 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/**
 * Lets all tokens pass through until it sees one with a start offset greater than
 * a configured limit, which won't pass and ends the stream. This can be useful to
 * limit highlighting, for example.
 * <p>
 * By default, this filter ignores any tokens in the wrapped {@code TokenStream}
 * once the limit has been exceeded, which can result in {@code reset()} being
 * called prior to {@code incrementToken()} returning {@code false}. For most
 * {@code TokenStream} implementations this should be acceptable, and faster
 * than consuming the full stream. If you are wrapping a {@code TokenStream}
 * which requires that the full stream of tokens be exhausted in order to
 * function properly, use the
 * {@link #LimitTokenOffsetFilter(TokenStream, int, boolean)} option.
 */
public final class LimitTokenOffsetFilter extends TokenFilter {

  private final OffsetAttribute offsetAttrib = addAttribute(OffsetAttribute.class);
  private int maxStartOffset;
  private final boolean consumeAllTokens;
  // some day we may limit by end offset too but no need right now

  /**
   * Lets all tokens pass through until it sees one with a start offset greater than
   * {@code maxStartOffset}, which won't pass and ends the stream. It won't consume
   * any tokens afterwards.
   *
   * @param maxStartOffset the maximum start offset allowed
   */
  public LimitTokenOffsetFilter(TokenStream input, int maxStartOffset) {
    this(input, maxStartOffset, false);
  }

  public LimitTokenOffsetFilter(TokenStream input, int maxStartOffset, boolean consumeAllTokens) {
    super(input);
    if (maxStartOffset < 0) {
      throw new IllegalArgumentException("maxStartOffset must be >= zero");
    }
    this.maxStartOffset = maxStartOffset;
    this.consumeAllTokens = consumeAllTokens;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (offsetAttrib.startOffset() <= maxStartOffset) {
      return true;
    }
    if (consumeAllTokens) {
      while (input.incrementToken()) {
        // no-op: exhaust the wrapped stream for implementations that require full consumption
      }
    }
    return false;
  }
}
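For orientation, here is a minimal usage sketch (not part of this commit; the tokenizer choice, sample text, and class name are illustrative assumptions). Start offsets in "A1 B2 C3 D4" are 0, 3, 6, 9, so with maxStartOffset=3 only "A1" and "B2" pass:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical demo class, not from this commit.
public class LimitTokenOffsetDemo {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("A1 B2 C3 D4"));
    TokenStream stream = new LimitTokenOffsetFilter(tokenizer, 3);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString()); // prints A1, then B2; C3 (start offset 6) ends the stream
    }
    stream.end();
    stream.close();
  }
}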


@@ -0,0 +1,58 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link LimitTokenOffsetFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_limit_offset" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 *     &lt;filter class="solr.LimitTokenOffsetFilterFactory" maxStartOffset="100000" consumeAllTokens="false" /&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 * <p>
 * The {@code consumeAllTokens} property is optional and defaults to {@code false}.
 */
public class LimitTokenOffsetFilterFactory extends TokenFilterFactory {

  public static final String MAX_START_OFFSET = "maxStartOffset";
  public static final String CONSUME_ALL_TOKENS_KEY = "consumeAllTokens";

  private int maxStartOffset;
  private boolean consumeAllTokens;

  public LimitTokenOffsetFilterFactory(Map<String, String> args) {
    super(args);
    maxStartOffset = requireInt(args, MAX_START_OFFSET);
    consumeAllTokens = getBoolean(args, CONSUME_ALL_TOKENS_KEY, false);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new LimitTokenOffsetFilter(input, maxStartOffset, consumeAllTokens);
  }
}
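A hedged sketch of driving the factory from plain Java rather than a Solr schema (the helper below is illustrative, not part of this commit). Note the args map must be mutable, because the factory removes each entry it reads before checking that the map is empty:

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilterFactory;

// Hypothetical helper, not from this commit.
public class LimitTokenOffsetFactoryDemo {
  static TokenStream limitTo(TokenStream in, int maxStartOffset) {
    Map<String, String> args = new HashMap<>(); // mutable: the factory consumes entries as it reads them
    args.put(LimitTokenOffsetFilterFactory.MAX_START_OFFSET, Integer.toString(maxStartOffset));
    args.put(LimitTokenOffsetFilterFactory.CONSUME_ALL_TOKENS_KEY, "false"); // optional, defaults to false
    return new LimitTokenOffsetFilterFactory(args).create(in);
  }
}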


@@ -66,6 +66,7 @@ org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory
org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
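This META-INF/services registration is what lets the new factory be looked up by its short SPI name: the class name minus the FilterFactory suffix, matched case-insensitively (hence "LimitTokenOffset" in the tests below). A small lookup sketch (illustrative, not part of this commit):

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.util.TokenFilterFactory;

// Hypothetical demo, not from this commit.
public class SpiLookupDemo {
  public static void main(String[] args) {
    Map<String, String> params = new HashMap<>();
    params.put("maxStartOffset", "3");
    // resolves LimitTokenOffsetFilterFactory via the services entry above
    TokenFilterFactory factory = TokenFilterFactory.forName("limitTokenOffset", params);
    System.out.println(factory.getClass().getName());
  }
}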


@@ -0,0 +1,41 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;

public class TestLimitTokenOffsetFilter extends BaseTokenStreamTestCase {

  public void test() throws Exception {
    for (final boolean consumeAll : new boolean[]{true, false}) {
      MockTokenizer tokenizer = whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6");
      tokenizer.setEnableChecks(consumeAll);
      // note: with '3', this test would fail if the filter erroneously used endOffset instead
      TokenStream stream = new LimitTokenOffsetFilter(tokenizer, 3, consumeAll);
      assertTokenStreamContents(stream, new String[]{"A1", "B2"});
    }
  }

  @Test(expected = IllegalArgumentException.class)
  public void testIllegalArguments() throws Exception {
    new LimitTokenOffsetFilter(whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6"), -1);
  }
}


@@ -0,0 +1,68 @@
package org.apache.lucene.analysis.miscellaneous;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

public class TestLimitTokenOffsetFilterFactory extends BaseTokenStreamFactoryTestCase {

  public void test() throws Exception {
    for (final boolean consumeAll : new boolean[]{true, false}) {
      Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      tokenizer.setReader(reader);
      tokenizer.setEnableChecks(consumeAll);
      TokenStream stream = tokenizer;
      stream = tokenFilterFactory("LimitTokenOffset",
          LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "3",
          LimitTokenOffsetFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
      ).create(stream);
      assertTokenStreamContents(stream, new String[]{"A1", "B2"});
    }
  }

  public void testRequired() throws Exception {
    // param is required
    try {
      tokenFilterFactory("LimitTokenOffset");
      fail();
    } catch (IllegalArgumentException e) {
      assertTrue("exception doesn't mention param: " + e.getMessage(),
          0 < e.getMessage().indexOf(LimitTokenOffsetFilterFactory.MAX_START_OFFSET));
    }
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("LimitTokenOffset",
          LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "3",
          "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}