LUCENE-1253: LengthFilter (and Solr's KeepWordTokenFilter) now require up front specification of enablePositionIncrement. Together with StopFilter they have a common base class (FilteringTokenFilter) that handles the position increments automatically. Implementors only need to override an accept() method that filters tokens

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1065343 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2011-01-30 18:30:34 +00:00
parent 277dfa0e88
commit e7088279f7
10 changed files with 186 additions and 101 deletions
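The new contract is small: subclass FilteringTokenFilter, pass the enablePositionIncrements flag up to its constructor, and implement accept(). A minimal sketch against the API introduced in this commit follows; the class OddLengthFilter and its keep-odd-lengths rule are made up for illustration, not part of the commit:

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;

/** Keeps only tokens with an odd number of chars (illustrative only). */
public final class OddLengthFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public OddLengthFilter(boolean enablePositionIncrements, TokenStream in) {
    super(enablePositionIncrements, in); // base class handles position increments
  }

  @Override
  protected boolean accept() throws IOException {
    return (termAtt.length() & 1) == 1; // keep the token, or drop it silently
  }
}

The base class, shown in full later in this diff, pulls tokens from the input until accept() returns true and, when enabled, folds the increments of the skipped tokens into the emitted token.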

CHANGES.txt

@@ -643,6 +643,12 @@ API Changes
   deletes remain buffered so that the next time you open an NRT reader
   and pass true, all deletes will be a applied.  (Mike McCandless)
 
+* LUCENE-1253: LengthFilter (and Solr's KeepWordTokenFilter) now
+  require up front specification of enablePositionIncrement. Together with
+  StopFilter they have a common base class (FilteringTokenFilter) that handles
+  the position increments automatically. Implementors only need to override an
+  accept() method that filters tokens.  (Uwe Schindler, Robert Muir)
+
 Bug fixes
 
 * LUCENE-2249: ParallelMultiSearcher should shut down thread pool on

StopFilter.java

@@ -22,10 +22,9 @@ import java.util.Arrays;
 import java.util.List;
 import java.util.Set;
 
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.util.Version;

@@ -42,14 +41,10 @@ import org.apache.lucene.util.Version;
  * increments are preserved
  * </ul>
  */
-public final class StopFilter extends TokenFilter {
+public final class StopFilter extends FilteringTokenFilter {
 
   private final CharArraySet stopWords;
-  private boolean enablePositionIncrements = true;
-
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
   /**
    * Construct a token stream filtering the given input. If

@@ -75,7 +70,7 @@ public final class StopFilter extends TokenFilter {
    */
   public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
   {
-    super(input);
+    super(true, input);
     this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet) stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);
   }

@@ -157,48 +152,8 @@ public final class StopFilter extends TokenFilter {
    * Returns the next input Token whose term() is not a stop word.
    */
   @Override
-  public final boolean incrementToken() throws IOException {
-    // return the first non-stop word found
-    int skippedPositions = 0;
-    while (input.incrementToken()) {
-      if (!stopWords.contains(termAtt.buffer(), 0, termAtt.length())) {
-        if (enablePositionIncrements) {
-          posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
-        }
-        return true;
-      }
-      skippedPositions += posIncrAtt.getPositionIncrement();
-    }
-    // reached EOS -- return false
-    return false;
+  protected boolean accept() throws IOException {
+    return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
   }
-
-  /**
-   * @see #setEnablePositionIncrements(boolean)
-   */
-  public boolean getEnablePositionIncrements() {
-    return enablePositionIncrements;
-  }
-
-  /**
-   * If <code>true</code>, this StopFilter will preserve
-   * positions of the incoming tokens (ie, accumulate and
-   * set position increments of the removed stop tokens).
-   * Generally, <code>true</code> is best as it does not
-   * lose information (positions of the original tokens)
-   * during indexing.
-   *
-   * Default is true.
-   *
-   * <p> When set, when a token is stopped
-   * (omitted), the position increment of the following
-   * token is incremented.
-   *
-   * <p> <b>NOTE</b>: be sure to also
-   * set {@link QueryParser#setEnablePositionIncrements} if
-   * you use QueryParser to create queries.
-   */
-  public void setEnablePositionIncrements(boolean enable) {
-    this.enablePositionIncrements = enable;
-  }
 }
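Note that StopFilter now hardwires super(true, input), so position increments are preserved by default, while the setter inherited from FilteringTokenFilter can still disable them. A usage fragment, where the Version constant, StopAnalyzer's bundled stop set, and the import paths are assumptions about this trunk rather than part of the diff:

// Sketch only: Version.LUCENE_40 and StopAnalyzer.ENGLISH_STOP_WORDS_SET
// are assumed to exist on this trunk; adjust to your version.
TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_40,
    new StringReader("fox and the hound"));
StopFilter stops = new StopFilter(Version.LUCENE_40, ts,
    StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
stops.setEnablePositionIncrements(false); // inherited from FilteringTokenFilter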

KeepWordFilter.java

@@ -21,6 +21,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;

@@ -30,22 +31,19 @@ import org.apache.lucene.analysis.util.CharArraySet;
  *
  * @since solr 1.3
  */
-public final class KeepWordFilter extends TokenFilter {
+public final class KeepWordFilter extends FilteringTokenFilter {
   private final CharArraySet words;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
   /** The words set passed to this constructor will be directly used by this filter
    * and should not be modified, */
-  public KeepWordFilter(TokenStream in, CharArraySet words) {
-    super(in);
+  public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
+    super(enablePositionIncrements, in);
     this.words = words;
   }
 
   @Override
-  public boolean incrementToken() throws IOException {
-    while (input.incrementToken()) {
-      if (words.contains(termAtt.buffer(), 0, termAtt.length())) return true;
-    }
-    return false;
+  public boolean accept() throws IOException {
+    return words.contains(termAtt.buffer(), 0, termAtt.length());
   }
 }

LengthFilter.java

@@ -21,6 +21,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

@@ -29,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  * Note: Length is calculated as the number of UTF-16 code units.
  * </p>
  */
-public final class LengthFilter extends TokenFilter {
+public final class LengthFilter extends FilteringTokenFilter {
 
   private final int min;
   private final int max;

@@ -40,27 +41,15 @@ public final class LengthFilter extends TokenFilter {
    * Build a filter that removes words that are too long or too
    * short from the text.
    */
-  public LengthFilter(TokenStream in, int min, int max)
-  {
-    super(in);
+  public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
+    super(enablePositionIncrements, in);
     this.min = min;
     this.max = max;
   }
 
-  /**
-   * Returns the next input Token whose term() is the right len
-   */
   @Override
-  public final boolean incrementToken() throws IOException {
-    // return the first non-stop word found
-    while (input.incrementToken()) {
-      int len = termAtt.length();
-      if (len >= min && len <= max) {
-        return true;
-      }
-      // note: else we ignore it but should we index each part of it?
-    }
-    // reached EOS -- return false
-    return false;
+  public boolean accept() throws IOException {
+    final int len = termAtt.length();
+    return (len >= min && len <= max);
   }
 }

FilteringTokenFilter.java (new file)

@@ -0,0 +1,96 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.queryParser.QueryParser; // for javadoc
+
+/**
+ * Abstract base class for TokenFilters that may remove tokens.
+ * You have to implement {@link #accept} and return a boolean if the current
+ * token should be preserved. {@link #incrementToken} uses this method
+ * to decide if a token should be passed to the caller.
+ */
+public abstract class FilteringTokenFilter extends TokenFilter {
+
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
+
+  public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
+    super(input);
+    this.enablePositionIncrements = enablePositionIncrements;
+  }
+
+  /** Override this method and return if the current input token should be returned by {@link #incrementToken}. */
+  protected abstract boolean accept() throws IOException;
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (enablePositionIncrements) {
+      int skippedPositions = 0;
+      while (input.incrementToken()) {
+        if (accept()) {
+          if (skippedPositions != 0) {
+            posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+          }
+          return true;
+        }
+        skippedPositions += posIncrAtt.getPositionIncrement();
+      }
+    } else {
+      while (input.incrementToken()) {
+        if (accept()) {
+          return true;
+        }
+      }
+    }
+    // reached EOS -- return false
+    return false;
+  }
+
+  /**
+   * @see #setEnablePositionIncrements(boolean)
+   */
+  public boolean getEnablePositionIncrements() {
+    return enablePositionIncrements;
+  }
+
+  /**
+   * If <code>true</code>, this TokenFilter will preserve
+   * positions of the incoming tokens (ie, accumulate and
+   * set position increments of the removed tokens).
+   * Generally, <code>true</code> is best as it does not
+   * lose information (positions of the original tokens)
+   * during indexing.
+   *
+   * <p> When set, when a token is stopped
+   * (omitted), the position increment of the following
+   * token is incremented.
+   *
+   * <p> <b>NOTE</b>: be sure to also
+   * set {@link QueryParser#setEnablePositionIncrements} if
+   * you use QueryParser to create queries.
+   */
+  public void setEnablePositionIncrements(boolean enable) {
+    this.enablePositionIncrements = enable;
+  }
+}
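To see what the accumulation buys you, here is a quick trace in the style of the updated tests below; it reuses only APIs from this commit, with an input string chosen for illustration:

TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
    new StringReader("aa b cc d ee"));
// min=max=2 drops the single-character tokens "b" and "d"
ts = new LengthFilter(true, ts, 2, 2);
// "cc" and "ee" each follow one removed token, so their increments grow to 2;
// with enablePositionIncrements=false all three would report 1
assertTokenStreamContents(ts, new String[] { "aa", "cc", "ee" }, new int[] { 1, 2, 2 });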

TestKeepWordFilter.java

@@ -35,16 +35,26 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
     words.add( "aaa" );
     words.add( "bbb" );
 
-    String input = "aaa BBB ccc ddd EEE";
+    String input = "xxx yyy aaa zzz BBB ccc ddd EEE";
 
     // Test Stopwords
     TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
-    stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
-    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
+    stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
 
     // Now force case
     stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
-    stream = new KeepWordFilter(stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
-    assertTokenStreamContents(stream, new String[] { "aaa" });
+    stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
+    assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
+
+    // Test Stopwords
+    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
+
+    // Now force case
+    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
+    assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
   }
 }

TestLengthFilter.java

@@ -24,19 +24,24 @@ import java.io.StringReader;
 
 public class TestLengthFilter extends BaseTokenStreamTestCase {
 
-  public void testFilter() throws Exception {
+  public void testFilterNoPosIncr() throws Exception {
     TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
         new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
-    LengthFilter filter = new LengthFilter(stream, 2, 6);
-    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+    LengthFilter filter = new LengthFilter(false, stream, 2, 6);
+    assertTokenStreamContents(filter,
+      new String[]{"short", "ab", "foo"},
+      new int[]{1, 1, 1}
+    );
+  }
 
-    assertTrue(filter.incrementToken());
-    assertEquals("short", termAtt.toString());
-    assertTrue(filter.incrementToken());
-    assertEquals("ab", termAtt.toString());
-    assertTrue(filter.incrementToken());
-    assertEquals("foo", termAtt.toString());
-    assertFalse(filter.incrementToken());
+  public void testFilterWithPosIncr() throws Exception {
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
+    LengthFilter filter = new LengthFilter(true, stream, 2, 6);
+    assertTokenStreamContents(filter,
+      new String[]{"short", "ab", "foo"},
+      new int[]{1, 4, 2}
+    );
   }
 }
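The expected increments in testFilterWithPosIncr fall directly out of the accumulation rule; a worked trace:

// input:  short toolong evenmuchlongertext a ab toolong foo   (min=2, max=6)
// "short" accepted, nothing skipped before it -> increment 1
// "ab"    accepted after 3 rejected tokens    -> increment 1 + 3 = 4
// "foo"   accepted after 1 rejected token     -> increment 1 + 1 = 2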

KeepWordFilterFactory.java

@@ -23,21 +23,26 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 
+import java.util.Map;
 import java.util.Set;
 import java.io.IOException;
 
 /**
  * @version $Id$
+ * @since solr 1.3
  */
 public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
 
-  private CharArraySet words;
-  private boolean ignoreCase;
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    assureMatchVersion();
+  }
 
   public void inform(ResourceLoader loader) {
     String wordFiles = args.get("words");
     ignoreCase = getBoolean("ignoreCase", false);
+    enablePositionIncrements = getBoolean("enablePositionIncrements",false);
     if (wordFiles != null) {
       try {
         words = getWordSet(loader, wordFiles, ignoreCase);

@@ -47,6 +52,10 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
     }
   }
 
+  private CharArraySet words;
+  private boolean ignoreCase;
+  private boolean enablePositionIncrements;
+
   /**
    * Set the keep word list.
    * NOTE: if ignoreCase==true, the words are expected to be lowercase
 
@@ -62,15 +71,19 @@ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
     this.ignoreCase = ignoreCase;
   }
 
-  public KeepWordFilter create(TokenStream input) {
-    return new KeepWordFilter(input, words);
+  public boolean isEnablePositionIncrements() {
+    return enablePositionIncrements;
+  }
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
   }
 
   public CharArraySet getWords() {
     return words;
   }
 
-  public boolean isIgnoreCase() {
-    return ignoreCase;
+  public KeepWordFilter create(TokenStream input) {
+    return new KeepWordFilter(enablePositionIncrements, input, words);
   }
 }
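This factory has no accompanying test in this commit, so here is a hedged fragment of its lifecycle. The "luceneMatchVersion" arg key, the word-file name, and the loader and tokenizer variables are all assumptions about the surrounding Solr plumbing, which normally builds the args map from the <filter .../> attributes in schema.xml:

// Sketch only; loader (a ResourceLoader) and tokenizer (a TokenStream)
// come from the enclosing Solr context.
Map<String, String> args = new HashMap<String, String>();
args.put("luceneMatchVersion", "LUCENE_40");  // assumed key, required by assureMatchVersion()
args.put("words", "keepwords.txt");           // illustrative file name
args.put("ignoreCase", "true");
args.put("enablePositionIncrements", "true"); // omitted -> false (back-compat)

KeepWordFilterFactory factory = new KeepWordFilterFactory();
factory.init(args);     // triggers assureMatchVersion() per the diff above
factory.inform(loader); // resolves the word file via the ResourceLoader
TokenStream filtered = factory.create(tokenizer);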

LengthFilterFactory.java

@@ -27,6 +27,7 @@ import java.util.Map;
  */
 public class LengthFilterFactory extends BaseTokenFilterFactory {
   int min,max;
+  boolean enablePositionIncrements;
   public static final String MIN_KEY = "min";
   public static final String MAX_KEY = "max";

@@ -35,8 +36,10 @@ public class LengthFilterFactory extends BaseTokenFilterFactory {
     super.init(args);
     min=Integer.parseInt(args.get(MIN_KEY));
     max=Integer.parseInt(args.get(MAX_KEY));
+    enablePositionIncrements = getBoolean("enablePositionIncrements",false);
   }
 
   public LengthFilter create(TokenStream input) {
-    return new LengthFilter(input,min,max);
+    return new LengthFilter(enablePositionIncrements, input,min,max);
   }
 }

LengthFilterTest.java

@@ -31,9 +31,19 @@ public class LengthFilterTest extends BaseTokenTestCase {
     Map<String, String> args = new HashMap<String, String>();
     args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
     args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
+    // default: args.put("enablePositionIncrements", "false");
     factory.init(args);
     String test = "foo foobar super-duper-trooper";
     TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
-    assertTokenStreamContents(stream, new String[] { "foobar" });
+    assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 });
+
+    factory = new LengthFilterFactory();
+    args = new HashMap<String, String>();
+    args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
+    args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
+    args.put("enablePositionIncrements", "true");
+    factory.init(args);
+    stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(test)));
+    assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 });
   }
 }