LUCENE-5908: Fix Lucene43NGramTokenizer to be final

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1620759 13f79535-47bb-0310-9956-ffa450edef68
2014-08-26 23:17:49 +00:00 · 2014-08-26 23:17:49 +00:00 · c5dd7783a3
parent 561fddd8a7
commit c5dd7783a3
5 changed files with 221 additions and 8 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -293,6 +293,8 @@ Bug Fixes
  in much, much faster tokenization for these text sequences.  
  (Chris Geeringh, Robert Muir, Steve Rowe)

+* LUCENE-5908: Fix Lucene43NGramTokenizer to be final
+  
 Test Framework

 * LUCENE-5786: Unflushed/ truncated events file (hung testing subprocess).
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43EdgeNGramTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43EdgeNGramTokenizer.java
@ -17,18 +17,105 @@ package org.apache.lucene.analysis.ngram;
 * limitations under the License.
 */

+import java.io.IOException;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.Version;

 /**
- * Tokenizes the input from an edge into n-grams of given size(s), using pre-4.4 behavior.
- *
- * @deprecated Use {@link org.apache.lucene.analysis.ngram.EdgeNGramTokenizer}.
+ * Old version of {@link EdgeNGramTokenizer} which doesn't correctly handle
+ * supplementary characters.
 */
@Deprecated
-public class Lucene43EdgeNGramTokenizer extends Lucene43NGramTokenizer {
+public final class Lucene43EdgeNGramTokenizer extends Tokenizer {
+  public static final Side DEFAULT_SIDE = Side.FRONT;
  public static final int DEFAULT_MAX_GRAM_SIZE = 1;
  public static final int DEFAULT_MIN_GRAM_SIZE = 1;

+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+
+  /** Specifies which side of the input the n-gram should be generated from */
+  public static enum Side {
+
+    /** Get the n-gram from the front of the input */
+    FRONT {
+      @Override
+      public String getLabel() { return "front"; }
+    },
+
+    /** Get the n-gram from the end of the input */
+    BACK  {
+      @Override
+      public String getLabel() { return "back"; }
+    };
+
+    public abstract String getLabel();
+
+    // Get the appropriate Side from a string, or null if the label is not recognized
+    public static Side getSide(String sideName) {
+      if (FRONT.getLabel().equals(sideName)) {
+        return FRONT;
+      }
+      if (BACK.getLabel().equals(sideName)) {
+        return BACK;
+      }
+      return null;
+    }
+  }
+
+  private int minGram;
+  private int maxGram;
+  private int gramSize;
+  private Side side;
+  private boolean started;
+  private int inLen; // length of the input AFTER trim()
+  private int charsRead; // length of the input
+  private String inStr;
+
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param side the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public Lucene43EdgeNGramTokenizer(Side side, int minGram, int maxGram) {
+    init(side, minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param factory {@link org.apache.lucene.util.AttributeFactory} to use
+   * @param side the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public Lucene43EdgeNGramTokenizer(AttributeFactory factory, Side side, int minGram, int maxGram) {
+    super(factory);
+    init(side, minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param factory {@link org.apache.lucene.util.AttributeFactory} to use
+   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public Lucene43EdgeNGramTokenizer(AttributeFactory factory, String sideLabel, int minGram, int maxGram) {
+    this(factory, Side.getSide(sideLabel), minGram, maxGram);
+  }
+
  /**
   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
   *
@ -36,7 +123,19 @@ public class Lucene43EdgeNGramTokenizer extends Lucene43NGramTokenizer {
   * @param maxGram the largest n-gram to generate
   */
  public Lucene43EdgeNGramTokenizer(int minGram, int maxGram) {
-    super(minGram, maxGram);
+    this(Side.FRONT, minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  @Deprecated
+  public Lucene43EdgeNGramTokenizer(String sideLabel, int minGram, int maxGram) {
+    this(Side.getSide(sideLabel), minGram, maxGram);
  }

  /**
@ -47,7 +146,110 @@ public class Lucene43EdgeNGramTokenizer extends Lucene43NGramTokenizer {
   * @param maxGram the largest n-gram to generate
   */
  public Lucene43EdgeNGramTokenizer(AttributeFactory factory, int minGram, int maxGram) {
-    super(factory, minGram, maxGram);
+    this(factory, Side.FRONT, minGram, maxGram);
  }

+  private void init(Side side, int minGram, int maxGram) {
+
+    if (side == null) {
+      throw new IllegalArgumentException("sideLabel must be either front or back");
+    }
+
+    if (minGram < 1) {
+      throw new IllegalArgumentException("minGram must be greater than zero");
+    }
+
+    if (minGram > maxGram) {
+      throw new IllegalArgumentException("minGram must not be greater than maxGram");
+    }
+
+    maxGram = Math.min(maxGram, 1024);
+
+    this.minGram = minGram;
+    this.maxGram = maxGram;
+    this.side = side;
+  }
+
+  /** Advances to the next n-gram token; returns false once the stream is exhausted. */
+  @Override
+  public boolean incrementToken() throws IOException {
+    clearAttributes();
+    // if we are just starting, read the whole input
+    if (!started) {
+      started = true;
+      gramSize = minGram;
+      final int limit = side == Side.FRONT ? maxGram : 1024;
+      char[] chars = new char[Math.min(1024, limit)];
+      charsRead = 0;
+      // TODO: refactor to a shared readFully somewhere:
+      boolean exhausted = false;
+      while (charsRead < limit) {
+        final int inc = input.read(chars, charsRead, chars.length-charsRead);
+        if (inc == -1) {
+          exhausted = true;
+          break;
+        }
+        charsRead += inc;
+        if (charsRead == chars.length && charsRead < limit) {
+          chars = ArrayUtil.grow(chars);
+        }
+      }
+
+      inStr = new String(chars, 0, charsRead);
+      inStr = inStr.trim();
+
+      if (!exhausted) {
+        // Read extra throwaway chars so that on end() we
+        // report the correct offset:
+        char[] throwaway = new char[1024];
+        while(true) {
+          final int inc = input.read(throwaway, 0, throwaway.length);
+          if (inc == -1) {
+            break;
+          }
+          charsRead += inc;
+        }
+      }
+
+      inLen = inStr.length();
+      if (inLen == 0) {
+        return false;
+      }
+      posIncrAtt.setPositionIncrement(1);
+    } else {
+      posIncrAtt.setPositionIncrement(0);
+    }
+
+    // if the remaining input is too short, we can't generate any n-grams
+    if (gramSize > inLen) {
+      return false;
+    }
+
+    // if we have hit the end of our n-gram size range, quit
+    if (gramSize > maxGram || gramSize > inLen) {
+      return false;
+    }
+
+    // grab gramSize chars from front or back
+    int start = side == Side.FRONT ? 0 : inLen - gramSize;
+    int end = start + gramSize;
+    termAtt.setEmpty().append(inStr, start, end);
+    offsetAtt.setOffset(correctOffset(start), correctOffset(end));
+    gramSize++;
+    return true;
+  }
+  
+  @Override
+  public void end() throws IOException {
+    super.end();
+    // set final offset
+    final int finalOffset = correctOffset(charsRead);
+    this.offsetAtt.setOffset(finalOffset, finalOffset);
+  }    
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    started = false;
+  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java
@ -18,7 +18,7 @@ package org.apache.lucene.analysis.ngram;
 */

 import java.io.IOException;
-import java.io.Reader;
+

 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@ -29,7 +29,7 @@ import org.apache.lucene.util.AttributeFactory;
 * Old broken version of {@link NGramTokenizer}.
 */
@Deprecated
-public class Lucene43NGramTokenizer extends Tokenizer {
+public final class Lucene43NGramTokenizer extends Tokenizer {
  public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
  public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
@ -235,4 +235,8 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
    assertFalse(tk.incrementToken());
  }

+  public void test43Tokenizer() {
+    new Lucene43EdgeNGramTokenizer(1, 1);
+  }
+
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
@ -246,4 +246,9 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
    testNGrams(minGram, maxGram, s, "abcdef");
  }

+  public void test43Tokenizer() {
+    // TODO: do more than instantiate (ie check the old broken behavior)
+    new Lucene43NGramTokenizer(1, 1);
+  }
+
 }