LUCENE-3905: sometimes run real-ish content (from LineFileDocs) through the analyzers too; fix end() offset bugs in the ngram tokenizers/filters

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1304525 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-03-23 17:39:13 +00:00
parent eca9908cbe
commit 7291d38535
5 changed files with 55 additions and 23 deletions
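The end() bug fixed below: both ngram tokenizers read at most one internal buffer's worth of input, so charsRead, and therefore the final offset that end() reports, topped out at the buffer size for longer inputs. The fix keeps reading into a throwaway buffer until EOF so charsRead reflects the true input length. A minimal standalone sketch of that pattern, which the diffs inline (note the commit's own TODO about a shared readFully); DrainSketch and readAndDrain are names invented here, not part of the commit:

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    class DrainSketch {
      // Fill `buffer` from `input`, then consume and discard the rest of the
      // stream so the caller still learns the true total character count:
      static int readAndDrain(Reader input, char[] buffer) throws IOException {
        int charsRead = 0;
        while (charsRead < buffer.length) {
          final int inc = input.read(buffer, charsRead, buffer.length - charsRead);
          if (inc == -1) {
            return charsRead; // input fit entirely in the buffer
          }
          charsRead += inc;
        }
        // Buffer full: drain the remainder so the end-of-stream offset is correct.
        final char[] throwaway = new char[1024];
        while (true) {
          final int inc = input.read(throwaway, 0, throwaway.length);
          if (inc == -1) {
            break;
          }
          charsRead += inc;
        }
        return charsRead;
      }

      public static void main(String[] args) throws IOException {
        final String text = "0123456789abcdef"; // 16 chars, buffer holds 8
        System.out.println(readAndDrain(new StringReader(text), new char[8])); // 16
      }
    }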

BaseTokenStreamTestCase.java

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LineFileDocs;
 import org.apache.lucene.util._TestUtil;
 
 /**
@@ -359,8 +360,18 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   }
 
   private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
+    final LineFileDocs docs = new LineFileDocs(random);
+
     for (int i = 0; i < iterations; i++) {
       String text;
+
+      if (random.nextInt(10) == 7) {
+        text = docs.nextDoc().get("body");
+        if (text.length() > maxWordLength) {
+          text = text.substring(0, maxWordLength);
+        }
+      } else {
        if (simple) {
          text = random.nextBoolean() ? _TestUtil.randomSimpleString(random, maxWordLength) : _TestUtil.randomHtmlishString(random, maxWordLength);
        } else {
@@ -378,6 +389,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
          text = _TestUtil.randomUnicodeString(random, maxWordLength);
        }
       }
+      }
 
       if (VERBOSE) {
         System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);

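With the harness change above, roughly one iteration in ten runs near-natural text from LineFileDocs through the analyzer instead of a purely synthetic random string; it was this real-ish data that exposed the end() offset bugs fixed in the next two files. Tests drive it through the public checkRandomData overloads, as the test diffs at the bottom show. A usage sketch (MockAnalyzer is just a stand-in analyzer from the test framework):

    // inside a test extending BaseTokenStreamTestCase:
    Analyzer a = new MockAnalyzer(random);
    checkRandomData(random, a, 10000 * RANDOM_MULTIPLIER);     // default word length
    checkRandomData(random, a, 200 * RANDOM_MULTIPLIER, 8192); // long texts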
EdgeNGramTokenizer.java

@@ -73,7 +73,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
   private int maxGram;
   private int gramSize;
   private Side side;
-  private boolean started = false;
+  private boolean started;
   private int inLen; // length of the input AFTER trim()
   private int charsRead; // length of the input
   private String inStr;
@@ -178,7 +178,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
 
   /** Returns the next token in the stream, or null at EOS. */
   @Override
-  public final boolean incrementToken() throws IOException {
+  public boolean incrementToken() throws IOException {
     clearAttributes();
     // if we are just starting, read the whole input
     if (!started) {
@@ -188,13 +188,28 @@ public final class EdgeNGramTokenizer extends Tokenizer {
       charsRead = 0;
       // TODO: refactor to a shared readFully somewhere:
       while (charsRead < chars.length) {
-        int inc = input.read(chars, charsRead, chars.length-charsRead);
+        final int inc = input.read(chars, charsRead, chars.length-charsRead);
         if (inc == -1) {
           break;
         }
         charsRead += inc;
       }
+
       inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
+
+      if (charsRead == chars.length) {
+        // Read extra throwaway chars so that on end() we
+        // report the correct offset:
+        char[] throwaway = new char[1024];
+        while(true) {
+          final int inc = input.read(throwaway, 0, throwaway.length);
+          if (inc == -1) {
+            break;
+          }
+          charsRead += inc;
+        }
+      }
+
       inLen = inStr.length();
       if (inLen == 0) {
         return false;
@@ -221,21 +236,15 @@ public final class EdgeNGramTokenizer extends Tokenizer {
   }
 
   @Override
-  public final void end() {
+  public void end() {
     // set final offset
     final int finalOffset = correctOffset(charsRead);
     this.offsetAtt.setOffset(finalOffset, finalOffset);
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
-  }
-
-  @Override
   public void reset() throws IOException {
     super.reset();
     started = false;
+    charsRead = 0;
   }
 }

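What the change above buys, sketched as a check; the input length of 8192 is an assumption, chosen only to exceed the tokenizer's internal buffer:

    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 8192; i++) {
      sb.append('a');
    }
    final String longText = sb.toString();

    EdgeNGramTokenizer t = new EdgeNGramTokenizer(
        new StringReader(longText), EdgeNGramTokenizer.Side.FRONT, 1, 3);
    while (t.incrementToken()) {
      // "a", "aa", "aaa"
    }
    t.end();
    // Before this commit charsRead stopped at the buffer size; now end()
    // reports the full input length as the final offset:
    assert t.getAttribute(OffsetAttribute.class).endOffset() == longText.length();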
NGramTokenizer.java

@@ -34,11 +34,11 @@ public final class NGramTokenizer extends Tokenizer {
   private int minGram, maxGram;
   private int gramSize;
-  private int pos = 0;
+  private int pos;
   private int inLen; // length of the input AFTER trim()
   private int charsRead; // length of the input
   private String inStr;
-  private boolean started = false;
+  private boolean started;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -99,7 +99,7 @@ public final class NGramTokenizer extends Tokenizer {
 
   /** Returns the next token in the stream, or null at EOS. */
   @Override
-  public final boolean incrementToken() throws IOException {
+  public boolean incrementToken() throws IOException {
     clearAttributes();
     if (!started) {
       started = true;
@@ -115,6 +115,20 @@ public final class NGramTokenizer extends Tokenizer {
         charsRead += inc;
       }
       inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
+
+      if (charsRead == chars.length) {
+        // Read extra throwaway chars so that on end() we
+        // report the correct offset:
+        char[] throwaway = new char[1024];
+        while(true) {
+          final int inc = input.read(throwaway, 0, throwaway.length);
+          if (inc == -1) {
+            break;
+          }
+          charsRead += inc;
+        }
+      }
+
       inLen = inStr.length();
       if (inLen == 0) {
         return false;
@@ -138,22 +152,16 @@ public final class NGramTokenizer extends Tokenizer {
   }
 
   @Override
-  public final void end() {
+  public void end() {
     // set final offset
     final int finalOffset = correctOffset(charsRead);
     this.offsetAtt.setOffset(finalOffset, finalOffset);
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
-  }
-
-  @Override
   public void reset() throws IOException {
     super.reset();
     started = false;
     pos = 0;
+    charsRead = 0;
   }
 }

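NGramTokenizer gets the same drain fix, plus reset() now restores every piece of per-stream state (started, pos, charsRead), so a reused tokenizer cannot leak a stale offset from a previous input. A sketch of the reuse sequence this protects, assuming the reset(Reader)-then-reset() protocol of this era:

    NGramTokenizer t = new NGramTokenizer(new StringReader("abcd"), 1, 2);
    while (t.incrementToken()) { } // a, b, c, d, ab, bc, cd
    t.end();                       // final offset 4

    t.reset(new StringReader("xy")); // inherited Tokenizer.reset(Reader): swap input
    t.reset();                       // restores started/pos/charsRead
    while (t.incrementToken()) { }   // x, y, xy
    t.end();                         // final offset 2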
EdgeNGramTokenizerTest.java

@@ -110,6 +110,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
       }
     };
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
 
     Analyzer b = new Analyzer() {
       @Override
@@ -119,5 +120,6 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
       }
     };
     checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192);
   }
 }

NGramTokenizerTest.java

@@ -99,5 +99,6 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
       }
     };
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
   }
 }
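In both test files the new call mirrors the existing one but with 200*RANDOM_MULTIPLIER iterations and a maxWordLength of 8192: fewer iterations to keep runtime bounded, and a length presumably chosen to overflow the tokenizers' internal read buffer so that the new throwaway-read path in incrementToken() is actually exercised.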