From 7291d38535791c84aeafdf29c103d05982937587 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Fri, 23 Mar 2012 17:39:13 +0000 Subject: [PATCH] LUCENE-3905: sometimes run real-ish content (from LineFileDocs) through the analyzers too; fix end() offset bugs in the ngram tokenizers/filters git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1304525 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/BaseTokenStreamTestCase.java | 18 ++++++++++-- .../analysis/ngram/EdgeNGramTokenizer.java | 29 ++++++++++++------- .../lucene/analysis/ngram/NGramTokenizer.java | 28 +++++++++++------- .../ngram/EdgeNGramTokenizerTest.java | 2 ++ .../analysis/ngram/NGramTokenizerTest.java | 1 + 5 files changed, 55 insertions(+), 23 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index b1960d7bc05..9b4e013cb86 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.*; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.LineFileDocs; import org.apache.lucene.util._TestUtil; /** @@ -359,12 +360,22 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { } private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException { + + final LineFileDocs docs = new LineFileDocs(random); + for (int i = 0; i < iterations; i++) { String text; - if (simple) { - text = random.nextBoolean() ? _TestUtil.randomSimpleString(random, maxWordLength) : _TestUtil.randomHtmlishString(random, maxWordLength); + + if (random.nextInt(10) == 7) { + text = docs.nextDoc().get("body"); + if (text.length() > maxWordLength) { + text = text.substring(0, maxWordLength); + } } else { - switch(_TestUtil.nextInt(random, 0, 4)) { + if (simple) { + text = random.nextBoolean() ? _TestUtil.randomSimpleString(random, maxWordLength) : _TestUtil.randomHtmlishString(random, maxWordLength); + } else { + switch(_TestUtil.nextInt(random, 0, 4)) { case 0: text = _TestUtil.randomSimpleString(random, maxWordLength); break; @@ -376,6 +387,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { break; default: text = _TestUtil.randomUnicodeString(random, maxWordLength); + } } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java index cfbf4b58295..f647a1778c5 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java @@ -73,7 +73,7 @@ public final class EdgeNGramTokenizer extends Tokenizer { private int maxGram; private int gramSize; private Side side; - private boolean started = false; + private boolean started; private int inLen; // length of the input AFTER trim() private int charsRead; // length of the input private String inStr; @@ -178,7 +178,7 @@ public final class EdgeNGramTokenizer extends Tokenizer { /** Returns the next token in the stream, or null at EOS. */ @Override - public final boolean incrementToken() throws IOException { + public boolean incrementToken() throws IOException { clearAttributes(); // if we are just starting, read the whole input if (!started) { @@ -188,13 +188,28 @@ public final class EdgeNGramTokenizer extends Tokenizer { charsRead = 0; // TODO: refactor to a shared readFully somewhere: while (charsRead < chars.length) { - int inc = input.read(chars, charsRead, chars.length-charsRead); + final int inc = input.read(chars, charsRead, chars.length-charsRead); if (inc == -1) { break; } charsRead += inc; } + inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings + + if (charsRead == chars.length) { + // Read extra throwaway chars so that on end() we + // report the correct offset: + char[] throwaway = new char[1024]; + while(true) { + final int inc = input.read(throwaway, 0, throwaway.length); + if (inc == -1) { + break; + } + charsRead += inc; + } + } + inLen = inStr.length(); if (inLen == 0) { return false; @@ -221,21 +236,15 @@ public final class EdgeNGramTokenizer extends Tokenizer { } @Override - public final void end() { + public void end() { // set final offset final int finalOffset = correctOffset(charsRead); this.offsetAtt.setOffset(finalOffset, finalOffset); } - @Override - public void reset(Reader input) throws IOException { - super.reset(input); - } - @Override public void reset() throws IOException { super.reset(); started = false; - charsRead = 0; } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java index d8595cca501..574eeec31bb 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java @@ -34,11 +34,11 @@ public final class NGramTokenizer extends Tokenizer { private int minGram, maxGram; private int gramSize; - private int pos = 0; + private int pos; private int inLen; // length of the input AFTER trim() private int charsRead; // length of the input private String inStr; - private boolean started = false; + private boolean started; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); @@ -99,7 +99,7 @@ public final class NGramTokenizer extends Tokenizer { /** Returns the next token in the stream, or null at EOS. */ @Override - public final boolean incrementToken() throws IOException { + public boolean incrementToken() throws IOException { clearAttributes(); if (!started) { started = true; @@ -115,6 +115,20 @@ public final class NGramTokenizer extends Tokenizer { charsRead += inc; } inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings + + if (charsRead == chars.length) { + // Read extra throwaway chars so that on end() we + // report the correct offset: + char[] throwaway = new char[1024]; + while(true) { + final int inc = input.read(throwaway, 0, throwaway.length); + if (inc == -1) { + break; + } + charsRead += inc; + } + } + inLen = inStr.length(); if (inLen == 0) { return false; @@ -138,22 +152,16 @@ public final class NGramTokenizer extends Tokenizer { } @Override - public final void end() { + public void end() { // set final offset final int finalOffset = correctOffset(charsRead); this.offsetAtt.setOffset(finalOffset, finalOffset); } - @Override - public void reset(Reader input) throws IOException { - super.reset(input); - } - @Override public void reset() throws IOException { super.reset(); started = false; pos = 0; - charsRead = 0; } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java index 5d4976f0566..90611a1f2ec 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java @@ -110,6 +110,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase { } }; checkRandomData(random, a, 10000*RANDOM_MULTIPLIER); + checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192); Analyzer b = new Analyzer() { @Override @@ -119,5 +120,6 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase { } }; checkRandomData(random, b, 10000*RANDOM_MULTIPLIER); + checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192); } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java index 49e00a878f7..9dd3c65723f 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java @@ -99,5 +99,6 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { } }; checkRandomData(random, a, 10000*RANDOM_MULTIPLIER); + checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192); } }