LUCENE-3905: sometimes run real-ish content (from LineFileDocs) through the analyzers too; fix end() offset bugs in the ngram tokenizers/filters

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1304525 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-03-23 17:39:13 +00:00
parent eca9908cbe
commit 7291d38535
5 changed files with 55 additions and 23 deletions
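The end() bug fixed below: both ngram tokenizers read at most one internal buffer's worth of input, so charsRead, and therefore the final offset that end() reports, topped out at the buffer size for longer inputs. The fix keeps reading into a throwaway buffer until EOF so charsRead reflects the true input length. A minimal standalone sketch of that pattern, which the diffs inline (note the commit's own TODO about a shared readFully); DrainSketch and readAndDrain are names invented here, not part of the commit:

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    class DrainSketch {
      // Fill `buffer` from `input`, then consume and discard the rest of the
      // stream so the caller still learns the true total character count:
      static int readAndDrain(Reader input, char[] buffer) throws IOException {
        int charsRead = 0;
        while (charsRead < buffer.length) {
          final int inc = input.read(buffer, charsRead, buffer.length - charsRead);
          if (inc == -1) {
            return charsRead; // input fit entirely in the buffer
          }
          charsRead += inc;
        }
        // Buffer full: drain the remainder so the end-of-stream offset is correct.
        final char[] throwaway = new char[1024];
        while (true) {
          final int inc = input.read(throwaway, 0, throwaway.length);
          if (inc == -1) {
            break;
          }
          charsRead += inc;
        }
        return charsRead;
      }

      public static void main(String[] args) throws IOException {
        final String text = "0123456789abcdef"; // 16 chars, buffer holds 8
        System.out.println(readAndDrain(new StringReader(text), new char[8])); // 16
      }
    }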

BaseTokenStreamTestCase.java

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LineFileDocs;
 import org.apache.lucene.util._TestUtil;
 
 /**
@@ -359,8 +360,18 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   }
 
   private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
+    final LineFileDocs docs = new LineFileDocs(random);
+
     for (int i = 0; i < iterations; i++) {
       String text;
+
+      if (random.nextInt(10) == 7) {
+        text = docs.nextDoc().get("body");
+        if (text.length() > maxWordLength) {
+          text = text.substring(0, maxWordLength);
+        }
+      } else {
        if (simple) {
          text = random.nextBoolean() ? _TestUtil.randomSimpleString(random, maxWordLength) : _TestUtil.randomHtmlishString(random, maxWordLength);
        } else {
@@ -378,6 +389,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
          text = _TestUtil.randomUnicodeString(random, maxWordLength);
        }
       }
+      }
 
       if (VERBOSE) {
         System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);

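With the harness change above, roughly one iteration in ten runs near-natural text from LineFileDocs through the analyzer instead of a purely synthetic random string; it was this real-ish data that exposed the end() offset bugs fixed in the next two files. Tests drive it through the public checkRandomData overloads, as the test diffs at the bottom show. A usage sketch (MockAnalyzer is just a stand-in analyzer from the test framework):

    // inside a test extending BaseTokenStreamTestCase:
    Analyzer a = new MockAnalyzer(random);
    checkRandomData(random, a, 10000 * RANDOM_MULTIPLIER);     // default word length
    checkRandomData(random, a, 200 * RANDOM_MULTIPLIER, 8192); // long texts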
EdgeNGramTokenizer.java

@@ -73,7 +73,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
   private int maxGram;
   private int gramSize;
   private Side side;
-  private boolean started = false;
+  private boolean started;
   private int inLen; // length of the input AFTER trim()
   private int charsRead; // length of the input
   private String inStr;
@@ -178,7 +178,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
 
   /** Returns the next token in the stream, or null at EOS. */
   @Override
-  public final boolean incrementToken() throws IOException {
+  public boolean incrementToken() throws IOException {
     clearAttributes();
     // if we are just starting, read the whole input
     if (!started) {
@@ -188,13 +188,28 @@ public final class EdgeNGramTokenizer extends Tokenizer {
       charsRead = 0;
       // TODO: refactor to a shared readFully somewhere:
       while (charsRead < chars.length) {
-        int inc = input.read(chars, charsRead, chars.length-charsRead);
+        final int inc = input.read(chars, charsRead, chars.length-charsRead);
         if (inc == -1) {
           break;
         }
         charsRead += inc;
       }
+
       inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
+
+      if (charsRead == chars.length) {
+        // Read extra throwaway chars so that on end() we
+        // report the correct offset:
+        char[] throwaway = new char[1024];
+        while(true) {
+          final int inc = input.read(throwaway, 0, throwaway.length);
+          if (inc == -1) {
+            break;
+          }
+          charsRead += inc;
+        }
+      }
+
       inLen = inStr.length();
       if (inLen == 0) {
         return false;
@@ -221,21 +236,15 @@ public final class EdgeNGramTokenizer extends Tokenizer {
   }
 
   @Override
-  public final void end() {
+  public void end() {
     // set final offset
     final int finalOffset = correctOffset(charsRead);
     this.offsetAtt.setOffset(finalOffset, finalOffset);
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
-  }
-
-  @Override
   public void reset() throws IOException {
     super.reset();
     started = false;
+    charsRead = 0;
   }
 }

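What the change above buys, sketched as a check; the input length of 8192 is an assumption, chosen only to exceed the tokenizer's internal buffer:

    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 8192; i++) {
      sb.append('a');
    }
    final String longText = sb.toString();

    EdgeNGramTokenizer t = new EdgeNGramTokenizer(
        new StringReader(longText), EdgeNGramTokenizer.Side.FRONT, 1, 3);
    while (t.incrementToken()) {
      // "a", "aa", "aaa"
    }
    t.end();
    // Before this commit charsRead stopped at the buffer size; now end()
    // reports the full input length as the final offset:
    assert t.getAttribute(OffsetAttribute.class).endOffset() == longText.length();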
NGramTokenizer.java

@@ -34,11 +34,11 @@ public final class NGramTokenizer extends Tokenizer {
   private int minGram, maxGram;
   private int gramSize;
-  private int pos = 0;
+  private int pos;
   private int inLen; // length of the input AFTER trim()
   private int charsRead; // length of the input
   private String inStr;
-  private boolean started = false;
+  private boolean started;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -99,7 +99,7 @@ public final class NGramTokenizer extends Tokenizer {
 
   /** Returns the next token in the stream, or null at EOS. */
   @Override
-  public final boolean incrementToken() throws IOException {
+  public boolean incrementToken() throws IOException {
     clearAttributes();
     if (!started) {
       started = true;
@@ -115,6 +115,20 @@ public final class NGramTokenizer extends Tokenizer {
         charsRead += inc;
       }
       inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
+
+      if (charsRead == chars.length) {
+        // Read extra throwaway chars so that on end() we
+        // report the correct offset:
+        char[] throwaway = new char[1024];
+        while(true) {
+          final int inc = input.read(throwaway, 0, throwaway.length);
+          if (inc == -1) {
+            break;
+          }
+          charsRead += inc;
+        }
+      }
+
       inLen = inStr.length();
       if (inLen == 0) {
         return false;
@@ -138,22 +152,16 @@ public final class NGramTokenizer extends Tokenizer {
   }
 
   @Override
-  public final void end() {
+  public void end() {
     // set final offset
     final int finalOffset = correctOffset(charsRead);
     this.offsetAtt.setOffset(finalOffset, finalOffset);
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
-  }
-
-  @Override
   public void reset() throws IOException {
     super.reset();
     started = false;
     pos = 0;
+    charsRead = 0;
   }
 }

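NGramTokenizer gets the same drain fix, plus reset() now restores every piece of per-stream state (started, pos, charsRead), so a reused tokenizer cannot leak a stale offset from a previous input. A sketch of the reuse sequence this protects, assuming the reset(Reader)-then-reset() protocol of this era:

    NGramTokenizer t = new NGramTokenizer(new StringReader("abcd"), 1, 2);
    while (t.incrementToken()) { } // a, b, c, d, ab, bc, cd
    t.end();                       // final offset 4

    t.reset(new StringReader("xy")); // inherited Tokenizer.reset(Reader): swap input
    t.reset();                       // restores started/pos/charsRead
    while (t.incrementToken()) { }   // x, y, xy
    t.end();                         // final offset 2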
EdgeNGramTokenizerTest.java

@@ -110,6 +110,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
       }
     };
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
 
     Analyzer b = new Analyzer() {
       @Override
@@ -119,5 +120,6 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
       }
     };
     checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192);
   }
 }

NGramTokenizerTest.java

@@ -99,5 +99,6 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
       }
     };
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
   }
 }
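In both test files the new call mirrors the existing one but with 200*RANDOM_MULTIPLIER iterations and a maxWordLength of 8192: fewer iterations to keep runtime bounded, and a length presumably chosen to overflow the tokenizers' internal read buffer so that the new throwaway-read path in incrementToken() is actually exercised.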