mirror of https://github.com/apache/lucene.git
LUCENE-3905: sometimes run real-ish content (from LineFileDocs) through the analyzers too; fix end() offset bugs in the ngram tokenizers/filters
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1304525 13f79535-47bb-0310-9956-ffa450edef68
commit 7291d38535
parent eca9908cbe
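What the end() fix is about: once a TokenStream is exhausted, end() must set the offset attribute to the total number of characters consumed from the input (passed through correctOffset()), not to the post-trim() length; consumers rely on this final offset, for example to compute offsets across concatenated field values. A minimal consumer-side sketch of that contract, using only the public TokenStream API (the FinalOffsetCheck helper is illustrative, not part of this commit):

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

class FinalOffsetCheck {
  // Drains a token stream and returns the final offset reported by end().
  // For the ngram tokenizers this must equal the total chars read, even
  // when trailing whitespace was trim()'d away before producing grams.
  static int finalOffset(TokenStream ts) throws IOException {
    final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // consume every token; end() is only meaningful after exhaustion
    }
    ts.end();
    return offsetAtt.endOffset();
  }
}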
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LineFileDocs;
 import org.apache.lucene.util._TestUtil;
 
 /**
@@ -359,8 +360,18 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   }
 
   private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
+    final LineFileDocs docs = new LineFileDocs(random);
+
     for (int i = 0; i < iterations; i++) {
       String text;
+
+      if (random.nextInt(10) == 7) {
+        text = docs.nextDoc().get("body");
+        if (text.length() > maxWordLength) {
+          text = text.substring(0, maxWordLength);
+        }
+      } else {
         if (simple) {
           text = random.nextBoolean() ? _TestUtil.randomSimpleString(random, maxWordLength) : _TestUtil.randomHtmlishString(random, maxWordLength);
         } else {

@@ -378,6 +389,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
           text = _TestUtil.randomUnicodeString(random, maxWordLength);
         }
       }
+      }
 
       if (VERBOSE) {
         System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
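Why mix in LineFileDocs: purely random strings rarely contain realistic word boundaries, markup, or long runs, so some analyzer code paths never fire under the synthetic generators alone. The hunk above draws a real document line on roughly one iteration in ten. A condensed sketch of the sampling (the RandomTextSource wrapper is hypothetical; LineFileDocs and _TestUtil are the test-framework classes used in the diff):

import java.io.IOException;
import java.util.Random;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util._TestUtil;

class RandomTextSource {
  // ~1 in 10 draws uses real-ish content; the rest stay synthetic.
  static String nextText(Random random, LineFileDocs docs, int maxWordLength) throws IOException {
    if (random.nextInt(10) == 7) {
      final String text = docs.nextDoc().get("body");
      // bound the length, exactly as checkRandomData does
      return text.length() > maxWordLength ? text.substring(0, maxWordLength) : text;
    }
    return _TestUtil.randomUnicodeString(random, maxWordLength);
  }
}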
@@ -73,7 +73,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
   private int maxGram;
   private int gramSize;
   private Side side;
-  private boolean started = false;
+  private boolean started;
   private int inLen; // length of the input AFTER trim()
+  private int charsRead; // length of the input
   private String inStr;
@@ -178,7 +178,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
 
   /** Returns the next token in the stream, or null at EOS. */
   @Override
-  public final boolean incrementToken() throws IOException {
+  public boolean incrementToken() throws IOException {
     clearAttributes();
     // if we are just starting, read the whole input
     if (!started) {
@@ -188,13 +188,28 @@ public final class EdgeNGramTokenizer extends Tokenizer {
       charsRead = 0;
+      // TODO: refactor to a shared readFully somewhere:
       while (charsRead < chars.length) {
-        int inc = input.read(chars, charsRead, chars.length-charsRead);
+        final int inc = input.read(chars, charsRead, chars.length-charsRead);
         if (inc == -1) {
           break;
         }
         charsRead += inc;
       }
+
       inStr = new String(chars, 0, charsRead).trim();  // remove any trailing empty strings
+
+      if (charsRead == chars.length) {
+        // Read extra throwaway chars so that on end() we
+        // report the correct offset:
+        char[] throwaway = new char[1024];
+        while(true) {
+          final int inc = input.read(throwaway, 0, throwaway.length);
+          if (inc == -1) {
+            break;
+          }
+          charsRead += inc;
+        }
+      }
+
       inLen = inStr.length();
       if (inLen == 0) {
         return false;
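The TODO in the hunk above asks for a shared readFully; a sketch of what such a helper could look like (hypothetical, mirroring the loop in the diff):

import java.io.IOException;
import java.io.Reader;

class ReadFully {
  // Fill the buffer, tolerating short reads, and return the number of
  // chars actually read; -1 from read() means the stream ended early.
  static int readFully(Reader input, char[] buffer) throws IOException {
    int read = 0;
    while (read < buffer.length) {
      final int inc = input.read(buffer, read, buffer.length - read);
      if (inc == -1) {
        break;
      }
      read += inc;
    }
    return read;
  }
}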
@@ -221,21 +236,15 @@ public final class EdgeNGramTokenizer extends Tokenizer {
   }
 
   @Override
-  public final void end() {
+  public void end() {
     // set final offset
     final int finalOffset = correctOffset(charsRead);
     this.offsetAtt.setOffset(finalOffset, finalOffset);
   }
 
   @Override
   public void reset(Reader input) throws IOException {
     super.reset(input);
   }
 
   @Override
   public void reset() throws IOException {
     super.reset();
     started = false;
+    charsRead = 0;
   }
 }
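The two counters this fix distinguishes encode different lengths, and end() must use charsRead, not inLen. A tiny standalone illustration (values hypothetical, not from the commit):

class OffsetCounters {
  public static void main(String[] args) {
    String raw = "abc  ";             // input with trailing whitespace
    int charsRead = raw.length();     // 5: what end() must now report
    int inLen = raw.trim().length();  // 3: the old, too-small final offset
    System.out.println("charsRead=" + charsRead + " inLen=" + inLen);
  }
}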
@@ -34,11 +34,11 @@ public final class NGramTokenizer extends Tokenizer {
 
   private int minGram, maxGram;
   private int gramSize;
-  private int pos = 0;
+  private int pos;
   private int inLen; // length of the input AFTER trim()
+  private int charsRead; // length of the input
   private String inStr;
-  private boolean started = false;
+  private boolean started;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -99,7 +99,7 @@ public final class NGramTokenizer extends Tokenizer {
 
   /** Returns the next token in the stream, or null at EOS. */
   @Override
-  public final boolean incrementToken() throws IOException {
+  public boolean incrementToken() throws IOException {
     clearAttributes();
     if (!started) {
       started = true;
@@ -115,6 +115,20 @@ public final class NGramTokenizer extends Tokenizer {
         charsRead += inc;
       }
       inStr = new String(chars, 0, charsRead).trim();  // remove any trailing empty strings
+
+      if (charsRead == chars.length) {
+        // Read extra throwaway chars so that on end() we
+        // report the correct offset:
+        char[] throwaway = new char[1024];
+        while(true) {
+          final int inc = input.read(throwaway, 0, throwaway.length);
+          if (inc == -1) {
+            break;
+          }
+          charsRead += inc;
+        }
+      }
+
       inLen = inStr.length();
       if (inLen == 0) {
         return false;
@@ -138,22 +152,16 @@ public final class NGramTokenizer extends Tokenizer {
   }
 
   @Override
-  public final void end() {
+  public void end() {
     // set final offset
     final int finalOffset = correctOffset(charsRead);
     this.offsetAtt.setOffset(finalOffset, finalOffset);
   }
 
   @Override
   public void reset(Reader input) throws IOException {
     super.reset(input);
   }
 
   @Override
   public void reset() throws IOException {
     super.reset();
     started = false;
     pos = 0;
+    charsRead = 0;
   }
 }
@@ -110,6 +110,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
       }
     };
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
 
     Analyzer b = new Analyzer() {
       @Override

@@ -119,5 +120,6 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
       }
     };
     checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192);
   }
 }
@@ -99,5 +99,6 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
       }
     };
     checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
   }
 }
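The added checkRandomData calls feed the analyzers texts up to 8192 chars, well past the tokenizers' 1024-char internal buffer, so the new throwaway-read bookkeeping behind end() actually gets exercised. A hedged sketch of the shape of such a test (gram sizes and the analyzer body are illustrative, following the surrounding test classes):

Analyzer a = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // 2..4-grams are illustrative values, not from the commit
    return new TokenStreamComponents(new NGramTokenizer(reader, 2, 4));
  }
};
// 8192 > the 1024-char buffer: forces the extra reads behind end()
checkRandomData(random, a, 200 * RANDOM_MULTIPLIER, 8192);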