mirror of https://github.com/apache/lucene.git
LUCENE-3741: MockCharFilter offset correction is wrong
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1239040 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3f99c3c2fd
commit
9f3bffcb1f
|
@ -52,7 +52,7 @@ class MockCharFilter extends CharStream {
|
|||
bufferedCh = -1;
|
||||
currentOffset++;
|
||||
|
||||
addOffCorrectMap(currentOffset+delta, delta-1);
|
||||
addOffCorrectMap(currentOffset, delta-1);
|
||||
delta--;
|
||||
return ch;
|
||||
}
|
||||
|
|
|
@ -75,6 +75,7 @@ public class MockTokenizer extends Tokenizer {
|
|||
};
|
||||
|
||||
private State streamState = State.CLOSE;
|
||||
private int lastOffset = 0; // only for asserting
|
||||
private boolean enableChecks = true;
|
||||
|
||||
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
|
||||
|
@ -116,7 +117,15 @@ public class MockTokenizer extends Tokenizer {
|
|||
}
|
||||
cp = readCodePoint();
|
||||
} while (cp >= 0 && isTokenChar(cp));
|
||||
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
|
||||
|
||||
int correctedStartOffset = correctOffset(startOffset);
|
||||
int correctedEndOffset = correctOffset(endOffset);
|
||||
assert correctedStartOffset >= 0;
|
||||
assert correctedEndOffset >= 0;
|
||||
assert correctedStartOffset >= lastOffset;
|
||||
lastOffset = correctedStartOffset;
|
||||
assert correctedEndOffset >= correctedStartOffset;
|
||||
offsetAtt.setOffset(correctedStartOffset, correctedEndOffset);
|
||||
streamState = State.INCREMENT;
|
||||
return true;
|
||||
}
|
||||
|
@ -162,7 +171,7 @@ public class MockTokenizer extends Tokenizer {
|
|||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
state = runAutomaton.getInitialState();
|
||||
off = 0;
|
||||
lastOffset = off = 0;
|
||||
assert !enableChecks || streamState != State.RESET : "double reset()";
|
||||
streamState = State.RESET;
|
||||
}
|
||||
|
|
|
@ -3,6 +3,9 @@ package org.apache.lucene.analysis;
|
|||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.BasicAutomata;
|
||||
import org.apache.lucene.util.automaton.BasicOperations;
|
||||
|
@ -116,4 +119,21 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
|
|||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random, new MockAnalyzer(random), atLeast(1000));
|
||||
}
|
||||
|
||||
public void testForwardOffsets() throws Exception {
|
||||
int num = atLeast(10000);
|
||||
for (int i = 0; i < num; i++) {
|
||||
String s = _TestUtil.randomHtmlishString(random, 20);
|
||||
StringReader reader = new StringReader(s);
|
||||
MockCharFilter charfilter = new MockCharFilter(CharReader.get(reader), 2);
|
||||
MockAnalyzer analyzer = new MockAnalyzer(random);
|
||||
TokenStream ts = analyzer.tokenStream("bogus", charfilter);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
;
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue