mirror of https://github.com/apache/lucene.git
LUCENE-3741: MockCharFilter offset correction is wrong
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1239040 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3f99c3c2fd
commit
9f3bffcb1f
|
@ -52,7 +52,7 @@ class MockCharFilter extends CharStream {
|
||||||
bufferedCh = -1;
|
bufferedCh = -1;
|
||||||
currentOffset++;
|
currentOffset++;
|
||||||
|
|
||||||
addOffCorrectMap(currentOffset+delta, delta-1);
|
addOffCorrectMap(currentOffset, delta-1);
|
||||||
delta--;
|
delta--;
|
||||||
return ch;
|
return ch;
|
||||||
}
|
}
|
||||||
|
|
|
@ -75,6 +75,7 @@ public class MockTokenizer extends Tokenizer {
|
||||||
};
|
};
|
||||||
|
|
||||||
private State streamState = State.CLOSE;
|
private State streamState = State.CLOSE;
|
||||||
|
private int lastOffset = 0; // only for asserting
|
||||||
private boolean enableChecks = true;
|
private boolean enableChecks = true;
|
||||||
|
|
||||||
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
|
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
|
||||||
|
@ -116,7 +117,15 @@ public class MockTokenizer extends Tokenizer {
|
||||||
}
|
}
|
||||||
cp = readCodePoint();
|
cp = readCodePoint();
|
||||||
} while (cp >= 0 && isTokenChar(cp));
|
} while (cp >= 0 && isTokenChar(cp));
|
||||||
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
|
|
||||||
|
int correctedStartOffset = correctOffset(startOffset);
|
||||||
|
int correctedEndOffset = correctOffset(endOffset);
|
||||||
|
assert correctedStartOffset >= 0;
|
||||||
|
assert correctedEndOffset >= 0;
|
||||||
|
assert correctedStartOffset >= lastOffset;
|
||||||
|
lastOffset = correctedStartOffset;
|
||||||
|
assert correctedEndOffset >= correctedStartOffset;
|
||||||
|
offsetAtt.setOffset(correctedStartOffset, correctedEndOffset);
|
||||||
streamState = State.INCREMENT;
|
streamState = State.INCREMENT;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -162,7 +171,7 @@ public class MockTokenizer extends Tokenizer {
|
||||||
public void reset() throws IOException {
|
public void reset() throws IOException {
|
||||||
super.reset();
|
super.reset();
|
||||||
state = runAutomaton.getInitialState();
|
state = runAutomaton.getInitialState();
|
||||||
off = 0;
|
lastOffset = off = 0;
|
||||||
assert !enableChecks || streamState != State.RESET : "double reset()";
|
assert !enableChecks || streamState != State.RESET : "double reset()";
|
||||||
streamState = State.RESET;
|
streamState = State.RESET;
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,9 @@ package org.apache.lucene.analysis;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
import org.apache.lucene.util.automaton.Automaton;
|
import org.apache.lucene.util.automaton.Automaton;
|
||||||
import org.apache.lucene.util.automaton.BasicAutomata;
|
import org.apache.lucene.util.automaton.BasicAutomata;
|
||||||
import org.apache.lucene.util.automaton.BasicOperations;
|
import org.apache.lucene.util.automaton.BasicOperations;
|
||||||
|
@ -116,4 +119,21 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
|
||||||
public void testRandomStrings() throws Exception {
|
public void testRandomStrings() throws Exception {
|
||||||
checkRandomData(random, new MockAnalyzer(random), atLeast(1000));
|
checkRandomData(random, new MockAnalyzer(random), atLeast(1000));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testForwardOffsets() throws Exception {
|
||||||
|
int num = atLeast(10000);
|
||||||
|
for (int i = 0; i < num; i++) {
|
||||||
|
String s = _TestUtil.randomHtmlishString(random, 20);
|
||||||
|
StringReader reader = new StringReader(s);
|
||||||
|
MockCharFilter charfilter = new MockCharFilter(CharReader.get(reader), 2);
|
||||||
|
MockAnalyzer analyzer = new MockAnalyzer(random);
|
||||||
|
TokenStream ts = analyzer.tokenStream("bogus", charfilter);
|
||||||
|
ts.reset();
|
||||||
|
while (ts.incrementToken()) {
|
||||||
|
;
|
||||||
|
}
|
||||||
|
ts.end();
|
||||||
|
ts.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue