LUCENE-3741: MockCharFilter offset correction is wrong

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1239040 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-02-01 10:24:57 +00:00
parent 3f99c3c2fd
commit 9f3bffcb1f
3 changed files with 32 additions and 3 deletions

View File

@@ -52,7 +52,7 @@ class MockCharFilter extends CharStream {
     bufferedCh = -1;
     currentOffset++;
-    addOffCorrectMap(currentOffset+delta, delta-1);
+    addOffCorrectMap(currentOffset, delta-1);
     delta--;
     return ch;
   }

View File

@@ -75,6 +75,7 @@ public class MockTokenizer extends Tokenizer {
   };
   private State streamState = State.CLOSE;
+  private int lastOffset = 0; // only for asserting
   private boolean enableChecks = true;
 
   public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
@@ -116,7 +117,15 @@ public class MockTokenizer extends Tokenizer {
       }
       cp = readCodePoint();
     } while (cp >= 0 && isTokenChar(cp));
-    offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
+    int correctedStartOffset = correctOffset(startOffset);
+    int correctedEndOffset = correctOffset(endOffset);
+    assert correctedStartOffset >= 0;
+    assert correctedEndOffset >= 0;
+    assert correctedStartOffset >= lastOffset;
+    lastOffset = correctedStartOffset;
+    assert correctedEndOffset >= correctedStartOffset;
+    offsetAtt.setOffset(correctedStartOffset, correctedEndOffset);
     streamState = State.INCREMENT;
     return true;
   }
@@ -162,7 +171,7 @@ public class MockTokenizer extends Tokenizer {
   public void reset() throws IOException {
     super.reset();
     state = runAutomaton.getInitialState();
-    off = 0;
+    lastOffset = off = 0;
     assert !enableChecks || streamState != State.RESET : "double reset()";
     streamState = State.RESET;
   }

View File

@@ -3,6 +3,9 @@ package org.apache.lucene.analysis;
 import java.io.StringReader;
 import java.util.Arrays;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.BasicAutomata;
 import org.apache.lucene.util.automaton.BasicOperations;
@@ -116,4 +119,21 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, new MockAnalyzer(random), atLeast(1000));
   }
+
+  public void testForwardOffsets() throws Exception {
+    int num = atLeast(10000);
+    for (int i = 0; i < num; i++) {
+      String s = _TestUtil.randomHtmlishString(random, 20);
+      StringReader reader = new StringReader(s);
+      MockCharFilter charfilter = new MockCharFilter(CharReader.get(reader), 2);
+      MockAnalyzer analyzer = new MockAnalyzer(random);
+      TokenStream ts = analyzer.tokenStream("bogus", charfilter);
+      ts.reset();
+      while (ts.incrementToken()) {
+        ;
+      }
+      ts.end();
+      ts.close();
+    }
+  }
 }