From 9f3bffcb1f4fbbdbcadda174c19711a869eab060 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Wed, 1 Feb 2012 10:24:57 +0000
Subject: [PATCH] LUCENE-3741: MockCharFilter offset correction is wrong

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1239040 13f79535-47bb-0310-9956-ffa450edef68
---
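
Notes:

MockCharFilter inserts extra characters into the stream it filters, so it
records offset corrections that map offsets in the filtered stream back to
the original input. The old code keyed each correction by currentOffset+delta,
an offset already shifted into original-input space; correctOffset() is
queried with filtered-stream offsets, so corrections took effect at the wrong
positions and corrected offsets could move backwards. Below is a minimal
standalone sketch of the intended bookkeeping (plain Java with a hypothetical
OffsetCorrector class, not the Lucene CharStream API), assuming corrections
are keyed by filtered-stream offsets as in the one-line fix to
MockCharFilter.java below:

    import java.util.Map;
    import java.util.TreeMap;

    class OffsetCorrector {
      // key: offset in the *filtered* stream; value: cumulative correction
      // at that point (negative once characters have been inserted)
      private final TreeMap<Integer,Integer> corrections =
          new TreeMap<Integer,Integer>();

      // record that every filtered offset >= off now maps back to the
      // original input by cumulativeDelta positions
      void addOffCorrectMap(int off, int cumulativeDelta) {
        corrections.put(off, cumulativeDelta);
      }

      // map an offset in the filtered stream back to the original input:
      // apply the correction recorded at the largest key <= currentOff
      int correctOffset(int currentOff) {
        Map.Entry<Integer,Integer> e = corrections.floorEntry(currentOff);
        return e == null ? currentOff : currentOff + e.getValue();
      }
    }

With this bookkeeping, keying a correction at currentOffset+delta (a smaller,
already-corrected value once delta is negative) would make the correction
apply to earlier filtered offsets than the actual insertion point.
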
 .../lucene/analysis/MockCharFilter.java         |  2 +-
 .../apache/lucene/analysis/MockTokenizer.java   | 13 ++++++++++--
 .../lucene/analysis/TestMockAnalyzer.java       | 20 +++++++++++++++++++
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java b/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java
index e1e6e88c78c..0db21515147 100644
--- a/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java
+++ b/lucene/src/test-framework/java/org/apache/lucene/analysis/MockCharFilter.java
@@ -52,7 +52,7 @@ class MockCharFilter extends CharStream {
       bufferedCh = -1;
       currentOffset++;
       
-      addOffCorrectMap(currentOffset+delta, delta-1);
+      addOffCorrectMap(currentOffset, delta-1);
       delta--;
       return ch;
     }
diff --git a/lucene/src/test-framework/java/org/apache/lucene/analysis/MockTokenizer.java b/lucene/src/test-framework/java/org/apache/lucene/analysis/MockTokenizer.java
index 316fe9fbfb0..8b14cff4b8b 100644
--- a/lucene/src/test-framework/java/org/apache/lucene/analysis/MockTokenizer.java
+++ b/lucene/src/test-framework/java/org/apache/lucene/analysis/MockTokenizer.java
@@ -75,6 +75,7 @@ public class MockTokenizer extends Tokenizer {
   };
   
   private State streamState = State.CLOSE;
+  private int lastOffset = 0; // only for asserting
   private boolean enableChecks = true;
   
   public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
@@ -116,7 +117,15 @@ public class MockTokenizer extends Tokenizer {
         }
         cp = readCodePoint();
       } while (cp >= 0 && isTokenChar(cp));
-      offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
+      
+      int correctedStartOffset = correctOffset(startOffset);
+      int correctedEndOffset = correctOffset(endOffset);
+      assert correctedStartOffset >= 0;
+      assert correctedEndOffset >= 0;
+      assert correctedStartOffset >= lastOffset;
+      lastOffset = correctedStartOffset;
+      assert correctedEndOffset >= correctedStartOffset;
+      offsetAtt.setOffset(correctedStartOffset, correctedEndOffset);
       streamState = State.INCREMENT;
       return true;
     }
@@ -162,7 +171,7 @@ public class MockTokenizer extends Tokenizer {
   public void reset() throws IOException {
     super.reset();
     state = runAutomaton.getInitialState();
-    off = 0;
+    lastOffset = off = 0;
     assert !enableChecks || streamState != State.RESET : "double reset()";
     streamState = State.RESET;
   }
diff --git a/lucene/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java b/lucene/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
index f840700d280..71d6e3a343f 100644
--- a/lucene/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
+++ b/lucene/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
@@ -3,6 +3,9 @@ package org.apache.lucene.analysis;
 import java.io.StringReader;
 import java.util.Arrays;
 
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.BasicAutomata;
 import org.apache.lucene.util.automaton.BasicOperations;
@@ -116,4 +119,21 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random, new MockAnalyzer(random), atLeast(1000));
   }
+  
+  public void testForwardOffsets() throws Exception {
+    int num = atLeast(10000);
+    for (int i = 0; i < num; i++) {
+      String s = _TestUtil.randomHtmlishString(random, 20);
+      StringReader reader = new StringReader(s);
+      MockCharFilter charfilter = new MockCharFilter(CharReader.get(reader), 2);
+      MockAnalyzer analyzer = new MockAnalyzer(random);
+      TokenStream ts = analyzer.tokenStream("bogus", charfilter);
+      ts.reset();
+      while (ts.incrementToken()) {
+        ;
+      }
+      ts.end();
+      ts.close();
+    }
+  }
 }
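
The new asserts in MockTokenizer encode the invariant that testForwardOffsets
exercises: corrected offsets are non-negative, a token's start offset never
precedes the previous token's start offset, and end never precedes start.
For illustration only, the same invariant can be checked against any
TokenStream via the public OffsetAttribute API (a sketch; the stream "ts" is
assumed to come from an analyzer as in the test above):

    // check that offsets only move forwards across the whole stream
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    int lastStart = 0;
    ts.reset();
    while (ts.incrementToken()) {
      int start = offsetAtt.startOffset();
      int end = offsetAtt.endOffset();
      if (start < 0 || start < lastStart || end < start) {
        throw new AssertionError("offsets went backwards: start=" + start
            + " end=" + end + " lastStart=" + lastStart);
      }
      lastStart = start;
    }
    ts.end();
    ts.close();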