LUCENE-3717: add better offsets testing to BaseTokenStreamTestCase, fix offsets bugs in ThaiWordFilter and ICUTokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234652 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-01-23 00:08:52 +00:00
parent f7a474d603
commit c754c1c9c8
8 changed files with 205 additions and 19 deletions
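The gist of the "better offsets testing" is a handful of offset invariants: every token's offsets must be non-negative, the end offset must not precede the start offset or point past the input, and the offset reported after end() must equal the input length. A minimal standalone sketch of that idea (illustrative class and method names, not the committed BaseTokenStreamTestCase code):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

// Illustrative only, not the committed test code.
class OffsetInvariantCheck {
  static void assertSaneOffsets(Analyzer a, String text) throws IOException {
    TokenStream ts = a.tokenStream("dummy", new StringReader(text));
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      int start = offsetAtt.startOffset();
      int end = offsetAtt.endOffset();
      assert start >= 0 && start <= end : "negative or reversed offsets: " + start + "," + end;
      assert end <= text.length() : "offset past end of input: " + end;
    }
    ts.end();
    // after end() the offset must point at the true end of the input,
    // otherwise highlighters can read past the stored field value
    assert offsetAtt.endOffset() == text.length() : "bogus final offset";
    ts.close();
  }
}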

View File: CHANGES.txt

@@ -814,10 +814,10 @@ Bug fixes
* LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
to clones/reopened readers. (Uwe Schindler)
* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
compound token filters, and smart chinese where they would create invalid
offsets in some situations, leading to problems in highlighting.
(Max Beutel, Edwin Steiner via Robert Muir)
* LUCENE-3642, SOLR-2891, LUCENE-3717: Fixed bugs in CharTokenizer, n-gram filters,
compound token filters, thai word filter, icutokenizer, and smart chinese
where they would create invalid offsets in some situations, leading to problems
in highlighting. (Max Beutel, Edwin Steiner via Robert Muir)
* LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
Float.MIN_VALUE when it should be Float.NaN, when there were 0

View File: BaseTokenStreamTestCase.java

@@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.io.IOException;
import java.util.ArrayList;
@@ -291,6 +292,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
};
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
}
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
for (int i = 0; i < iterations; i++) {
String text;
switch(_TestUtil.nextInt(random, 0, 4)) {
@@ -311,7 +316,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
}
TokenStream ts = a.tokenStream("dummy", new StringReader(text));
int remainder = random.nextInt(10);
Reader reader = new StringReader(text);
TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
@@ -339,30 +346,38 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
if (VERBOSE) {
System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
}
reader = new StringReader(text);
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertAnalyzesToReuse(a, text,
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions));
toIntArray(positions),
text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertAnalyzesToReuse(a, text,
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
toIntArray(positions));
null,
toIntArray(positions),
text.length());
} else if (offsetAtt != null) {
// offset
assertAnalyzesToReuse(a, text,
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets));
toIntArray(endOffsets),
null,
null,
text.length());
} else {
// terms only
assertAnalyzesToReuse(a, text,
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
}
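With the new boolean parameter, a test can exercise the same random data both with and without the MockCharFilter wrapping. A typical call from inside a BaseTokenStreamTestCase subclass might look like the following (illustrative iteration count and word length; "analyzer" and the Random instance stand in for whatever the test actually supplies):

// run random-text iterations with the input wrapped in MockCharFilter, so a
// missing correctOffset() call shows up as an out-of-bounds offset
checkRandomData(new java.util.Random(), analyzer, 10000, 20, true);
// and again reading straight from the StringReader, with no char filter
checkRandomData(new java.util.Random(), analyzer, 10000, 20, false);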

View File: MockCharFilter.java

@@ -0,0 +1,100 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.SortedMap;
import java.util.TreeMap;
// the purpose of this charfilter is to send offsets out of bounds
// if the analyzer doesn't use correctOffset or does incorrect offset math.
class MockCharFilter extends CharStream {
final Reader in;
final int remainder;
// for testing only
public MockCharFilter(Reader in, int remainder) {
this.in = in;
this.remainder = remainder;
assert remainder >= 0 && remainder < 10 : "invalid parameter";
}
@Override
public void close() throws IOException {
in.close();
}
int currentOffset = -1;
int delta = 0;
int bufferedCh = -1;
@Override
public int read() throws IOException {
// we have a buffered character, add an offset correction and return it
if (bufferedCh >= 0) {
int ch = bufferedCh;
bufferedCh = -1;
currentOffset++;
addOffCorrectMap(currentOffset+delta, delta-1);
delta--;
return ch;
}
// otherwise actually read one
int ch = in.read();
if (ch < 0)
return ch;
currentOffset++;
if ((ch % 10) != remainder || Character.isHighSurrogate((char)ch) || Character.isLowSurrogate((char)ch)) {
return ch;
}
// we will double this character, so buffer it.
bufferedCh = ch;
return ch;
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
int numRead = 0;
for (int i = off; i < off + len; i++) {
int c = read();
if (c == -1) break;
cbuf[i] = (char) c;
numRead++;
}
return numRead == 0 ? -1 : numRead;
}
@Override
public int correctOffset(int currentOff) {
SortedMap<Integer,Integer> subMap = corrections.subMap(0, currentOff+1);
int ret = subMap.isEmpty() ? currentOff : currentOff + subMap.get(subMap.lastKey());
assert ret >= 0 : "currentOff=" + currentOff + ",diff=" + (ret-currentOff);
return ret;
}
protected void addOffCorrectMap(int off, int cumulativeDiff) {
corrections.put(off, cumulativeDiff);
}
TreeMap<Integer,Integer> corrections = new TreeMap<Integer,Integer>();
}
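To make the doubling and the correction map concrete: with remainder 7, 'a' (code point 97, 97 % 10 == 7) is emitted twice and a -1 correction is recorded at the duplicate's offset, so corrected offsets point back into the original text. A small illustrative read loop (not part of the commit; the demo class name is hypothetical, and it would need to live in org.apache.lucene.analysis because MockCharFilter is package-private):

import java.io.IOException;
import java.io.StringReader;

// Illustrative only, not committed code.
class MockCharFilterDemo {
  static void demo() throws IOException {
    // reading "ab" with remainder 7 yields "aab": 'a' (97 % 10 == 7) is doubled,
    // 'b' (98 % 10 == 8) is passed through unchanged
    MockCharFilter cs = new MockCharFilter(new StringReader("ab"), 7);
    StringBuilder out = new StringBuilder();
    for (int ch = cs.read(); ch != -1; ch = cs.read()) {
      out.append((char) ch);
    }
    // offset 3, the end of the filtered text "aab", corrects back to offset 2,
    // the end of the original input "ab", exactly what TestMockCharFilter asserts
    int corrected = cs.correctOffset(3);
    cs.close();
    assert out.toString().equals("aab") && corrected == 2;
  }
}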

View File: TestMockCharFilter.java

@@ -0,0 +1,58 @@
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestMockCharFilter extends BaseTokenStreamTestCase {
public void test() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new MockCharFilter(CharReader.get(reader), 7);
}
};
assertAnalyzesTo(analyzer, "ab",
new String[] { "aab" },
new int[] { 0 },
new int[] { 2 }
);
assertAnalyzesTo(analyzer, "aba",
new String[] { "aabaa" },
new int[] { 0 },
new int[] { 3 }
);
assertAnalyzesTo(analyzer, "abcdefga",
new String[] { "aabcdefgaa" },
new int[] { 0 },
new int[] { 8 }
);
}
}
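A quick sanity check of the last expectation: in "abcdefga" the two 'a' characters (97 % 10 == 7, at input offsets 0 and 7) are doubled, so the emitted token is the ten-character "aabcdefgaa", yet the corrected end offset is 8, the length of the original input, which is exactly what the assertion above expects.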

View File: ThaiWordFilter.java

@@ -68,6 +68,7 @@ public final class ThaiWordFilter extends TokenFilter {
private CharTermAttribute clonedTermAtt = null;
private OffsetAttribute clonedOffsetAtt = null;
private boolean hasMoreTokensInClone = false;
private boolean hasIllegalOffsets = false; // true only if the token length was changed by a filter earlier in the chain
/** Creates a new ThaiWordFilter with the specified match version. */
public ThaiWordFilter(Version matchVersion, TokenStream input) {
@@ -86,7 +87,11 @@ public final class ThaiWordFilter extends TokenFilter {
if (end != BreakIterator.DONE) {
clonedToken.copyTo(this);
termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
if (hasIllegalOffsets) {
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
} else {
offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
}
if (handlePosIncr) posAtt.setPositionIncrement(1);
return true;
}
@@ -103,6 +108,10 @@ public final class ThaiWordFilter extends TokenFilter {
hasMoreTokensInClone = true;
// if the length implied by the start and end offsets doesn't match the term text,
// assume this is a synonym and don't adjust the offsets.
hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();
// we lazy init the cloned token, as in ctor not all attributes may be added
if (clonedToken == null) {
clonedToken = cloneAttributes();
@@ -118,7 +127,11 @@ public final class ThaiWordFilter extends TokenFilter {
int end = breaker.next();
if (end != BreakIterator.DONE) {
termAtt.setLength(end);
if (hasIllegalOffsets) {
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
} else {
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
}
// the position increment is kept as-is for the first token
return true;
}
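The guard added here is just a length comparison between the term text and the span its offsets claim to cover. A tiny illustration with made-up values (not the filter's actual state; the variable names mirror the attributes above):

// hypothetical incoming token: a filter earlier in the chain replaced the surface
// form, so the term text no longer matches the character span its offsets cover
int startOffset = 0, endOffset = 5;        // offsets still span the original 5-char word
String term = "longersynonym";             // replacement text with a different length
boolean hasIllegalOffsets = (endOffset - startOffset) != term.length();   // true here
// when the flag is set, ThaiWordFilter emits every sub-token with the clone's
// original offsets instead of computing new ones from the break positions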

View File: HTMLStripCharFilterTest.java

@@ -503,7 +503,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
@Override
protected Reader initReader(Reader reader) {
return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
return new HTMLStripCharFilter(CharReader.get(reader));
}
};

View File: TestSegmentingTokenizerBase.java

@@ -160,7 +160,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
hasSentence = false;
clearAttributes();
termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
return true;
} else {
return false;
@@ -215,7 +215,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
clearAttributes();
termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
posBoost = 0;
return true;

View File: ICUTokenizer.java

@@ -111,7 +111,7 @@ public final class ICUTokenizer extends Tokenizer {
@Override
public void end() throws IOException {
final int finalOffset = (length < 0) ? offset : offset + length;
offsetAtt.setOffset(finalOffset, finalOffset);
offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
}
/*