LUCENE-3717: add better offsets testing to BaseTokenStreamTestCase, fix offsets bugs in ThaiWordFilter and ICUTokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234652 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-01-23 00:08:52 +00:00
parent f7a474d603
commit c754c1c9c8
8 changed files with 205 additions and 19 deletions
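For orientation before the diff: the idea is that checkRandomData now randomly wraps the analyzer's input in a MockCharFilter, so any component that does its own offset arithmetic instead of calling correctOffset() produces offsets that no longer fit the original text and fails the test. A minimal sketch of a caller (hypothetical test class; assumes the LuceneTestCase random field and MockTokenizer from the test framework of this era):

package org.apache.lucene.analysis;

import java.io.Reader;

public class TestMyAnalyzerOffsets extends BaseTokenStreamTestCase {
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    // 1000 random strings of up to 20 chars; roughly half the iterations
    // analyze text seen through MockCharFilter, which shifts offsets
    checkRandomData(random, a, 1000, 20);
  }
}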

CHANGES.txt

@@ -814,10 +814,10 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers.  (Uwe Schindler)
 
-* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
-  compound token filters, and smart chinese where they would create invalid
-  offsets in some situations, leading to problems in highlighting.
-  (Max Beutel, Edwin Steiner via Robert Muir)
+* LUCENE-3642, SOLR-2891, LUCENE-3717: Fixed bugs in CharTokenizer, n-gram filters,
+  compound token filters, thai word filter, icutokenizer, and smart chinese
+  where they would create invalid offsets in some situations, leading to problems
+  in highlighting.  (Max Beutel, Edwin Steiner via Robert Muir)
 
 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0

BaseTokenStreamTestCase.java

@@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
+import java.io.Reader;
 import java.io.StringReader;
 import java.io.IOException;
 import java.util.ArrayList;
@@ -291,6 +292,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   };
 
   public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
+    checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
+  }
+
+  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
     for (int i = 0; i < iterations; i++) {
       String text;
       switch(_TestUtil.nextInt(random, 0, 4)) {
@@ -311,7 +316,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
       }
 
-      TokenStream ts = a.tokenStream("dummy", new StringReader(text));
+      int remainder = random.nextInt(10);
+      Reader reader = new StringReader(text);
+      TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
       assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
       CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
       OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
@@ -339,30 +346,38 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       if (VERBOSE) {
        System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
       }
+      reader = new StringReader(text);
+      ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
       if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
         // offset + pos + type
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
           toIntArray(endOffsets),
           types.toArray(new String[types.size()]),
-          toIntArray(positions));
+          toIntArray(positions),
+          text.length());
       } else if (posIncAtt != null && offsetAtt != null) {
         // offset + pos
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
           toIntArray(endOffsets),
-          toIntArray(positions));
+          null,
+          toIntArray(positions),
+          text.length());
       } else if (offsetAtt != null) {
         // offset
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
-          toIntArray(endOffsets));
+          toIntArray(endOffsets),
+          null,
+          null,
+          text.length());
       } else {
         // terms only
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]));
       }
     }
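Two properties get exercised by this re-run: the analyzer must produce identical tokens when its streams are reused, and, via the new final text.length() argument, assertTokenStreamContents also checks that end() leaves the offset attribute exactly at the input length — the invariant MockCharFilter breaks for components that skip correctOffset().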

MockCharFilter.java (new file)

@@ -0,0 +1,100 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.SortedMap;
import java.util.TreeMap;
// the purpose of this charfilter is to send offsets out of bounds
// if the analyzer doesn't use correctOffset or does incorrect offset math.
class MockCharFilter extends CharStream {
final Reader in;
final int remainder;
// for testing only
public MockCharFilter(Reader in, int remainder) {
this.in = in;
this.remainder = remainder;
assert remainder >= 0 && remainder < 10 : "invalid parameter";
}
@Override
public void close() throws IOException {
in.close();
}
int currentOffset = -1;
int delta = 0;
int bufferedCh = -1;
@Override
public int read() throws IOException {
// we have a buffered character, add an offset correction and return it
if (bufferedCh >= 0) {
int ch = bufferedCh;
bufferedCh = -1;
currentOffset++;
addOffCorrectMap(currentOffset+delta, delta-1);
delta--;
return ch;
}
// otherwise actually read one
int ch = in.read();
if (ch < 0)
return ch;
currentOffset++;
if ((ch % 10) != remainder || Character.isHighSurrogate((char)ch) || Character.isLowSurrogate((char)ch)) {
return ch;
}
// we will double this character, so buffer it.
bufferedCh = ch;
return ch;
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
int numRead = 0;
for (int i = off; i < off + len; i++) {
int c = read();
if (c == -1) break;
cbuf[i] = (char) c;
numRead++;
}
return numRead == 0 ? -1 : numRead;
}
@Override
public int correctOffset(int currentOff) {
SortedMap<Integer,Integer> subMap = corrections.subMap(0, currentOff+1);
int ret = subMap.isEmpty() ? currentOff : currentOff + subMap.get(subMap.lastKey());
assert ret >= 0 : "currentOff=" + currentOff + ",diff=" + (ret-currentOff);
return ret;
}
protected void addOffCorrectMap(int off, int cumulativeDiff) {
corrections.put(off, cumulativeDiff);
}
TreeMap<Integer,Integer> corrections = new TreeMap<Integer,Integer>();
}
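A worked example of the offset bookkeeping above (a sketch, not part of the commit; MockCharFilter is package-private, so the hypothetical demo class below assumes code in the same package). With remainder 7, 'a' (code point 97, and 97 % 10 == 7) gets doubled:

package org.apache.lucene.analysis;

class DemoMockCharFilterOffsets {
  public static void main(String[] args) throws java.io.IOException {
    MockCharFilter in = new MockCharFilter(new java.io.StringReader("ab"), 7);
    // read everything: the first 'a' is buffered and emitted twice, and
    // addOffCorrectMap(1, -1) records that output offsets >= 1 sit one
    // character past their position in the original input
    StringBuilder out = new StringBuilder();
    int c;
    while ((c = in.read()) != -1) out.append((char) c);
    System.out.println(out);                 // prints "aab"
    System.out.println(in.correctOffset(0)); // 0  (no correction entry below offset 1)
    System.out.println(in.correctOffset(3)); // 2  (3 - 1, the true end of "ab")
  }
}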

TestMockCharFilter.java (new file)

@@ -0,0 +1,58 @@
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestMockCharFilter extends BaseTokenStreamTestCase {
public void test() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new MockCharFilter(CharReader.get(reader), 7);
}
};
assertAnalyzesTo(analyzer, "ab",
new String[] { "aab" },
new int[] { 0 },
new int[] { 2 }
);
assertAnalyzesTo(analyzer, "aba",
new String[] { "aabaa" },
new int[] { 0 },
new int[] { 3 }
);
assertAnalyzesTo(analyzer, "abcdefga",
new String[] { "aabcdefgaa" },
new int[] { 0 },
new int[] { 8 }
);
}
}
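Note how the assertions line up with the doubling rule: every 'a' (code point 97, 97 % 10 == 7) is doubled, so "abcdefga" is tokenized from the filtered text "aabcdefgaa", yet the corrected offsets 0 and 8 still span the original eight-character input.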

ThaiWordFilter.java

@@ -68,6 +68,7 @@ public final class ThaiWordFilter extends TokenFilter {
   private CharTermAttribute clonedTermAtt = null;
   private OffsetAttribute clonedOffsetAtt = null;
   private boolean hasMoreTokensInClone = false;
+  private boolean hasIllegalOffsets = false; // only if the length changed before this filter
 
   /** Creates a new ThaiWordFilter with the specified match version. */
   public ThaiWordFilter(Version matchVersion, TokenStream input) {
@@ -86,7 +87,11 @@ public final class ThaiWordFilter extends TokenFilter {
       if (end != BreakIterator.DONE) {
         clonedToken.copyTo(this);
         termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
-        offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+        if (hasIllegalOffsets) {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+        } else {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+        }
         if (handlePosIncr) posAtt.setPositionIncrement(1);
         return true;
       }
@@ -103,6 +108,10 @@ public final class ThaiWordFilter extends TokenFilter {
       hasMoreTokensInClone = true;
 
+      // if length by start + end offsets doesn't match the term text then assume
+      // this is a synonym and don't adjust the offsets.
+      hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();
+
       // we lazy init the cloned token, as in ctor not all attributes may be added
       if (clonedToken == null) {
         clonedToken = cloneAttributes();
@@ -118,7 +127,11 @@ public final class ThaiWordFilter extends TokenFilter {
     int end = breaker.next();
     if (end != BreakIterator.DONE) {
       termAtt.setLength(end);
-      offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+      if (hasIllegalOffsets) {
+        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+      } else {
+        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+      }
       // position increment keeps as it is for first token
       return true;
     }
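The guard added here is a length heuristic: if endOffset() - startOffset() disagrees with termAtt.length(), some earlier filter (a synonym injector, for instance) replaced the term text without matching offsets, so adding break positions to the clone's startOffset() would yield offsets pointing outside the original text; in that case every sub-word now keeps the whole token's start/end unchanged.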

HTMLStripCharFilterTest.java

@@ -503,7 +503,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
       @Override
       protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+        return new HTMLStripCharFilter(CharReader.get(reader));
       }
     };

TestSegmentingTokenizerBase.java

@@ -160,7 +160,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
       hasSentence = false;
       clearAttributes();
       termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
-      offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
+      offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
       return true;
     } else {
       return false;
@@ -215,7 +215,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
       clearAttributes();
       termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
-      offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
+      offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
       posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
       posBoost = 0;
       return true;

ICUTokenizer.java

@@ -111,7 +111,7 @@ public final class ICUTokenizer extends Tokenizer {
   @Override
   public void end() throws IOException {
     final int finalOffset = (length < 0) ? offset : offset + length;
-    offsetAtt.setOffset(finalOffset, finalOffset);
+    offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
   }
 
   /*
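The end() fix mirrors the others: finalOffset is measured in the post-CharFilter text, so when an upstream CharFilter changed the text's length (HTMLStripCharFilter removing markup, or MockCharFilter doubling characters), reporting it uncorrected could place the final offset beyond the original input — exactly what the new text.length() assertion in checkRandomData now catches.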