mirror of https://github.com/apache/lucene.git

LUCENE-3717: add better offsets testing to BaseTokenStreamTestCase, fix offsets bugs in ThaiWordFilter and ICUTokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234652 13f79535-47bb-0310-9956-ffa450edef68

commit c754c1c9c8
parent f7a474d603

@@ -814,10 +814,10 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers. (Uwe Schindler)

-* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
-  compound token filters, and smart chinese where they would create invalid
-  offsets in some situations, leading to problems in highlighting.
-  (Max Beutel, Edwin Steiner via Robert Muir)
+* LUCENE-3642, SOLR-2891, LUCENE-3717: Fixed bugs in CharTokenizer, n-gram filters,
+  compound token filters, thai word filter, icutokenizer, and smart chinese
+  where they would create invalid offsets in some situations, leading to problems
+  in highlighting. (Max Beutel, Edwin Steiner via Robert Muir)

 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0

@@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */

+import java.io.Reader;
 import java.io.StringReader;
 import java.io.IOException;
 import java.util.ArrayList;

@@ -289,8 +290,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       }
     }
   };

   public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
+    checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
+  }
+
+  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
     for (int i = 0; i < iterations; i++) {
       String text;
       switch(_TestUtil.nextInt(random, 0, 4)) {

@@ -311,7 +316,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
       }

-      TokenStream ts = a.tokenStream("dummy", new StringReader(text));
+      int remainder = random.nextInt(10);
+      Reader reader = new StringReader(text);
+      TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
       assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
       CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
       OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;

@@ -339,30 +346,38 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       if (VERBOSE) {
         System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
       }
+      reader = new StringReader(text);
+      ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
       if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
         // offset + pos + type
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
           toIntArray(endOffsets),
           types.toArray(new String[types.size()]),
-          toIntArray(positions));
+          toIntArray(positions),
+          text.length());
       } else if (posIncAtt != null && offsetAtt != null) {
         // offset + pos
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
           toIntArray(endOffsets),
-          toIntArray(positions));
+          null,
+          toIntArray(positions),
+          text.length());
       } else if (offsetAtt != null) {
         // offset
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]),
           toIntArray(startOffsets),
-          toIntArray(endOffsets));
+          toIntArray(endOffsets),
+          null,
+          null,
+          text.length());
       } else {
         // terms only
-        assertAnalyzesToReuse(a, text,
+        assertTokenStreamContents(ts,
           tokens.toArray(new String[tokens.size()]));
       }
     }

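Note (not part of the commit): a rough sketch of how a test would drive the two overloads above. The test class name, MockAnalyzer, and the test framework's random field are assumptions for illustration, not shown in this diff.

package org.apache.lucene.analysis;

public class TestMyAnalyzerOffsets extends BaseTokenStreamTestCase {
  public void testRandomStrings() throws Exception {
    Analyzer a = new MockAnalyzer(random);
    // 4-arg overload: flips a coin once per call, so either every iteration wraps the
    // input in a MockCharFilter or none of them do.
    checkRandomData(random, a, 1000, 20);
    // 5-arg overload: force the char filter on, so offset math that skips
    // correctOffset() produces offsets outside the original text and fails the checks.
    checkRandomData(random, a, 1000, 20, true);
  }
}
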
@@ -0,0 +1,100 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+// the purpose of this charfilter is to send offsets out of bounds
+// if the analyzer doesn't use correctOffset or does incorrect offset math.
+class MockCharFilter extends CharStream {
+  final Reader in;
+  final int remainder;
+
+  // for testing only
+  public MockCharFilter(Reader in, int remainder) {
+    this.in = in;
+    this.remainder = remainder;
+    assert remainder >= 0 && remainder < 10 : "invalid parameter";
+  }
+
+  @Override
+  public void close() throws IOException {
+    in.close();
+  }
+
+  int currentOffset = -1;
+  int delta = 0;
+  int bufferedCh = -1;
+
+  @Override
+  public int read() throws IOException {
+    // we have a buffered character, add an offset correction and return it
+    if (bufferedCh >= 0) {
+      int ch = bufferedCh;
+      bufferedCh = -1;
+      currentOffset++;
+
+      addOffCorrectMap(currentOffset+delta, delta-1);
+      delta--;
+      return ch;
+    }
+
+    // otherwise actually read one
+    int ch = in.read();
+    if (ch < 0)
+      return ch;
+
+    currentOffset++;
+    if ((ch % 10) != remainder || Character.isHighSurrogate((char)ch) || Character.isLowSurrogate((char)ch)) {
+      return ch;
+    }
+
+    // we will double this character, so buffer it.
+    bufferedCh = ch;
+    return ch;
+  }
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    int numRead = 0;
+    for (int i = off; i < off + len; i++) {
+      int c = read();
+      if (c == -1) break;
+      cbuf[i] = (char) c;
+      numRead++;
+    }
+    return numRead == 0 ? -1 : numRead;
+  }
+
+  @Override
+  public int correctOffset(int currentOff) {
+    SortedMap<Integer,Integer> subMap = corrections.subMap(0, currentOff+1);
+    int ret = subMap.isEmpty() ? currentOff : currentOff + subMap.get(subMap.lastKey());
+    assert ret >= 0 : "currentOff=" + currentOff + ",diff=" + (ret-currentOff);
+    return ret;
+  }
+
+  protected void addOffCorrectMap(int off, int cumulativeDiff) {
+    corrections.put(off, cumulativeDiff);
+  }
+
+  TreeMap<Integer,Integer> corrections = new TreeMap<Integer,Integer>();
+}

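Note (not part of the commit): a minimal driver showing how the correction map above is meant to be read. The demo class is hypothetical and is placed in the same package because MockCharFilter is package-private.

package org.apache.lucene.analysis;

import java.io.StringReader;

public class MockCharFilterDemo {
  public static void main(String[] args) throws Exception {
    // remainder = 7: any char whose code point satisfies ch % 10 == 7 is doubled, e.g. 'a' (97).
    MockCharFilter filter = new MockCharFilter(new StringReader("ab"), 7);
    char[] buf = new char[16];
    int n = filter.read(buf, 0, buf.length);
    System.out.println(new String(buf, 0, n));   // "aab" - the 'a' was doubled
    // Offset 3 is the end of "aab" in the filtered text; correctOffset maps it back to
    // offset 2, the end of the original "ab". A tokenizer that skips correctOffset would
    // report 3, which is past the end of the original input.
    System.out.println(filter.correctOffset(3)); // 2
    filter.close();
  }
}
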
@@ -0,0 +1,58 @@
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestMockCharFilter extends BaseTokenStreamTestCase {
+
+  public void test() throws IOException {
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new MockCharFilter(CharReader.get(reader), 7);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "ab",
+      new String[] { "aab" },
+      new int[] { 0 },
+      new int[] { 2 }
+    );
+
+    assertAnalyzesTo(analyzer, "aba",
+      new String[] { "aabaa" },
+      new int[] { 0 },
+      new int[] { 3 }
+    );
+
+    assertAnalyzesTo(analyzer, "abcdefga",
+      new String[] { "aabcdefgaa" },
+      new int[] { 0 },
+      new int[] { 8 }
+    );
+  }
+}

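Note (not part of the commit): the expected tokens above follow directly from the remainder-7 rule: only 'a' (code point 97) in those inputs satisfies ch % 10 == 7 and gets doubled, while the corrected offsets still span the original input. A hypothetical extra case in the same style, where every character is doubled:

    assertAnalyzesTo(analyzer, "kuku",
      new String[] { "kkuukkuu" },   // 'k' (107) and 'u' (117) both end in 7 mod 10
      new int[] { 0 },
      new int[] { 4 }                // end offset is corrected back to the original length
    );
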
@@ -68,6 +68,7 @@ public final class ThaiWordFilter extends TokenFilter {
   private CharTermAttribute clonedTermAtt = null;
   private OffsetAttribute clonedOffsetAtt = null;
   private boolean hasMoreTokensInClone = false;
+  private boolean hasIllegalOffsets = false; // only if the length changed before this filter

   /** Creates a new ThaiWordFilter with the specified match version. */
   public ThaiWordFilter(Version matchVersion, TokenStream input) {

@@ -86,7 +87,11 @@ public final class ThaiWordFilter extends TokenFilter {
       if (end != BreakIterator.DONE) {
         clonedToken.copyTo(this);
         termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
-        offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+        if (hasIllegalOffsets) {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+        } else {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+        }
         if (handlePosIncr) posAtt.setPositionIncrement(1);
         return true;
       }

@@ -102,6 +107,10 @@ public final class ThaiWordFilter extends TokenFilter {
     }

     hasMoreTokensInClone = true;
+
+    // if length by start + end offsets doesn't match the term text then assume
+    // this is a synonym and don't adjust the offsets.
+    hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();

     // we lazy init the cloned token, as in ctor not all attributes may be added
     if (clonedToken == null) {

@@ -118,7 +127,11 @@ public final class ThaiWordFilter extends TokenFilter {
     int end = breaker.next();
     if (end != BreakIterator.DONE) {
       termAtt.setLength(end);
-      offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+      if (hasIllegalOffsets) {
+        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+      } else {
+        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+      }
       // position increment keeps as it is for first token
       return true;
     }

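Note (not part of the commit): the hasIllegalOffsets guard above exists because an upstream filter may have changed the term text without changing the offsets (for example by injecting a synonym), in which case adding break-iterator positions to the start offset can point past the token's real span in the original input. A small standalone illustration with made-up values:

// Term text no longer matches the span of the original input it came from.
int startOffset = 10, endOffset = 15;        // token covers 5 chars of the original text
String term = "longersynonym";               // 13 chars, so the lengths disagree
boolean hasIllegalOffsets = (endOffset - startOffset) != term.length();   // true
// When true, ThaiWordFilter keeps the original [10, 15) span for every sub-word
// instead of computing startOffset + breakPosition, which could exceed 15.
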
@@ -503,7 +503,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {

       @Override
       protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+        return new HTMLStripCharFilter(CharReader.get(reader));
       }
     };

@@ -160,7 +160,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
         hasSentence = false;
         clearAttributes();
         termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
-        offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
+        offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
         return true;
       } else {
         return false;

@@ -215,7 +215,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {

       clearAttributes();
       termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
-      offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
+      offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
       posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
       posBoost = 0;
       return true;

@@ -111,7 +111,7 @@ public final class ICUTokenizer extends Tokenizer {
   @Override
   public void end() throws IOException {
     final int finalOffset = (length < 0) ? offset : offset + length;
-    offsetAtt.setOffset(finalOffset, finalOffset);
+    offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
   }

   /*
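Note (not part of the commit): the last three hunks apply the same rule - a Tokenizer that may sit on top of a CharFilter must route every offset it reports, including the final offset from end(), through correctOffset(). A generic sketch of that pattern, where charsConsumed is a hypothetical field standing in for however many characters the tokenizer read:

@Override
public void end() throws IOException {
  // map the offset in the filtered character stream back into the original input
  final int finalOffset = correctOffset(charsConsumed);
  offsetAtt.setOffset(finalOffset, finalOffset);
}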