LUCENE-3642: fix invalid offsets from CharTokenizer, [Edge]NGramFilters, SmartChinese, add sanity check to BaseTokenStreamTestCase

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1213329 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-12-12 17:28:09 +00:00
parent 3899e18ca3
commit 7dc025bdce
11 changed files with 329 additions and 6 deletions

View File

@ -731,6 +731,10 @@ Bug fixes
* LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
to clones/reopened readers. (Uwe Schindler)
* LUCENE-3642: Fixed bugs in CharTokenizer, the n-gram filters, and the Smart Chinese
analyzer, where they would create invalid offsets in some situations, leading to
problems in highlighting. (Max Beutel via Robert Muir)
Documentation
* LUCENE-3597: Fixed incorrect grouping documentation. (Martijn van Groningen, Robert Muir)

View File

@ -135,6 +135,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTrue("startOffset must be >= 0", offsetAtt.startOffset() >= 0);
assertTrue("endOffset must be >= 0", offsetAtt.endOffset() >= 0);
assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
if (finalOffset != null) {
assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
assertTrue("endOffset must be <= finalOffset", offsetAtt.endOffset() <= finalOffset.intValue());
}
}
if (posIncrAtt != null) {
assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);

View File

@ -71,6 +71,8 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
private int curTermLength;
private int curGramSize;
private int tokStart;
private int tokEnd; // only used if the length changed before this filter
private boolean hasIllegalOffsets; // true if the term's length was changed by a previous filter
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@ -126,6 +128,10 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
curTermLength = termAtt.length();
curGramSize = minGram;
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
// if the length implied by the start and end offsets doesn't match the term text's length,
// assume this is a synonym and don't adjust the offsets.
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
}
}
if (curGramSize <= maxGram) {
@ -135,7 +141,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
int end = start + curGramSize;
clearAttributes();
offsetAtt.setOffset(tokStart + start, tokStart + end);
if (hasIllegalOffsets) {
offsetAtt.setOffset(tokStart, tokEnd);
} else {
offsetAtt.setOffset(tokStart + start, tokStart + end);
}
termAtt.copyBuffer(curTermBuffer, start, curGramSize);
curGramSize++;
return true;
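To make the new guard concrete, a hedged sketch with illustrative values borrowed from the test added later in this commit: after ASCIIFoldingFilter turns æ into ae the term is 12 chars while the original offsets still span only 11, so the mismatch is detected and every gram keeps the original offsets instead of tokStart + start/end.
public class EdgeNGramOffsetNote {
  public static void main(String[] args) {
    int tokStart = 0, tokEnd = 11;                   // original offsets of "mosfellsbær"
    int curTermLength = "mosfellsbaer".length();     // 12 after folding æ -> ae
    boolean hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
    System.out.println(hasIllegalOffsets);           // true: each gram is emitted with offsets (0, 11)
  }
}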

View File

@ -38,6 +38,8 @@ public final class NGramTokenFilter extends TokenFilter {
private int curGramSize;
private int curPos;
private int tokStart;
private int tokEnd; // only used if the length changed before this filter
private boolean hasIllegalOffsets; // true if the term's length was changed by a previous filter
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@ -81,13 +83,21 @@ public final class NGramTokenFilter extends TokenFilter {
curGramSize = minGram;
curPos = 0;
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
// if the length implied by the start and end offsets doesn't match the term text's length,
// assume this is a synonym and don't adjust the offsets.
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
}
}
while (curGramSize <= maxGram) {
while (curPos+curGramSize <= curTermLength) { // while there is input
clearAttributes();
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
if (hasIllegalOffsets) {
offsetAtt.setOffset(tokStart, tokEnd);
} else {
offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
}
curPos++;
return true;
}
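A similar hedged sketch for the sliding-window case, again with illustrative values from the test below: without the guard, the last bigram of the folded 12-char term would get offsets (10, 12) even though the original input is only 11 chars long.
public class NGramOffsetNote {
  public static void main(String[] args) {
    String foldedTerm = "mosfellsbaer";              // 12 chars after folding æ -> ae
    int tokStart = 0, tokEnd = 11, gramSize = 2;     // original token spans offsets [0, 11)
    int lastPos = foldedTerm.length() - gramSize;    // 10
    int naiveEnd = tokStart + lastPos + gramSize;    // 12: past the 11-char input
    System.out.println("naive end offset " + naiveEnd + " vs finalOffset " + tokEnd);
  }
}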

View File

@ -144,6 +144,7 @@ public abstract class CharTokenizer extends Tokenizer {
clearAttributes();
int length = 0;
int start = -1; // this variable is always initialized
int end = -1;
char[] buffer = termAtt.buffer();
while (true) {
if (bufferIndex >= dataLen) {
@ -162,15 +163,18 @@ public abstract class CharTokenizer extends Tokenizer {
}
// use CharacterUtils here to support the < 3.1 UTF-16 code unit behavior even if the char-based methods are gone
final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
bufferIndex += Character.charCount(c);
final int charCount = Character.charCount(c);
bufferIndex += charCount;
if (isTokenChar(c)) { // if it's a token char
if (length == 0) { // start of token
assert start == -1;
start = offset + bufferIndex - 1;
start = offset + bufferIndex - charCount;
end = start;
} else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
}
end += charCount;
length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
if (length >= MAX_WORD_LEN) // buffer overflow! check with >= here: a surrogate pair could break an == test
break;
@ -180,7 +184,7 @@ public abstract class CharTokenizer extends Tokenizer {
termAtt.setLength(length);
assert start != -1;
offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start+length));
offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));
return true;
}
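The reason end is now tracked by the number of UTF-16 units actually consumed rather than derived as start + length: normalize() may change the number of units per code point (as in the BMP -> SMP mapping used by testCrossPlaneNormalization2 below), so the normalized term length no longer says how much input was read. A hedged, self-contained sketch with illustrative values:
public class CharTokenizerOffsetNote {
  public static void main(String[] args) {
    String original = "abc";                         // 3 UTF-16 units read from the input
    StringBuilder normalized = new StringBuilder();
    for (int i = 0; i < original.length(); i++) {
      normalized.appendCodePoint(0x1043C);           // each BMP char becomes a supplementary char
    }
    int start = 0;
    int wrongEnd = start + normalized.length();      // 6: points past the original text
    int correctEnd = start + original.length();      // 3: units actually consumed
    System.out.println(wrongEnd + " vs " + correctEnd + " for input of length " + original.length());
  }
}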

View File

@ -0,0 +1,123 @@
package org.apache.lucene.analysis.core;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
/**
* Compares MockTokenizer (which is simple with no optimizations) with equivalent
* core tokenizers (that have optimizations like buffering).
*
* Any tests here probably need to consider the Unicode version of the JRE (it could
* cause false failures).
*/
public class TestDuelingAnalyzers extends LuceneTestCase {
private CharacterRunAutomaton jvmLetter;
@Override
public void setUp() throws Exception {
super.setUp();
// build an automaton matching this jvm's letter definition
State initial = new State();
State accept = new State();
accept.setAccept(true);
for (int i = 0; i <= 0x10FFFF; i++) {
if (Character.isLetter(i)) {
initial.addTransition(new Transition(i, i, accept));
}
}
Automaton single = new Automaton(initial);
single.reduce();
Automaton repeat = BasicOperations.repeat(single);
jvmLetter = new CharacterRunAutomaton(repeat);
}
public void testLetterAscii() throws Exception {
Analyzer left = new MockAnalyzer(random, jvmLetter, false);
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
for (int i = 0; i < 10000; i++) {
String s = _TestUtil.randomSimpleString(random);
assertEquals(s, left.tokenStream("foo", new StringReader(s)),
right.tokenStream("foo", new StringReader(s)));
}
}
public void testLetterUnicode() throws Exception {
Analyzer left = new MockAnalyzer(random, jvmLetter, false);
Analyzer right = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
for (int i = 0; i < 10000; i++) {
String s = _TestUtil.randomUnicodeString(random);
assertEquals(s, left.tokenStream("foo", new StringReader(s)),
right.tokenStream("foo", new StringReader(s)));
}
}
// we only check a few core attributes here.
// TODO: test other things
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
left.reset();
right.reset();
CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);
while (left.incrementToken()) {
assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
}
assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
left.end();
right.end();
assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
left.close();
right.close();
}
}

View File

@ -17,11 +17,16 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import java.io.Reader;
import java.io.StringReader;
/**
@ -104,4 +109,24 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
tokenizer.reset(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
}
// LUCENE-3642
// EdgeNGramTokenFilter blindly adds the term length to the offset, which can go out of bounds
// with respect to the original text if a previous filter increases the length of the word (here æ -> ae),
// so in that case we behave like WordDelimiterFilter and preserve any modified offsets
public void testInvalidOffsets() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
filters = new EdgeNGramTokenFilter(filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
return new TokenStreamComponents(tokenizer, filters);
}
};
assertAnalyzesTo(analyzer, "mosfellsbær",
new String[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" },
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
}

View File

@ -17,11 +17,16 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import java.io.Reader;
import java.io.StringReader;
/**
@ -93,4 +98,24 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
tokenizer.reset(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
}
// LUCENE-3642
// NGramTokenFilter blindly adds the term length to the offset, which can go out of bounds
// with respect to the original text if a previous filter increases the length of the word (here æ -> ae),
// so in that case we behave like WordDelimiterFilter and preserve any modified offsets
public void testInvalidOffsets() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
filters = new NGramTokenFilter(filters, 2, 2);
return new TokenStreamComponents(tokenizer, filters);
}
};
assertAnalyzesTo(analyzer, "mosfellsbær",
new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
}

View File

@ -18,11 +18,17 @@ package org.apache.lucene.analysis.util;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util._TestUtil;
/**
@ -94,4 +100,80 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
}
// LUCENE-3642: normalize SMP->BMP and check that offsets are correct
public void testCrossPlaneNormalization() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) {
@Override
protected int normalize(int c) {
if (c > 0xffff) {
return 'δ';
} else {
return c;
}
}
};
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
int num = 10000 * RANDOM_MULTIPLIER;
for (int i = 0; i < num; i++) {
String s = _TestUtil.randomUnicodeString(random);
TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
ts.reset();
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
while (ts.incrementToken()) {
String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
cp = highlightedText.codePointAt(j);
assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
}
}
ts.end();
ts.close();
}
// just for fun
checkRandomData(random, analyzer, num);
}
// LUCENE-3642: normalize BMP->SMP and check that offsets are correct
public void testCrossPlaneNormalization2() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) {
@Override
protected int normalize(int c) {
if (c <= 0xffff) {
return 0x1043C;
} else {
return c;
}
}
};
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
int num = 10000 * RANDOM_MULTIPLIER;
for (int i = 0; i < num; i++) {
String s = _TestUtil.randomUnicodeString(random);
TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
ts.reset();
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
while (ts.incrementToken()) {
String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
cp = highlightedText.codePointAt(j);
assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
}
}
ts.end();
ts.close();
}
// just for fun
checkRandomData(random, analyzer, num);
}
}

View File

@ -43,6 +43,10 @@ public final class WordTokenFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private int tokStart; // only used if the length changed before this filter
private int tokEnd; // only used if the length changed before this filter
private boolean hasIllegalOffsets; // true if the term's length was changed by a previous filter
/**
* Construct a new WordTokenFilter.
@ -59,6 +63,11 @@ public final class WordTokenFilter extends TokenFilter {
if (tokenIter == null || !tokenIter.hasNext()) {
// there are no remaining tokens from the current sentence... are there more sentences?
if (input.incrementToken()) {
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
// if the length implied by the start and end offsets doesn't match the term text's length,
// assume this is a synonym and don't adjust the offsets.
hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
// a new sentence is available: process it.
tokenBuffer = wordSegmenter.segmentSentence(termAtt.toString(), offsetAtt.startOffset());
tokenIter = tokenBuffer.iterator();
@ -77,7 +86,11 @@ public final class WordTokenFilter extends TokenFilter {
// There are remaining tokens from the current sentence, return the next one.
SegToken nextWord = tokenIter.next();
termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
if (hasIllegalOffsets) {
offsetAtt.setOffset(tokStart, tokEnd);
} else {
offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
}
typeAtt.setType("word");
return true;
}
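The same pattern as in the n-gram filters, with one extra wrinkle: the segmenter presumably computes SegToken offsets from positions in the (possibly folded) term text, so they can also run past the original input. A hedged sketch with illustrative values matching testInvalidOffset below:
public class WordTokenFilterOffsetNote {
  public static void main(String[] args) {
    int tokStart = 0, tokEnd = 11;                   // original offsets of "mosfellsbær"
    int termLength = "mosfellsbaer".length();        // 12 after folding æ -> ae
    boolean hasIllegalOffsets = (tokStart + termLength) != tokEnd;
    // the segmented "word" would span (0, 12) in the folded text; clamp to the original offsets
    int start = hasIllegalOffsets ? tokStart : 0;
    int end = hasIllegalOffsets ? tokEnd : 12;
    System.out.println("(" + start + ", " + end + ")"); // (0, 11), as the test expects
  }
}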

View File

@ -17,11 +17,16 @@
package org.apache.lucene.analysis.cn.smart;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.util.Version;
public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
@ -196,6 +201,24 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
}
}
// LUCENE-3642
public void testInvalidOffset() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
filters = new WordTokenFilter(filters);
return new TokenStreamComponents(tokenizer, filters);
}
};
assertAnalyzesTo(analyzer, "mosfellsbær",
new String[] { "mosfellsbaer" },
new int[] { 0 },
new int[] { 11 });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);