mirror of https://github.com/apache/lucene.git
LUCENE-3642: fix invalid offsets from CharTokenizer, [Edge]NGramFilters, SmartChinese, add sanity check to BaseTokenStreamTestCase
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1213329 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in: parent 3899e18ca3, commit 7dc025bdce
@@ -731,6 +731,10 @@ Bug fixes
* LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
  to clones/reopened readers. (Uwe Schindler)

* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
  where they would create invalid offsets in some situations, leading to problems
  in highlighting. (Max Beutel via Robert Muir)

Documentation

* LUCENE-3597: Fixed incorrect grouping documentation. (Martijn van Groningen, Robert Muir)
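The entry above is the user-visible symptom: offsets that no longer fit the original text break any consumer that slices the text by them. A minimal sketch (not part of the commit; the class name and values are illustrative) of how a highlighter-style consumer fails on such offsets:

    public class BrokenOffsetDemo {
      public static void main(String[] args) {
        // The original input is 11 chars; after ASCII folding the term "mosfellsbaer" is 12 chars.
        String original = "mosfellsbær";
        int startOffset = 0;
        int endOffset = 12; // bogus: derived from the folded term length, not from the input
        // A highlighter maps token offsets back onto the original text, so an
        // out-of-range endOffset fails immediately.
        System.out.println(original.substring(startOffset, endOffset)); // StringIndexOutOfBoundsException
      }
    }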
@@ -135,6 +135,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
        assertTrue("startOffset must be >= 0", offsetAtt.startOffset() >= 0);
        assertTrue("endOffset must be >= 0", offsetAtt.endOffset() >= 0);
        assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
        if (finalOffset != null) {
          assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
          assertTrue("endOffset must be <= finalOffset", offsetAtt.endOffset() <= finalOffset.intValue());
        }
      }
      if (posIncrAtt != null) {
        assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
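Taken together, the checks above pin down one invariant per token. A standalone sketch of that predicate (an assumed helper for illustration, not part of BaseTokenStreamTestCase):

    public class OffsetInvariant {
      // 0 <= startOffset <= endOffset, and both within finalOffset when it is known.
      static boolean valid(int startOffset, int endOffset, Integer finalOffset) {
        return startOffset >= 0
            && endOffset >= startOffset
            && (finalOffset == null || endOffset <= finalOffset);
      }
    }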
@@ -71,6 +71,8 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
  private int curTermLength;
  private int curGramSize;
  private int tokStart;
  private int tokEnd; // only used if the length changed before this filter
  private boolean hasIllegalOffsets; // only if the length changed before this filter

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -126,6 +128,10 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
          curTermLength = termAtt.length();
          curGramSize = minGram;
          tokStart = offsetAtt.startOffset();
          tokEnd = offsetAtt.endOffset();
          // if length by start + end offsets doesn't match the term text then assume
          // this is a synonym and don't adjust the offsets.
          hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
        }
      }
      if (curGramSize <= maxGram) {
@@ -135,7 +141,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
        int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
        int end = start + curGramSize;
        clearAttributes();
        offsetAtt.setOffset(tokStart + start, tokStart + end);
        if (hasIllegalOffsets) {
          offsetAtt.setOffset(tokStart, tokEnd);
        } else {
          offsetAtt.setOffset(tokStart + start, tokStart + end);
        }
        termAtt.copyBuffer(curTermBuffer, start, curGramSize);
        curGramSize++;
        return true;
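The if/else above is the heart of the fix: per-gram offsets are only meaningful while the term text still lines up with the input. A sketch of that decision in isolation (assumed helper names, not the filter's API):

    public class GramOffsets {
      // When an upstream filter changed the term length, tokStart + gram positions no
      // longer point into the original text, so the filter keeps the token's own offsets.
      static int[] choose(int tokStart, int tokEnd, boolean hasIllegalOffsets,
                          int gramStart, int gramEnd) {
        if (hasIllegalOffsets) {
          return new int[] { tokStart, tokEnd };                       // preserve unmodified offsets
        }
        return new int[] { tokStart + gramStart, tokStart + gramEnd }; // slice per gram
      }
    }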
@@ -38,6 +38,8 @@ public final class NGramTokenFilter extends TokenFilter {
  private int curGramSize;
  private int curPos;
  private int tokStart;
  private int tokEnd; // only used if the length changed before this filter
  private boolean hasIllegalOffsets; // only if the length changed before this filter

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -81,13 +83,21 @@ public final class NGramTokenFilter extends TokenFilter {
        curGramSize = minGram;
        curPos = 0;
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        // if length by start + end offsets doesn't match the term text then assume
        // this is a synonym and don't adjust the offsets.
        hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
      }
    }
    while (curGramSize <= maxGram) {
      while (curPos+curGramSize <= curTermLength) {     // while there is input
        clearAttributes();
        termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
        offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
        if (hasIllegalOffsets) {
          offsetAtt.setOffset(tokStart, tokEnd);
        } else {
          offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
        }
        curPos++;
        return true;
      }
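For the normal (legal-offset) branch, the arithmetic is simply tokStart plus the gram's position within the term. A small self-contained sketch (illustrative only, not Lucene code) of what that produces for bigrams of "abcde":

    public class BigramOffsetDemo {
      public static void main(String[] args) {
        String term = "abcde";
        int tokStart = 0;   // startOffset of the original token
        int gramSize = 2;
        for (int pos = 0; pos + gramSize <= term.length(); pos++) {
          String gram = term.substring(pos, pos + gramSize);
          // prints ab[0,2], bc[1,3], cd[2,4], de[3,5]
          System.out.println(gram + "[" + (tokStart + pos) + "," + (tokStart + pos + gramSize) + "]");
        }
      }
    }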
@@ -144,6 +144,7 @@ public abstract class CharTokenizer extends Tokenizer {
    clearAttributes();
    int length = 0;
    int start = -1; // this variable is always initialized
    int end = -1;
    char[] buffer = termAtt.buffer();
    while (true) {
      if (bufferIndex >= dataLen) {
@@ -162,15 +163,18 @@ public abstract class CharTokenizer extends Tokenizer {
      }
      // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
      final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
      bufferIndex += Character.charCount(c);
      final int charCount = Character.charCount(c);
      bufferIndex += charCount;

      if (isTokenChar(c)) {               // if it's a token char
        if (length == 0) {                // start of token
          assert start == -1;
          start = offset + bufferIndex - 1;
          start = offset + bufferIndex - charCount;
          end = start;
        } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
          buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
        }
        end += charCount;
        length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
        if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
          break;
@@ -180,7 +184,7 @@ public abstract class CharTokenizer extends Tokenizer {

    termAtt.setLength(length);
    assert start != -1;
    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start+length));
    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));
    return true;

  }
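The CharTokenizer change replaces two length-based assumptions with explicit code-unit counts: the token start is derived from the consumed code point's charCount rather than a fixed 1, and the end offset is tracked separately from the (possibly normalized) term length. A short JDK-only sketch of why the distinction matters for supplementary characters (example code point chosen for illustration):

    public class SupplementaryDemo {
      public static void main(String[] args) {
        int cp = 0x10400; // DESERET CAPITAL LETTER LONG I, a letter outside the BMP
        String text = new StringBuilder().appendCodePoint(cp).toString();
        // One code point, but two UTF-16 code units in the underlying char buffer:
        System.out.println(Character.charCount(cp)); // 2
        System.out.println(text.length());           // 2
        // If normalize() maps cp to a BMP char, the term buffer holds 1 char while the
        // token still spans 2 chars of input, so end offsets must be counted in code
        // units of the input, not in chars of the normalized term.
      }
    }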
@@ -0,0 +1,123 @@
package org.apache.lucene.analysis.core;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;

/**
 * Compares MockTokenizer (which is simple with no optimizations) with equivalent
 * core tokenizers (that have optimizations like buffering).
 *
 * Any tests here need to probably consider unicode version of the JRE (it could
 * cause false fails).
 */
public class TestDuelingAnalyzers extends LuceneTestCase {
  private CharacterRunAutomaton jvmLetter;

  public void setUp() throws Exception {
    super.setUp();
    // build an automaton matching this jvm's letter definition
    State initial = new State();
    State accept = new State();
    accept.setAccept(true);
    for (int i = 0; i <= 0x10FFFF; i++) {
      if (Character.isLetter(i)) {
        initial.addTransition(new Transition(i, i, accept));
      }
    }
    Automaton single = new Automaton(initial);
    single.reduce();
    Automaton repeat = BasicOperations.repeat(single);
    jvmLetter = new CharacterRunAutomaton(repeat);
  }

  public void testLetterAscii() throws Exception {
    Analyzer left = new MockAnalyzer(random, jvmLetter, false);
    Analyzer right = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    for (int i = 0; i < 10000; i++) {
      String s = _TestUtil.randomSimpleString(random);
      assertEquals(s, left.tokenStream("foo", new StringReader(s)),
                   right.tokenStream("foo", new StringReader(s)));
    }
  }

  public void testLetterUnicode() throws Exception {
    Analyzer left = new MockAnalyzer(random, jvmLetter, false);
    Analyzer right = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    for (int i = 0; i < 10000; i++) {
      String s = _TestUtil.randomUnicodeString(random);
      assertEquals(s, left.tokenStream("foo", new StringReader(s)),
                   right.tokenStream("foo", new StringReader(s)));
    }
  }

  // we only check a few core attributes here.
  // TODO: test other things
  public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
    left.reset();
    right.reset();
    CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
    CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
    OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
    OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
    PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);

    while (left.incrementToken()) {
      assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
      assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
      assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
      assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
      assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
    };
    assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
    left.end();
    right.end();
    assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
    left.close();
    right.close();
  }
}
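The dueling test hinges on both analyzers agreeing on the JVM's own notion of a letter, which the automaton built in setUp() encodes code point by code point. As a point of reference, here is a naive tokenizer written directly against Character.isLetter (an illustrative sketch, not part of the commit) expressing the behavior both sides are expected to share:

    import java.util.ArrayList;
    import java.util.List;

    public class NaiveLetterTokenizer {
      // Splits text into maximal runs of code points accepted by Character.isLetter,
      // iterating by code point so supplementary characters are handled correctly.
      public static List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<String>();
        StringBuilder current = new StringBuilder();
        for (int i = 0; i < text.length(); ) {
          int cp = text.codePointAt(i);
          if (Character.isLetter(cp)) {
            current.appendCodePoint(cp);
          } else if (current.length() > 0) {
            tokens.add(current.toString());
            current.setLength(0);
          }
          i += Character.charCount(cp);
        }
        if (current.length() > 0) {
          tokens.add(current.toString());
        }
        return tokens;
      }
    }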
@@ -17,11 +17,16 @@ package org.apache.lucene.analysis.ngram;
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;

import java.io.Reader;
import java.io.StringReader;

/**
@@ -104,4 +109,24 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
    tokenizer.reset(new StringReader("abcde"));
    assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
  }

  // LUCENE-3642
  // EdgeNgram blindly adds term length to offset, but this can take things out of bounds
  // wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
  // so in this case we behave like WDF, and preserve any modified offsets
  public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
        filters = new EdgeNGramTokenFilter(filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
        return new TokenStreamComponents(tokenizer, filters);
      }
    };
    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" },
        new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
  }
}
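The expected offsets in the test above all collapse to (0, 11) because the length check trips: ASCIIFoldingFilter turns the 11-character input into a 12-character term. A self-contained sketch (illustrative values only) of that check with these concrete numbers:

    public class IllegalOffsetCheckDemo {
      public static void main(String[] args) {
        int tokStart = 0;                             // offsets still describe "mosfellsbær"
        int tokEnd = 11;
        int curTermLength = "mosfellsbaer".length();  // 12 after æ -> ae folding
        boolean hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
        // true: every gram therefore reports the unmodified (0, 11) offsets instead of
        // tokStart + gram positions, which would run past the original text.
        System.out.println(hasIllegalOffsets);
      }
    }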
@@ -17,11 +17,16 @@ package org.apache.lucene.analysis.ngram;
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;

import java.io.Reader;
import java.io.StringReader;

/**
@@ -93,4 +98,24 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
    tokenizer.reset(new StringReader("abcde"));
    assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
  }

  // LUCENE-3642
  // EdgeNgram blindly adds term length to offset, but this can take things out of bounds
  // wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
  // so in this case we behave like WDF, and preserve any modified offsets
  public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
        filters = new NGramTokenFilter(filters, 2, 2);
        return new TokenStreamComponents(tokenizer, filters);
      }
    };
    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
        new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
  }
}
@@ -18,11 +18,17 @@ package org.apache.lucene.analysis.util;
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util._TestUtil;


/**
@@ -94,4 +100,80 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
  }

  // LUCENE-3642: normalize SMP->BMP and check that offsets are correct
  public void testCrossPlaneNormalization() throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) {
          @Override
          protected int normalize(int c) {
            if (c > 0xffff) {
              return 'δ';
            } else {
              return c;
            }
          }
        };
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    int num = 10000 * RANDOM_MULTIPLIER;
    for (int i = 0; i < num; i++) {
      String s = _TestUtil.randomUnicodeString(random);
      TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
      ts.reset();
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      while (ts.incrementToken()) {
        String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
        for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
          cp = highlightedText.codePointAt(j);
          assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
        }
      }
      ts.end();
      ts.close();
    }
    // just for fun
    checkRandomData(random, analyzer, num);
  }

  // LUCENE-3642: normalize BMP->SMP and check that offsets are correct
  public void testCrossPlaneNormalization2() throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) {
          @Override
          protected int normalize(int c) {
            if (c <= 0xffff) {
              return 0x1043C;
            } else {
              return c;
            }
          }
        };
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    int num = 10000 * RANDOM_MULTIPLIER;
    for (int i = 0; i < num; i++) {
      String s = _TestUtil.randomUnicodeString(random);
      TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
      ts.reset();
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      while (ts.incrementToken()) {
        String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
        for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
          cp = highlightedText.codePointAt(j);
          assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
        }
      }
      ts.end();
      ts.close();
    }
    // just for fun
    checkRandomData(random, analyzer, num);
  }
}
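Both tests rely on the fact that normalization can change how many UTF-16 code units a token occupies, while offsets must keep describing the original text. A quick JDK-only illustration of that mismatch (values chosen to mirror the tests):

    public class CrossPlaneDemo {
      public static void main(String[] args) {
        int smp = 0x10400;  // a supplementary letter: 1 code point, 2 code units
        int bmp = 'δ';      // a BMP letter: 1 code point, 1 code unit
        System.out.println(Character.charCount(smp)); // 2
        System.out.println(Character.charCount(bmp)); // 1
        // Normalizing smp -> bmp shrinks the term buffer by one char (and bmp -> smp
        // grows it), but the token still covers the same slice of input text, so the
        // substring taken from the reported offsets must remain letters-only.
      }
    }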
@@ -44,6 +44,10 @@ public final class WordTokenFilter extends TokenFilter {
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  private int tokStart; // only used if the length changed before this filter
  private int tokEnd; // only used if the length changed before this filter
  private boolean hasIllegalOffsets; // only if the length changed before this filter

  /**
   * Construct a new WordTokenizer.
   *
@@ -59,6 +63,11 @@ public final class WordTokenFilter extends TokenFilter {
    if (tokenIter == null || !tokenIter.hasNext()) {
      // there are no remaining tokens from the current sentence... are there more sentences?
      if (input.incrementToken()) {
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        // if length by start + end offsets doesn't match the term text then assume
        // this is a synonym and don't adjust the offsets.
        hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
        // a new sentence is available: process it.
        tokenBuffer = wordSegmenter.segmentSentence(termAtt.toString(), offsetAtt.startOffset());
        tokenIter = tokenBuffer.iterator();
@@ -77,7 +86,11 @@ public final class WordTokenFilter extends TokenFilter {
      // There are remaining tokens from the current sentence, return the next one.
      SegToken nextWord = tokenIter.next();
      termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
      offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
      if (hasIllegalOffsets) {
        offsetAtt.setOffset(tokStart, tokEnd);
      } else {
        offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
      }
      typeAtt.setType("word");
      return true;
    }
@@ -17,11 +17,16 @@

package org.apache.lucene.analysis.cn.smart;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.util.Version;

public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
@@ -196,6 +201,24 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
    }
  }

  // LUCENE-3642
  public void testInvalidOffset() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
        filters = new WordTokenFilter(filters);
        return new TokenStreamComponents(tokenizer, filters);
      }
    };

    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mosfellsbaer" },
        new int[] { 0 },
        new int[] { 11 });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);