SOLR-2891: fix CompoundWordTokenFilter to not create invalid offsets when the length of the text was changed by a previous filter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234546 13f79535-47bb-0310-9956-ffa450edef68
Author: Robert Muir
Date:   2012-01-22 16:41:06 +00:00
Parent: f3a363708f
Commit: a7cfee6b07

3 changed files with 54 additions and 9 deletions
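
To make the bug concrete before the diffs: the CompoundToken constructor used to compute a sub-token's offsets as the parent token's startOffset plus the sub-token's position and length inside the term text. When an earlier stage (here a char filter mapping ü -> ue) makes the term longer than the original text span, that arithmetic points past the end of the original input. The following standalone sketch is editorial, not code from this commit; the class name and hard-coded positions are illustrative, using the banküberfall example from the new test.

public class OffsetArithmeticSketch {
  public static void main(String[] args) {
    // Original input text (what offsets must point into): "banküberfall" (12 chars).
    // After the char filter rewrites ü -> ue, the term text is "bankueberfall" (13 chars),
    // while the token's offsets still refer to the 12-char original: start=0, end=12.
    int startOffset = 0, endOffset = 12;
    String term = "bankueberfall";

    // Old behavior: the sub-token "fall" starts at position 9 of the term, length 4.
    int oldStart = startOffset + 9;   // 9
    int oldEnd = oldStart + 4;        // 13 -> past the end of the 12-char original text

    // New behavior: the term length (13) no longer matches endOffset - startOffset (12),
    // so the parent token's offsets are kept for the sub-token.
    boolean mismatch = (endOffset - startOffset) != term.length();
    int newStart = mismatch ? startOffset : oldStart;   // 0
    int newEnd = mismatch ? endOffset : oldEnd;          // 12

    System.out.println("old: [" + oldStart + "," + oldEnd + ")  new: [" + newStart + "," + newEnd + ")");
  }
}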

lucene/CHANGES.txt

@@ -814,9 +814,10 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers. (Uwe Schindler)
 
-* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
-  where they would create invalid offsets in some situations, leading to problems
-  in highlighting. (Max Beutel via Robert Muir)
+* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
+  compound token filters, and smart chinese where they would create invalid
+  offsets in some situations, leading to problems in highlighting.
+  (Max Beutel, Edwin Steiner via Robert Muir)
 
 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0

CompoundWordTokenFilterBase.java (org.apache.lucene.analysis.compound)

@@ -154,13 +154,22 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
 
     /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
     public CompoundToken(int offset, int length) {
-      final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
       this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
-      // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
-      // chars from the term, offsets may not match correctly (other filters producing tokens
-      // may also have this problem):
-      this.startOffset = newStart;
-      this.endOffset = newStart + length;
+
+      // offsets of the original word
+      int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
+      int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
+
+      if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
+        // if length by start + end offsets doesn't match the term text then assume
+        // this is a synonym and don't adjust the offsets.
+        this.startOffset = startOff;
+        this.endOffset = endOff;
+      } else {
+        final int newStart = startOff + offset;
+        this.startOffset = newStart;
+        this.endOffset = newStart + length;
+      }
     }
   }
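
The design choice here follows the comment in the new test: when the length implied by the start/end offsets no longer matches the term text, the filter assumes the term was rewritten by an earlier stage (or is an injected synonym) and, like WordDelimiterFilter, keeps the parent token's offsets for every sub-token, since there is no reliable way to map a position inside the rewritten term back to a position in the original input. Only when the lengths agree does it still derive newStart and newStart + length from the sub-token's slice, as before.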

TestCompoundWordTokenFilter.java (org.apache.lucene.analysis.compound)

@@ -18,14 +18,19 @@ package org.apache.lucene.analysis.compound;
  */
 
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -299,5 +304,35 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
       }
     }
   }
+
+  // SOLR-2891
+  // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
+  // wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
+  // so in this case we behave like WDF, and preserve any modified offsets
+  public void testInvalidOffsets() throws Exception {
+    final CharArraySet dict = makeDictionary("fall");
+    final NormalizeCharMap normMap = new NormalizeCharMap();
+    normMap.add("ü", "ue");
+
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new MappingCharFilter(normMap, CharReader.get(reader));
+      }
+    };
+    assertAnalyzesTo(analyzer, "banküberfall",
+        new String[] { "bankueberfall", "fall" },
+        new int[] { 0, 0 },
+        new int[] { 12, 12 });
+  }
 }
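
Reading off the expected values: MappingCharFilter turns the 12-character input "banküberfall" into "bankueberfall" (13 characters) before tokenization, but it also corrects offsets back into the original text, so the tokenizer emits the token with startOffset 0 and endOffset 12. DictionaryCompoundWordTokenFilter then finds the dictionary word "fall"; because endOffset - startOffset (12) differs from the term length (13), both the original token and the "fall" sub-token keep offsets 0 and 12, which is what the two int[] arrays passed to assertAnalyzesTo (expected start and end offsets, respectively) verify.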