SOLR-2891: fix CompoundWordTokenFilter to not create invalid offsets when the length of the text was changed by a previous filter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234546 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-01-22 16:41:06 +00:00
parent f3a363708f
commit a7cfee6b07
3 changed files with 54 additions and 9 deletions
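
For readers who want to see the failure mode concretely: when a char filter rewrites ü to ue, the term text grows to 13 characters while the start/end offsets still describe the 12-character original input, so adding term-relative positions to the start offset can point past the end of the text. The plain-Java sketch below is not part of the commit; the class name and variables are hypothetical, and it only reproduces that arithmetic.

// Illustrative sketch only: reproduces the offset arithmetic behind SOLR-2891
// with plain Java; no Lucene classes, and the names here are hypothetical.
public class InvalidOffsetDemo {
  public static void main(String[] args) {
    String original   = "banküberfall";  // 12 chars, as indexed
    String normalized = "bankueberfall"; // 13 chars after a ü -> ue char filter

    int wordStart = 0;                 // offsets always refer to the original text
    int wordEnd   = original.length(); // 12

    // the dictionary finds the subword "fall" at position 9 of the *normalized* term
    int sliceOffset = normalized.indexOf("fall"); // 9
    int sliceLength = "fall".length();            // 4

    // pre-fix calculation: term-relative position added to the word's start offset
    int badStart = wordStart + sliceOffset; // 9
    int badEnd   = badStart + sliceLength;  // 13, past the end of the 12-char original

    System.out.println("subword offsets " + badStart + ".." + badEnd
        + " vs. original length " + original.length());
  }
}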


@@ -814,9 +814,10 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers. (Uwe Schindler)
 
-* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
-  where they would create invalid offsets in some situations, leading to problems
-  in highlighting. (Max Beutel via Robert Muir)
+* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
+  compound token filters, and smart chinese where they would create invalid
+  offsets in some situations, leading to problems in highlighting.
+  (Max Beutel, Edwin Steiner via Robert Muir)
 
 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0


@@ -154,14 +154,23 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
     /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
     public CompoundToken(int offset, int length) {
-      final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
       this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
-      // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
-      // chars from the term, offsets may not match correctly (other filters producing tokens
-      // may also have this problem):
-      this.startOffset = newStart;
-      this.endOffset = newStart + length;
+
+      // offsets of the original word
+      int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
+      int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
+
+      if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
+        // if length by start + end offsets doesn't match the term text then assume
+        // this is a synonym and don't adjust the offsets.
+        this.startOffset = startOff;
+        this.endOffset = endOff;
+      } else {
+        final int newStart = startOff + offset;
+        this.startOffset = newStart;
+        this.endOffset = newStart + length;
+      }
     }
   }
 }
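
Read on its own, the new constructor encodes a simple rule: if the span described by the word's offsets no longer matches the term's length, something upstream changed the text, so the subtoken keeps the whole word's offsets; otherwise the slice position is mapped onto the original text. The helper below is a rough standalone paraphrase of that rule for illustration only; the class, method name, and signature are mine, not part of the Lucene API.

// Standalone paraphrase of the offset rule above; illustrative only,
// not part of the Lucene API.
public final class CompoundOffsets {
  private CompoundOffsets() {}

  /** Returns {startOffset, endOffset} for a subword slice of a compound term. */
  public static int[] sliceOffsets(int wordStart, int wordEnd, int termLength,
                                   int sliceOffset, int sliceLength) {
    if (wordEnd - wordStart != termLength) {
      // the term text no longer lines up with the original characters
      // (e.g. a char filter lengthened or shortened it): keep the word's offsets
      return new int[] { wordStart, wordEnd };
    }
    // term and original text have the same length, so term-relative positions
    // can be mapped directly back onto the original text
    int start = wordStart + sliceOffset;
    return new int[] { start, start + sliceLength };
  }
}

With the test input further down, sliceOffsets(0, 12, 13, 9, 4) returns {0, 12}, which is what the new assertions in TestCompoundWordTokenFilter expect.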


@@ -18,14 +18,19 @@ package org.apache.lucene.analysis.compound;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -300,4 +305,34 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
     }
   }
+
+  // SOLR-2891
+  // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
+  // wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
+  // so in this case we behave like WDF, and preserve any modified offsets
+  public void testInvalidOffsets() throws Exception {
+    final CharArraySet dict = makeDictionary("fall");
+    final NormalizeCharMap normMap = new NormalizeCharMap();
+    normMap.add("ü", "ue");
+
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new MappingCharFilter(normMap, CharReader.get(reader));
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "banküberfall",
+        new String[] { "bankueberfall", "fall" },
+        new int[] { 0, 0 },
+        new int[] { 12, 12 });
+  }
 }
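
As a closing illustration of why the old behavior "leads to problems in highlighting": a highlighter slices the original field text with each token's offsets, and the pre-fix offsets describe a slice that does not exist. The snippet below is only a demonstration of that consequence, not code from the commit.

// Demonstration only (not from the commit): what a consumer of offsets sees.
public class HighlightOffsetDemo {
  public static void main(String[] args) {
    String original = "banküberfall"; // 12 chars of original field text

    // with the fix, both "bankueberfall" and "fall" report offsets 0..12:
    System.out.println(original.substring(0, 12)); // valid slice of the original

    // without the fix, the subword "fall" reported offsets 9..13:
    try {
      System.out.println(original.substring(9, 13));
    } catch (StringIndexOutOfBoundsException e) {
      System.out.println("offset 13 is past the end of the original text");
    }
  }
}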