mirror of https://github.com/apache/lucene.git
SOLR-2891: fix CompoundWordTokenFilter to not create invalid offsets when the length of the text was changed by a previous filter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234546 13f79535-47bb-0310-9956-ffa450edef68
commit a7cfee6b07
parent f3a363708f
@@ -814,9 +814,10 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers. (Uwe Schindler)
 
-* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
-  where they would create invalid offsets in some situations, leading to problems
-  in highlighting. (Max Beutel via Robert Muir)
+* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
+  compound token filters, and smart chinese where they would create invalid
+  offsets in some situations, leading to problems in highlighting.
+  (Max Beutel, Edwin Steiner via Robert Muir)
 
 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0
@@ -154,13 +154,22 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
 
     /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
     public CompoundToken(int offset, int length) {
-      final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
       this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
-      // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
-      // chars from the term, offsets may not match correctly (other filters producing tokens
-      // may also have this problem):
-      this.startOffset = newStart;
-      this.endOffset = newStart + length;
+      
+      // offsets of the original word
+      int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
+      int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
+      
+      if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
+        // if length by start + end offsets doesn't match the term text then assume
+        // this is a synonym and don't adjust the offsets.
+        this.startOffset = startOff;
+        this.endOffset = endOff;
+      } else {
+        final int newStart = startOff + offset;
+        this.startOffset = newStart;
+        this.endOffset = newStart + length;
+      }
     }
 
   }
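As an aside, the rule introduced by the hunk above can be read in isolation: sub-token offsets are sliced out of the original token only while the term text still lines up with its start/end offsets; otherwise the original offsets are preserved. Below is a minimal standalone sketch of that rule in plain Java (no Lucene dependencies; the class, method, and field names here are made up for illustration and are not part of the patch):

    // Hypothetical helper mirroring the offset decision in CompoundToken above.
    final class CompoundOffsetRule {

      /** Simple holder for a computed [start, end) offset pair. */
      static final class Span {
        final int start, end;
        Span(int start, int end) { this.start = start; this.end = end; }
      }

      /**
       * termLength: length of the (possibly rewritten) term text
       * startOff/endOff: offsets of the whole original token in the input text
       * offset/length: position and length of the sub-token inside the term text
       */
      static Span subTokenOffsets(int termLength, int startOff, int endOff, int offset, int length) {
        if (endOff - startOff != termLength) {
          // The term text no longer matches the original character span (an earlier
          // filter lengthened or shortened it, or it is a synonym), so slicing by term
          // positions could point past the original text: keep the original offsets.
          return new Span(startOff, endOff);
        }
        // Term text still matches the original span, so slicing is safe.
        int newStart = startOff + offset;
        return new Span(newStart, newStart + length);
      }

      public static void main(String[] args) {
        // A 13-char term ("bankueberfall") over a 12-char original span -> keep 0..12 for "fall".
        Span s = subTokenOffsets(13, 0, 12, 9, 4);
        System.out.println(s.start + ".." + s.end); // prints 0..12
      }
    }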
@@ -18,14 +18,19 @@ package org.apache.lucene.analysis.compound;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -299,5 +304,35 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
       }
     }
   }
+  
+  // SOLR-2891
+  // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
+  // wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
+  // so in this case we behave like WDF, and preserve any modified offsets
+  public void testInvalidOffsets() throws Exception {
+    final CharArraySet dict = makeDictionary("fall");
+    final NormalizeCharMap normMap = new NormalizeCharMap();
+    normMap.add("ü", "ue");
+    
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new MappingCharFilter(normMap, CharReader.get(reader));
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "banküberfall",
+        new String[] { "bankueberfall", "fall" },
+        new int[] { 0, 0 },
+        new int[] { 12, 12 });
+  }
 
 }
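For reference, here is a rough standalone walk-through of the arithmetic this new test pins down, assuming the analyzer is wired exactly as in the hunk above. It is plain Java with no Lucene dependencies; the class name is made up for illustration:

    // Why the old offset arithmetic overflowed the original text for "banküberfall".
    public class BankueberfallOffsets {
      public static void main(String[] args) {
        String original = "banküberfall";  // what the user indexed, length 12
        String term     = "bankueberfall"; // after MappingCharFilter (ü -> ue), length 13

        int startOff = 0, endOff = original.length();                 // whole-token offsets: 0..12
        int offset = term.indexOf("fall"), length = "fall".length();  // 9 and 4 inside the term

        // Old behaviour: slice blindly -> 9..13, one past the end of the 12-char original text.
        System.out.println("old: " + (startOff + offset) + ".." + (startOff + offset + length));

        // New behaviour: term length (13) != offset span (12), so keep the original offsets.
        boolean lengthChanged = (endOff - startOff) != term.length();
        int newStart = lengthChanged ? startOff : startOff + offset;
        int newEnd   = lengthChanged ? endOff   : startOff + offset + length;
        System.out.println("new: " + newStart + ".." + newEnd);       // 0..12, matching the test
      }
    }

This is exactly the pair of offsets the test asserts for the "fall" sub-token: start 0 and end 12, the span of the original compound word.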