mirror of https://github.com/apache/lucene.git
SOLR-2891: fix CompoundWordTokenFilter to not create invalid offsets when the length of the text was changed by a previous filter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234546 13f79535-47bb-0310-9956-ffa450edef68
parent f3a363708f
commit a7cfee6b07
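For context, here is a minimal standalone sketch (not part of the commit) of the arithmetic behind the bug: a char filter that rewrites "ü" to "ue" makes the term text one character longer than the original input, so the old code's startOffset + slice offset + slice length can point past the end of the original text. The class and variable names are made up for the illustration.

// Plain-Java illustration of the broken offset arithmetic fixed by this commit.
public class InvalidOffsetSketch {
  public static void main(String[] args) {
    String original = "banküberfall";       // 12 chars; the token's offsets are [0, 12]
    String term     = "bankueberfall";      // 13 chars after a MappingCharFilter ("ü" -> "ue")

    int sliceOffset = term.indexOf("fall"); // 9: where the dictionary word starts in the term
    int sliceLength = "fall".length();      // 4

    // old behavior: endOffset = startOffset + sliceOffset + sliceLength
    int endOffset = 0 + sliceOffset + sliceLength;
    System.out.println(endOffset + " > " + original.length()); // 13 > 12: past the original text,
                                                                // which breaks highlighting
  }
}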
@@ -814,9 +814,10 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers. (Uwe Schindler)
 
-* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
-  where they would create invalid offsets in some situations, leading to problems
-  in highlighting. (Max Beutel via Robert Muir)
+* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
+  compound token filters, and smart chinese where they would create invalid
+  offsets in some situations, leading to problems in highlighting.
+  (Max Beutel, Edwin Steiner via Robert Muir)
 
 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0
@@ -154,14 +154,23 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
     
     /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
     public CompoundToken(int offset, int length) {
-      final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
       this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
-      // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
-      // chars from the term, offsets may not match correctly (other filters producing tokens
-      // may also have this problem):
-      this.startOffset = newStart;
-      this.endOffset = newStart + length;
+      
+      // offsets of the original word
+      int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
+      int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
+      
+      if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
+        // if length by start + end offsets doesn't match the term text then assume
+        // this is a synonym and don't adjust the offsets.
+        this.startOffset = startOff;
+        this.endOffset = endOff;
+      } else {
+        final int newStart = startOff + offset;
+        this.startOffset = newStart;
+        this.endOffset = newStart + length;
+      }
     }
     
   }
 }
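To make the new rule concrete, here is a small standalone sketch (mine, not from the commit) of the decision the patched CompoundToken constructor makes, written as a plain function over ints so it runs without Lucene. The second sample mirrors the test added below; the first uses a hypothetical compound ("bankfall") whose text no char filter changed.

import java.util.Arrays;

// Mirrors the guarded offset logic of the fix: if the term text no longer matches the
// original start/end offsets, keep the token's offsets; otherwise offset the slice.
public class CompoundTokenOffsetRule {

  static int[] subtokenOffsets(int startOff, int endOff, int termLength,
                               int sliceOffset, int sliceLength) {
    if (endOff - startOff != termLength) {
      // a char filter (or synonym) changed the text length: preserve the original offsets
      return new int[] { startOff, endOff };
    }
    int newStart = startOff + sliceOffset;
    return new int[] { newStart, newStart + sliceLength };
  }

  public static void main(String[] args) {
    // hypothetical "bankfall" (8 chars) with offsets [0, 8]: the "fall" slice maps cleanly to [4, 8]
    System.out.println(Arrays.toString(subtokenOffsets(0, 8, 8, 4, 4)));
    // "banküberfall" [0, 12] rewritten to "bankueberfall" (13 chars): offsets are preserved as [0, 12]
    System.out.println(Arrays.toString(subtokenOffsets(0, 12, 13, 9, 4)));
  }
}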
@@ -18,14 +18,19 @@ package org.apache.lucene.analysis.compound;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -300,4 +305,34 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
     }
   }
   
+  // SOLR-2891
+  // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
+  // wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
+  // so in this case we behave like WDF, and preserve any modified offsets
+  public void testInvalidOffsets() throws Exception {
+    final CharArraySet dict = makeDictionary("fall");
+    final NormalizeCharMap normMap = new NormalizeCharMap();
+    normMap.add("ü", "ue");
+    
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new MappingCharFilter(normMap, CharReader.get(reader));
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "banküberfall",
+      new String[] { "bankueberfall", "fall" },
+      new int[] { 0, 0 },
+      new int[] { 12, 12 });
+  }
+
 }
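Below is a rough end-to-end sketch (not part of the commit) of the same chain the test builds, consuming the stream by hand instead of via assertAnalyzesTo. It assumes the trunk-era APIs visible in the diff above (CharReader, the no-argument NormalizeCharMap constructor) plus a Version.LUCENE_40 constant for that trunk; later Lucene releases changed several of these signatures, so treat it as illustrative only.

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class CompoundOffsetsDemo {
  public static void main(String[] args) throws Exception {
    // "ü" -> "ue" lengthens the text the tokenizer sees: "banküberfall" -> "bankueberfall"
    NormalizeCharMap normMap = new NormalizeCharMap();
    normMap.add("ü", "ue");

    CharArraySet dict = new CharArraySet(Version.LUCENE_40, Arrays.asList("fall"), true);

    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_40,
        new MappingCharFilter(normMap, CharReader.get(new StringReader("banküberfall"))));
    TokenStream ts = new DictionaryCompoundWordTokenFilter(Version.LUCENE_40, tokenizer, dict);

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);

    ts.reset();
    while (ts.incrementToken()) {
      // with the fix, both tokens stay inside the original 12-char text:
      //   bankueberfall [0,12] and fall [0,12]
      System.out.println(termAtt + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
    }
    ts.end();
    ts.close();
  }
}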