diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 99224b0bed8..8bf8e10197f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -814,9 +814,10 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers.  (Uwe Schindler)
 
-* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
-  where they would create invalid offsets in some situations, leading to problems
-  in highlighting. (Max Beutel via Robert Muir)
+* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
+  compound token filters, and smart chinese where they would create invalid
+  offsets in some situations, leading to problems in highlighting.
+  (Max Beutel, Edwin Steiner via Robert Muir)
 
 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
index 7e3b136fe9f..756221baf9f 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
@@ -154,13 +154,22 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
 
     /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
     public CompoundToken(int offset, int length) {
-      final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
       this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
-      // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
-      // chars from the term, offsets may not match correctly (other filters producing tokens
-      // may also have this problem):
-      this.startOffset = newStart;
-      this.endOffset = newStart + length;
+      
+      // offsets of the original word
+      int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
+      int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
+      
+      if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
+        // if length by start + end offsets doesn't match the term text then assume
+        // this is a synonym and don't adjust the offsets.
+        this.startOffset = startOff;
+        this.endOffset = endOff;
+      } else {
+        final int newStart = startOff + offset;
+        this.startOffset = newStart;
+        this.endOffset = newStart + length;
+      }
     }
 
   }
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
index bf500476111..07310c90d10 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
@@ -18,14 +18,19 @@ package org.apache.lucene.analysis.compound;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -299,5 +304,35 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
       }
     }
   }
+  
+  // SOLR-2891
+  // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
+  // wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
+  // so in this case we behave like WDF, and preserve any modified offsets
+  public void testInvalidOffsets() throws Exception {
+    final CharArraySet dict = makeDictionary("fall");
+    final NormalizeCharMap normMap = new NormalizeCharMap();
+    normMap.add("ü", "ue");
+    
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new MappingCharFilter(normMap, CharReader.get(reader));
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "banküberfall",
+        new String[] { "bankueberfall", "fall" },
+        new int[] { 0, 0 },
+        new int[] { 12, 12 });
+  }
 
 }
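A quick illustration of the heuristic the patch relies on (a standalone sketch, not part of the patch; the class OffsetCheckSketch and helper subtokenOffsets below are invented names for illustration): when an upstream char filter such as MappingCharFilter rewrites "ü" to "ue", the term text becomes longer than the span endOffset - startOffset, so adding a subword's slice offset to startOffset could point past the original input. The patched CompoundToken constructor detects that mismatch and keeps the original token's offsets, which is what the new testInvalidOffsets asserts: both "bankueberfall" and "fall" report offsets 0..12.

// Standalone sketch (not part of the patch): the same length check that the
// patched CompoundToken constructor performs, pulled into a hypothetical helper.
public class OffsetCheckSketch {

  /** Returns {startOffset, endOffset} for a subword slice of a decompounded term. */
  static int[] subtokenOffsets(int startOff, int endOff, int termLength,
                               int sliceOffset, int sliceLength) {
    if (endOff - startOff != termLength) {
      // The term text no longer matches the original character span (e.g. a
      // MappingCharFilter rewrote "ü" to "ue"), so slice positions cannot be
      // mapped back to the input: keep the original token's offsets.
      return new int[] { startOff, endOff };
    } else {
      // Offsets still line up with the term text, so the subword can be
      // located inside the original token by simple addition.
      final int newStart = startOff + sliceOffset;
      return new int[] { newStart, newStart + sliceLength };
    }
  }

  public static void main(String[] args) {
    // "banküberfall" covers 12 characters of the original input, but after the
    // char filter the term is the 13-character "bankueberfall"; the dictionary
    // subword "fall" starts at slice offset 9 with length 4.
    int[] offsets = subtokenOffsets(0, 12, "bankueberfall".length(), 9, 4);
    System.out.println(offsets[0] + ".." + offsets[1]); // 0..12, matching the test
  }
}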