mirror of https://github.com/apache/lucene.git
SOLR-2891: fix CompoundWordTokenFilter to not create invalid offsets when the length of the text was changed by a previous filter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234546 13f79535-47bb-0310-9956-ffa450edef68
parent f3a363708f
commit a7cfee6b07
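For context, here is a minimal standalone sketch (not part of the commit) of the arithmetic behind the bug: a char filter that rewrites "ü" to "ue" makes the term text one character longer than the original input, so the old code's startOffset + slice offset + slice length can point past the end of the original text. The class and variable names are made up for the illustration.

// Plain-Java illustration of the broken offset arithmetic fixed by this commit.
public class InvalidOffsetSketch {
  public static void main(String[] args) {
    String original = "banküberfall";       // 12 chars; the token's offsets are [0, 12]
    String term     = "bankueberfall";      // 13 chars after a MappingCharFilter ("ü" -> "ue")

    int sliceOffset = term.indexOf("fall"); // 9: where the dictionary word starts in the term
    int sliceLength = "fall".length();      // 4

    // old behavior: endOffset = startOffset + sliceOffset + sliceLength
    int endOffset = 0 + sliceOffset + sliceLength;
    System.out.println(endOffset + " > " + original.length()); // 13 > 12: past the original text,
                                                                // which breaks highlighting
  }
}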
@@ -814,9 +814,10 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers. (Uwe Schindler)
 
-* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
-  where they would create invalid offsets in some situations, leading to problems
-  in highlighting. (Max Beutel via Robert Muir)
+* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
+  compound token filters, and smart chinese where they would create invalid
+  offsets in some situations, leading to problems in highlighting.
+  (Max Beutel, Edwin Steiner via Robert Muir)
 
 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0
@@ -154,14 +154,23 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
     
     /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
     public CompoundToken(int offset, int length) {
-      final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
       this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
-      // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
-      // chars from the term, offsets may not match correctly (other filters producing tokens
-      // may also have this problem):
-      this.startOffset = newStart;
-      this.endOffset = newStart + length;
+      
+      // offsets of the original word
+      int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
+      int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
+      
+      if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
+        // if length by start + end offsets doesn't match the term text then assume
+        // this is a synonym and don't adjust the offsets.
+        this.startOffset = startOff;
+        this.endOffset = endOff;
+      } else {
+        final int newStart = startOff + offset;
+        this.startOffset = newStart;
+        this.endOffset = newStart + length;
+      }
     }
     
   }
 }
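To make the new rule concrete, here is a small standalone sketch (mine, not from the commit) of the decision the patched CompoundToken constructor makes, written as a plain function over ints so it runs without Lucene. The second sample mirrors the test added below; the first uses a hypothetical compound ("bankfall") whose text no char filter changed.

import java.util.Arrays;

// Mirrors the guarded offset logic of the fix: if the term text no longer matches the
// original start/end offsets, keep the token's offsets; otherwise offset the slice.
public class CompoundTokenOffsetRule {

  static int[] subtokenOffsets(int startOff, int endOff, int termLength,
                               int sliceOffset, int sliceLength) {
    if (endOff - startOff != termLength) {
      // a char filter (or synonym) changed the text length: preserve the original offsets
      return new int[] { startOff, endOff };
    }
    int newStart = startOff + sliceOffset;
    return new int[] { newStart, newStart + sliceLength };
  }

  public static void main(String[] args) {
    // hypothetical "bankfall" (8 chars) with offsets [0, 8]: the "fall" slice maps cleanly to [4, 8]
    System.out.println(Arrays.toString(subtokenOffsets(0, 8, 8, 4, 4)));
    // "banküberfall" [0, 12] rewritten to "bankueberfall" (13 chars): offsets are preserved as [0, 12]
    System.out.println(Arrays.toString(subtokenOffsets(0, 12, 13, 9, 4)));
  }
}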
@@ -18,14 +18,19 @@ package org.apache.lucene.analysis.compound;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -300,4 +305,34 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
     }
   }
   
+  // SOLR-2891
+  // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
+  // wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
+  // so in this case we behave like WDF, and preserve any modified offsets
+  public void testInvalidOffsets() throws Exception {
+    final CharArraySet dict = makeDictionary("fall");
+    final NormalizeCharMap normMap = new NormalizeCharMap();
+    normMap.add("ü", "ue");
+    
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new MappingCharFilter(normMap, CharReader.get(reader));
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "banküberfall",
+      new String[] { "bankueberfall", "fall" },
+      new int[] { 0, 0 },
+      new int[] { 12, 12 });
+  }
+
 }
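Below is a rough end-to-end sketch (not part of the commit) of the same chain the test builds, consuming the stream by hand instead of via assertAnalyzesTo. It assumes the trunk-era APIs visible in the diff above (CharReader, the no-argument NormalizeCharMap constructor) plus a Version.LUCENE_40 constant for that trunk; later Lucene releases changed several of these signatures, so treat it as illustrative only.

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class CompoundOffsetsDemo {
  public static void main(String[] args) throws Exception {
    // "ü" -> "ue" lengthens the text the tokenizer sees: "banküberfall" -> "bankueberfall"
    NormalizeCharMap normMap = new NormalizeCharMap();
    normMap.add("ü", "ue");

    CharArraySet dict = new CharArraySet(Version.LUCENE_40, Arrays.asList("fall"), true);

    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_40,
        new MappingCharFilter(normMap, CharReader.get(new StringReader("banküberfall"))));
    TokenStream ts = new DictionaryCompoundWordTokenFilter(Version.LUCENE_40, tokenizer, dict);

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);

    ts.reset();
    while (ts.incrementToken()) {
      // with the fix, both tokens stay inside the original 12-char text:
      //   bankueberfall [0,12] and fall [0,12]
      System.out.println(termAtt + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
    }
    ts.end();
    ts.close();
  }
}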