SOLR-2891: fix CompoundWordTokenFilter to not create invalid offsets when the length of the text was changed by a previous filter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234546 13f79535-47bb-0310-9956-ffa450edef68
Author: Robert Muir
Date:   2012-01-22 16:41:06 +00:00
Parent: f3a363708f
Commit: a7cfee6b07

3 changed files with 54 additions and 9 deletions
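
To make the bug concrete before the diffs: the CompoundToken constructor used to compute a sub-token's offsets as the parent token's startOffset plus the sub-token's position and length inside the term text. When an earlier stage (here a char filter mapping ü -> ue) makes the term longer than the original text span, that arithmetic points past the end of the original input. The following standalone sketch is editorial, not code from this commit; the class name and hard-coded positions are illustrative, using the banküberfall example from the new test.

public class OffsetArithmeticSketch {
  public static void main(String[] args) {
    // Original input text (what offsets must point into): "banküberfall" (12 chars).
    // After the char filter rewrites ü -> ue, the term text is "bankueberfall" (13 chars),
    // while the token's offsets still refer to the 12-char original: start=0, end=12.
    int startOffset = 0, endOffset = 12;
    String term = "bankueberfall";

    // Old behavior: the sub-token "fall" starts at position 9 of the term, length 4.
    int oldStart = startOffset + 9;   // 9
    int oldEnd = oldStart + 4;        // 13 -> past the end of the 12-char original text

    // New behavior: the term length (13) no longer matches endOffset - startOffset (12),
    // so the parent token's offsets are kept for the sub-token.
    boolean mismatch = (endOffset - startOffset) != term.length();
    int newStart = mismatch ? startOffset : oldStart;   // 0
    int newEnd = mismatch ? endOffset : oldEnd;          // 12

    System.out.println("old: [" + oldStart + "," + oldEnd + ")  new: [" + newStart + "," + newEnd + ")");
  }
}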

lucene/CHANGES.txt

@@ -814,9 +814,10 @@ Bug fixes
 * LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
   to clones/reopened readers. (Uwe Schindler)
 
-* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
-  where they would create invalid offsets in some situations, leading to problems
-  in highlighting. (Max Beutel via Robert Muir)
+* LUCENE-3642, SOLR-2891: Fixed bugs in CharTokenizer, n-gram filters,
+  compound token filters, and smart chinese where they would create invalid
+  offsets in some situations, leading to problems in highlighting.
+  (Max Beutel, Edwin Steiner via Robert Muir)
 
 * LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
   Float.MIN_VALUE when it should be Float.NaN, when there were 0

CompoundWordTokenFilterBase.java (org.apache.lucene.analysis.compound)

@@ -154,13 +154,22 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
 
     /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
     public CompoundToken(int offset, int length) {
-      final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
       this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
-      // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
-      // chars from the term, offsets may not match correctly (other filters producing tokens
-      // may also have this problem):
-      this.startOffset = newStart;
-      this.endOffset = newStart + length;
+
+      // offsets of the original word
+      int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
+      int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
+
+      if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
+        // if length by start + end offsets doesn't match the term text then assume
+        // this is a synonym and don't adjust the offsets.
+        this.startOffset = startOff;
+        this.endOffset = endOff;
+      } else {
+        final int newStart = startOff + offset;
+        this.startOffset = newStart;
+        this.endOffset = newStart + length;
+      }
     }
   }
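
The design choice here follows the comment in the new test: when the length implied by the start/end offsets no longer matches the term text, the filter assumes the term was rewritten by an earlier stage (or is an injected synonym) and, like WordDelimiterFilter, keeps the parent token's offsets for every sub-token, since there is no reliable way to map a position inside the rewritten term back to a position in the original input. Only when the lengths agree does it still derive newStart and newStart + length from the sub-token's slice, as before.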

TestCompoundWordTokenFilter.java (org.apache.lucene.analysis.compound)

@@ -18,14 +18,19 @@ package org.apache.lucene.analysis.compound;
  */
 
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -299,5 +304,35 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
       }
     }
   }
+
+  // SOLR-2891
+  // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
+  // wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
+  // so in this case we behave like WDF, and preserve any modified offsets
+  public void testInvalidOffsets() throws Exception {
+    final CharArraySet dict = makeDictionary("fall");
+    final NormalizeCharMap normMap = new NormalizeCharMap();
+    normMap.add("ü", "ue");
+
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new MappingCharFilter(normMap, CharReader.get(reader));
+      }
+    };
+    assertAnalyzesTo(analyzer, "banküberfall",
+        new String[] { "bankueberfall", "fall" },
+        new int[] { 0, 0 },
+        new int[] { 12, 12 });
+  }
 }
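
Reading off the expected values: MappingCharFilter turns the 12-character input "banküberfall" into "bankueberfall" (13 characters) before tokenization, but it also corrects offsets back into the original text, so the tokenizer emits the token with startOffset 0 and endOffset 12. DictionaryCompoundWordTokenFilter then finds the dictionary word "fall"; because endOffset - startOffset (12) differs from the term length (13), both the original token and the "fall" sub-token keep offsets 0 and 12, which is what the two int[] arrays passed to assertAnalyzesTo (expected start and end offsets, respectively) verify.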