LUCENE-3848: don't produce tokenstreams that start with posinc=0

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1301478 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-03-16 13:06:30 +00:00
parent 4f8375ded9
commit 3d2d144f92
6 changed files with 108 additions and 20 deletions

lucene/CHANGES.txt

@@ -781,6 +781,12 @@ Changes in Runtime Behavior
   is multiplied into the norm, previously your boost would be
   silently discarded. (Tomás Fernández Löbbe, Hoss Man, Robert Muir)
 
+* LUCENE-3848: Fix tokenstreams to not produce a stream with an initial
+  position increment of 0, which is out of bounds (overlapping with a
+  non-existent previous term). Consumers such as IndexWriter and QueryParser
+  still check for and silently correct this situation today, but at some point
+  in the future they may throw an exception. (Mike McCandless, Robert Muir)
 
 Security fixes
 
 * LUCENE-3588: Try harder to prevent SIGSEGV on cloned MMapIndexInputs:
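
To make the invariant concrete, here is a back-of-the-envelope sketch (illustrative only, not code from this commit and not actual IndexWriter internals): a consumer accumulates positions by adding each token's increment to the previous token's position, which starts at -1 because there is no previous term. A first increment of 0 therefore parks the first token at position -1, stacked on a term that does not exist.

public class PositionAccounting {
  public static void main(String[] args) {
    // hypothetical stream: the first token claims posinc=0
    int[] increments = { 0, 1, 1 };
    int position = -1; // no previous term yet
    for (int posInc : increments) {
      position += posInc;
      System.out.println("token at position " + position);
    }
    // prints -1, 0, 1: the first token overlaps a non-existent term at -1
  }
}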

BaseTokenStreamTestCase.java

@@ -157,7 +157,11 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         }
       }
       if (posIncrAtt != null) {
-        assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        if (i == 0) {
+          assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
+        } else {
+          assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        }
       }
       if (posLengthAtt != null) {
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
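
As an illustration (not part of this commit), a canned stream whose first token carries an increment of 0 now trips the stricter check above. This sketch assumes the test-framework's CannedTokenStream and the (stream, output, posIncrements) overload of assertTokenStreamContents from this era:

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public void testFirstTokenMayNotOverlap() throws Exception {
  Token broken = new Token("broken", 0, 6);
  broken.setPositionIncrement(0); // claims to overlap the (non-existent) previous term
  TokenStream ts = new CannedTokenStream(broken);
  // fails with "first posIncrement must be >= 1" under the new assertion
  assertTokenStreamContents(ts, new String[] { "broken" }, new int[] { 0 });
}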

FilteringTokenFilter.java

@@ -33,6 +33,7 @@ public abstract class FilteringTokenFilter extends TokenFilter {
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
+  private boolean first = true; // only used when not preserving gaps
 
   public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
     super(input);
@@ -58,6 +59,13 @@ public abstract class FilteringTokenFilter extends TokenFilter {
     } else {
       while (input.incrementToken()) {
         if (accept()) {
+          if (first) {
+            // first token having posinc=0 is illegal.
+            if (posIncrAtt.getPositionIncrement() == 0) {
+              posIncrAtt.setPositionIncrement(1);
+            }
+            first = false;
+          }
           return true;
         }
       }
@@ -66,6 +74,12 @@ public abstract class FilteringTokenFilter extends TokenFilter {
     return false;
   }
 
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    first = true;
+  }
+
   /**
    * @see #setEnablePositionIncrements(boolean)
    */
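
For orientation, a minimal hypothetical subclass (not part of this commit; the import path is an assumption for this era of trunk): subclasses only implement accept(), so they all inherit the first-token correction above. With enablePositionIncrements=false, a surviving token that originally stacked on a dropped token (posinc=0) can become the stream's first token; the new clamp rewrites its increment to 1.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter; // assumed package

// hypothetical filter: drops terms shorter than two characters
public final class MinLengthFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public MinLengthFilter(boolean enablePositionIncrements, TokenStream in) {
    super(enablePositionIncrements, in);
  }

  @Override
  protected boolean accept() {
    return termAtt.length() >= 2; // keep only terms of length >= 2
  }
}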

WikipediaTokenizer.java

@@ -122,6 +122,8 @@ public final class WikipediaTokenizer extends Tokenizer {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private boolean first;
 
   /**
    * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
    * <code>input</code> to a newly created JFlex scanner.
@@ -209,8 +211,13 @@ public final class WikipediaTokenizer extends Tokenizer {
       //output the untokenized Token first
       collapseAndSaveTokens(tokenType, type);
     }
-    posIncrAtt.setPositionIncrement(scanner.getPositionIncrement());
+    int posinc = scanner.getPositionIncrement();
+    if (first && posinc == 0) {
+      posinc = 1; // don't emit posinc=0 for the first token!
+    }
+    posIncrAtt.setPositionIncrement(posinc);
     typeAtt.setType(type);
+    first = false;
     return true;
   }
@@ -308,6 +315,7 @@ public final class WikipediaTokenizer extends Tokenizer {
     super.reset();
     tokens = null;
     scanner.reset();
+    first = true;
   }
 
   @Override
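
The WikipediaTokenizer change is the same first-token clamp applied inside the tokenizer itself, and the reset() change matters: "first" must be re-armed so a reused tokenizer stays correct. As a sketch of the pattern in isolation, here is a hypothetical pass-through filter (not part of Lucene) that enforces the invariant for any wrapped stream:

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public final class FirstPosIncGuardFilter extends TokenFilter {
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private boolean first = true;

  public FirstPosIncGuardFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (first && posIncAtt.getPositionIncrement() == 0) {
      posIncAtt.setPositionIncrement(1); // the first token may not overlap position -1
    }
    first = false;
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    first = true; // re-arm for stream reuse, mirroring the tokenizer fix
  }
}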

TestStopFilter.java

@@ -17,13 +17,17 @@ package org.apache.lucene.analysis.core;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
@@ -120,4 +124,56 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
       System.out.println(s);
     }
   }
+
+  // stupid filter that inserts synonym of 'hte' for 'the'
+  private class MockSynonymFilter extends TokenFilter {
+    State bufferedState;
+    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+    MockSynonymFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (bufferedState != null) {
+        restoreState(bufferedState);
+        posIncAtt.setPositionIncrement(0);
+        termAtt.setEmpty().append("hte");
+        bufferedState = null;
+        return true;
+      } else if (input.incrementToken()) {
+        if (termAtt.toString().equals("the")) {
+          bufferedState = captureState();
+        }
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      bufferedState = null;
+    }
+  }
+
+  public void testFirstPosInc() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new MockSynonymFilter(tokenizer);
+        StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+        stopfilter.setEnablePositionIncrements(false);
+        return new TokenStreamComponents(tokenizer, stopfilter);
+      }
+    };
+    assertAnalyzesTo(analyzer, "the quick brown fox",
+        new String[] { "hte", "quick", "brown", "fox" },
+        new int[] { 1, 1, 1, 1 });
+  }
 }
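
What this test exercises: MockSynonymFilter stacks "hte" on top of "the" with a position increment of 0, and setEnablePositionIncrements(false) makes the StopFilter drop "the" without leaving a gap. The stacked "hte" thus becomes the first token of the stream; the expected increments of { 1, 1, 1, 1 } confirm that the FilteringTokenFilter change above now clamps its increment from 0 to 1.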

TestSlowSynonymFilter.java

@@ -240,27 +240,27 @@ public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
     assertTokenizesTo(map, tokens("a,5"),
         new String[] { "aa" },
         new int[] { 5 });
-    assertTokenizesTo(map, tokens("a,0"),
-        new String[] { "aa" },
-        new int[] { 0 });
+    assertTokenizesTo(map, tokens("b,1 a,0"),
+        new String[] { "b", "aa" },
+        new int[] { 1, 0 });
 
     // test that offset of first replacement is ignored (always takes the orig offset)
     map.add(strings("b"), tokens("bb,100"), orig, merge);
     assertTokenizesTo(map, tokens("b,5"),
         new String[] { "bb" },
         new int[] { 5 });
-    assertTokenizesTo(map, tokens("b,0"),
-        new String[] { "bb" },
-        new int[] { 0 });
+    assertTokenizesTo(map, tokens("c,1 b,0"),
+        new String[] { "c", "bb" },
+        new int[] { 1, 0 });
 
     // test that subsequent tokens are adjusted accordingly
     map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
     assertTokenizesTo(map, tokens("c,5"),
         new String[] { "cc", "c2" },
         new int[] { 5, 2 });
-    assertTokenizesTo(map, tokens("c,0"),
-        new String[] { "cc", "c2" },
-        new int[] { 0, 2 });
+    assertTokenizesTo(map, tokens("d,1 c,0"),
+        new String[] { "d", "cc", "c2" },
+        new int[] { 1, 0, 2 });
   }
@@ -275,27 +275,27 @@ public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
     assertTokenizesTo(map, tokens("a,5"),
         new String[] { "a", "aa" },
         new int[] { 5, 0 });
-    assertTokenizesTo(map, tokens("a,0"),
-        new String[] { "a", "aa" },
-        new int[] { 0, 0 });
+    assertTokenizesTo(map, tokens("b,1 a,0"),
+        new String[] { "b", "a", "aa" },
+        new int[] { 1, 0, 0 });
 
     // test that offset of first replacement is ignored (always takes the orig offset)
     map.add(strings("b"), tokens("bb,100"), orig, merge);
     assertTokenizesTo(map, tokens("b,5"),
         new String[] { "b", "bb" },
         new int[] { 5, 0 });
-    assertTokenizesTo(map, tokens("b,0"),
-        new String[] { "b", "bb" },
-        new int[] { 0, 0 });
+    assertTokenizesTo(map, tokens("c,1 b,0"),
+        new String[] { "c", "b", "bb" },
+        new int[] { 1, 0, 0 });
 
     // test that subsequent tokens are adjusted accordingly
     map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
     assertTokenizesTo(map, tokens("c,5"),
         new String[] { "c", "cc", "c2" },
         new int[] { 5, 0, 2 });
-    assertTokenizesTo(map, tokens("c,0"),
-        new String[] { "c", "cc", "c2" },
-        new int[] { 0, 0, 2 });
+    assertTokenizesTo(map, tokens("d,1 c,0"),
+        new String[] { "d", "c", "cc", "c2" },
+        new int[] { 1, 0, 0, 2 });
   }
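
These tests rely on a tokens() helper defined elsewhere in this test class, which parses whitespace-separated "term,positionIncrement" pairs. Prepending "b,1", "c,1", or "d,1" keeps the 0-increment token off the front of the stream, so the stricter BaseTokenStreamTestCase check passes while the 0-increment substitution path is still covered. A presumed sketch of the helper's shape (the real implementation may differ):

import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Token;

// presumed shape: "d,1 c,0" -> [Token("d") with posInc=1, Token("c") with posInc=0]
static List<Token> tokens(String str) {
  List<Token> result = new ArrayList<Token>();
  for (String one : str.split(" ")) {
    String[] parts = one.split(",");
    Token t = new Token(parts[0], 0, parts[0].length());
    t.setPositionIncrement(Integer.parseInt(parts[1]));
    result.add(t);
  }
  return result;
}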