mirror of https://github.com/apache/lucene.git
LUCENE-3848: don't produce tokenstreams that start with posinc=0
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1301478 13f79535-47bb-0310-9956-ffa450edef68
commit 3d2d144f92
parent 4f8375ded9
@@ -781,6 +781,12 @@ Changes in Runtime Behavior
   is multiplied into the norm, previously your boost would be
   silently discarded. (Tomás Fernández Löbbe, Hoss Man, Robert Muir)

+* LUCENE-3848: Fix tokenstreams to not produce a stream with an initial
+  position increment of 0: which is out of bounds (overlapping with a
+  non-existent previous term). Consumers such as IndexWriter and QueryParser
+  still check for and silently correct this situation today, but at some point
+  in the future they may throw an exception. (Mike McCandless, Robert Muir)
+
 Security fixes

 * LUCENE-3588: Try harder to prevent SIGSEGV on cloned MMapIndexInputs:
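Note: the entry above says consumers still silently correct an illegal first increment. As a rough illustration of what that correction looks like on the consumer side, here is a minimal sketch in the spirit of the changelog; it is not IndexWriter's actual code, and the ClampingConsumer class and lastPosition method are invented for this example.

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

final class ClampingConsumer {
  // Walks a TokenStream and tracks absolute positions, clamping an
  // out-of-bounds first increment of 0 to 1, the same silent correction
  // the changelog attributes to IndexWriter and QueryParser.
  static int lastPosition(TokenStream stream) throws IOException {
    PositionIncrementAttribute posIncAtt =
        stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset();
    int position = -1; // a first token with posinc=1 lands on position 0
    boolean first = true;
    while (stream.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      if (first && posInc == 0) {
        posInc = 1; // silently correct; may become an exception in the future
      }
      position += posInc;
      first = false;
    }
    stream.end();
    return position;
  }
}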
@@ -157,8 +157,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         }
       }
       if (posIncrAtt != null) {
+        if (i == 0) {
+          assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
+        } else {
           assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        }
       }
       if (posLengthAtt != null) {
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
       }
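With this change, any component exercised through BaseTokenStreamTestCase now fails fast when its very first token claims to overlap a non-existent predecessor (posinc=0 at i == 0), instead of only enforcing the weaker posIncrement >= 0 bound at every position.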
@@ -33,6 +33,7 @@ public abstract class FilteringTokenFilter extends TokenFilter {

   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
+  private boolean first = true; // only used when not preserving gaps

   public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
     super(input);
@@ -58,6 +59,13 @@ public abstract class FilteringTokenFilter extends TokenFilter {
     } else {
       while (input.incrementToken()) {
         if (accept()) {
+          if (first) {
+            // first token having posinc=0 is illegal.
+            if (posIncrAtt.getPositionIncrement() == 0) {
+              posIncrAtt.setPositionIncrement(1);
+            }
+            first = false;
+          }
           return true;
         }
       }
@@ -66,6 +74,12 @@ public abstract class FilteringTokenFilter extends TokenFilter {
     return false;
   }

+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    first = true;
+  }
+
   /**
    * @see #setEnablePositionIncrements(boolean)
    */
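To see where the new first flag matters, here is a minimal sketch of a FilteringTokenFilter subclass; DropShortTokensFilter is a hypothetical name, and only accept() needs to be supplied. If the first token such a filter emits arrived with a position increment of 0 (for example a synonym stacked on a token that was just dropped), the patched base class rewrites that increment to 1.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class DropShortTokensFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  DropShortTokensFilter(boolean enablePositionIncrements, TokenStream in) {
    super(enablePositionIncrements, in);
  }

  @Override
  protected boolean accept() {
    return termAtt.length() > 2; // drop one- and two-character tokens
  }
}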
@@ -122,6 +122,8 @@ public final class WikipediaTokenizer extends Tokenizer {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);

+  private boolean first;
+
   /**
    * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
    * <code>input</code> to a newly created JFlex scanner.
@@ -209,8 +211,13 @@ public final class WikipediaTokenizer extends Tokenizer {
       //output the untokenized Token first
       collapseAndSaveTokens(tokenType, type);
     }
-    posIncrAtt.setPositionIncrement(scanner.getPositionIncrement());
+    int posinc = scanner.getPositionIncrement();
+    if (first && posinc == 0) {
+      posinc = 1; // don't emit posinc=0 for the first token!
+    }
+    posIncrAtt.setPositionIncrement(posinc);
     typeAtt.setType(type);
+    first = false;
     return true;
   }

@@ -308,6 +315,7 @@ public final class WikipediaTokenizer extends Tokenizer {
     super.reset();
     tokens = null;
     scanner.reset();
+    first = true;
   }

   @Override
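A quick way to observe the guarantee the three hunks above establish; this is a hedged sketch, assuming the single-Reader constructor implied by the javadoc quoted above, with an arbitrary input string.

import java.io.StringReader;

import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;

public class FirstPosIncCheck {
  public static void main(String[] args) throws Exception {
    WikipediaTokenizer tokenizer =
        new WikipediaTokenizer(new StringReader("[[Main Page]] some text"));
    PositionIncrementAttribute posIncAtt =
        tokenizer.addAttribute(PositionIncrementAttribute.class);
    tokenizer.reset();
    if (tokenizer.incrementToken()) {
      // After the fix this always prints true, and still does after reset().
      System.out.println(posIncAtt.getPositionIncrement() >= 1);
    }
    tokenizer.end();
    tokenizer.close();
  }
}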
@@ -17,13 +17,17 @@ package org.apache.lucene.analysis.core;
  */

 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Set;

+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -120,4 +124,56 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
       System.out.println(s);
     }
   }
+
+  // stupid filter that inserts synonym of 'hte' for 'the'
+  private class MockSynonymFilter extends TokenFilter {
+    State bufferedState;
+    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+    MockSynonymFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (bufferedState != null) {
+        restoreState(bufferedState);
+        posIncAtt.setPositionIncrement(0);
+        termAtt.setEmpty().append("hte");
+        bufferedState = null;
+        return true;
+      } else if (input.incrementToken()) {
+        if (termAtt.toString().equals("the")) {
+          bufferedState = captureState();
+        }
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      bufferedState = null;
+    }
+  }
+
+  public void testFirstPosInc() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new MockSynonymFilter(tokenizer);
+        StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+        stopfilter.setEnablePositionIncrements(false);
+        return new TokenStreamComponents(tokenizer, stopfilter);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "the quick brown fox",
+                     new String[] { "hte", "quick", "brown", "fox" },
+                     new int[] { 1, 1, 1, 1} );
+  }
 }
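How the test works: MockSynonymFilter stacks the synonym "hte" on each "the" with a position increment of 0; StopFilter, with position increments disabled, then discards "the" itself, leaving "hte" as the stream's first token. Before the FilteringTokenFilter fix its increment would have stayed 0; the expected int[] { 1, 1, 1, 1 } asserts that it is now corrected to 1.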
@@ -240,27 +240,27 @@ public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
     assertTokenizesTo(map, tokens("a,5"),
         new String[] { "aa" },
         new int[] { 5 });
-    assertTokenizesTo(map, tokens("a,0"),
-        new String[] { "aa" },
-        new int[] { 0 });
+    assertTokenizesTo(map, tokens("b,1 a,0"),
+        new String[] { "b", "aa" },
+        new int[] { 1, 0 });

     // test that offset of first replacement is ignored (always takes the orig offset)
     map.add(strings("b"), tokens("bb,100"), orig, merge);
     assertTokenizesTo(map, tokens("b,5"),
         new String[] { "bb" },
         new int[] { 5 });
-    assertTokenizesTo(map, tokens("b,0"),
-        new String[] { "bb" },
-        new int[] { 0 });
+    assertTokenizesTo(map, tokens("c,1 b,0"),
+        new String[] { "c", "bb" },
+        new int[] { 1, 0 });

     // test that subsequent tokens are adjusted accordingly
     map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
     assertTokenizesTo(map, tokens("c,5"),
         new String[] { "cc", "c2" },
         new int[] { 5, 2 });
-    assertTokenizesTo(map, tokens("c,0"),
-        new String[] { "cc", "c2" },
-        new int[] { 0, 2 });
+    assertTokenizesTo(map, tokens("d,1 c,0"),
+        new String[] { "d", "cc", "c2" },
+        new int[] { 1, 0, 2 });
   }

@@ -275,27 +275,27 @@ public class TestSlowSynonymFilter extends BaseTokenStreamTestCase {
     assertTokenizesTo(map, tokens("a,5"),
         new String[] { "a", "aa" },
         new int[] { 5, 0 });
-    assertTokenizesTo(map, tokens("a,0"),
-        new String[] { "a", "aa" },
-        new int[] { 0, 0 });
+    assertTokenizesTo(map, tokens("b,1 a,0"),
+        new String[] { "b", "a", "aa" },
+        new int[] { 1, 0, 0 });

     // test that offset of first replacement is ignored (always takes the orig offset)
     map.add(strings("b"), tokens("bb,100"), orig, merge);
     assertTokenizesTo(map, tokens("b,5"),
         new String[] { "b", "bb" },
         new int[] { 5, 0 });
-    assertTokenizesTo(map, tokens("b,0"),
-        new String[] { "b", "bb" },
-        new int[] { 0, 0 });
+    assertTokenizesTo(map, tokens("c,1 b,0"),
+        new String[] { "c", "b", "bb" },
+        new int[] { 1, 0, 0 });

     // test that subsequent tokens are adjusted accordingly
     map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
     assertTokenizesTo(map, tokens("c,5"),
         new String[] { "c", "cc", "c2" },
         new int[] { 5, 0, 2 });
-    assertTokenizesTo(map, tokens("c,0"),
-        new String[] { "c", "cc", "c2" },
-        new int[] { 0, 0, 2 });
+    assertTokenizesTo(map, tokens("d,1 c,0"),
+        new String[] { "d", "c", "cc", "c2" },
+        new int[] { 1, 0, 0, 2 });
   }

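For reference, the tokens("...") helper in these tests appears to encode each token as term,positionIncrement, so tokens("b,1 a,0") is "b" at a fresh position with "a" stacked on top of it. The updated expectations still contain posinc=0 tokens; they just no longer put one in the stream's first slot, matching the new invariant.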