mirror of https://github.com/apache/lucene.git
LUCENE-3873: add MockGraphTokenFilter, inserting random graph tokens
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1310910 13f79535-47bb-0310-9956-ffa450edef68
parent 62890c8089
commit 755ebafa49
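
MockGraphTokenFilter randomly injects overlapping tokens (positionIncrement=0, positionLength up to 5) into any token stream, so downstream filters and consumers get exercised against token graphs. A minimal usage sketch, mirroring the tests added in this commit (MockTokenizer and checkAnalysisConsistency come from the existing test framework):

final Analyzer a = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    // Wraps the tokenizer; the Random only supplies a seed, so the
    // injected tokens are reproducible across repeated passes:
    final TokenStream t2 = new MockGraphTokenFilter(random, t);
    return new TokenStreamComponents(t, t2);
  }
};
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");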
@@ -22,6 +22,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.index.DocsAndPositionsEnum; // for javadoc
@@ -121,13 +122,14 @@ import org.apache.lucene.util.AttributeReflector;
  */
 public class Token extends CharTermAttributeImpl
                    implements TypeAttribute, PositionIncrementAttribute,
-                              FlagsAttribute, OffsetAttribute, PayloadAttribute {
+                              FlagsAttribute, OffsetAttribute, PayloadAttribute, PositionLengthAttribute {
 
   private int startOffset,endOffset;
   private String type = DEFAULT_TYPE;
   private int flags;
   private Payload payload;
   private int positionIncrement = 1;
+  private int positionLength = 1;
 
   /** Constructs a Token will null text. */
   public Token() {
@@ -270,6 +272,20 @@ public class Token extends CharTermAttributeImpl
     return positionIncrement;
   }
 
+  /** Set the position length.
+   *  @see PositionLengthAttribute */
+  @Override
+  public void setPositionLength(int positionLength) {
+    this.positionLength = positionLength;
+  }
+
+  /** Get the position length.
+   *  @see PositionLengthAttribute */
+  @Override
+  public int getPositionLength() {
+    return positionLength;
+  }
+
   /** Returns this Token's starting offset, the position of the first character
      corresponding to this token in the source text.
@@ -40,6 +40,10 @@ public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribute
   /** Set the starting and ending offset.
     @see #startOffset() and #endOffset()*/
   public void setOffset(int startOffset, int endOffset) {
+    // TODO: check that these are valid! IE, each should be
+    // >= 0, and endOffset should be >= startOffset.
+    // Problem is this could "break" existing
+    // tokenizers/filters.
     this.startOffset = startOffset;
     this.endOffset = endOffset;
   }
@@ -0,0 +1,382 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+
+public class TestGraphTokenizers extends BaseTokenStreamTestCase {
+
+  // Makes a graph TokenStream from the string; separate
+  // positions with single space, multiple tokens at the same
+  // position with /, and add optional position length with
+  // :.  EG "a b c" is a simple chain, "a/x b c" adds 'x'
+  // over 'a' at position 0 with posLen=1, "a/x:3 b c" adds
+  // 'x' over a with posLen=3.  Tokens are in normal-form!
+  // So, offsets are computed based on the first token at a
+  // given position.  NOTE: each token must be a single
+  // character!  We assume this when computing offsets...
+
+  // NOTE: all input tokens must be length 1!!!  This means
+  // you cannot turn on MockCharFilter when random
+  // testing...
+
+  private static class GraphTokenizer extends Tokenizer {
+    private List<Token> tokens;
+    private int upto;
+    private int inputLength;
+
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+    private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
+
+    public GraphTokenizer(Reader input) {
+      super(input);
+    }
+
+    @Override
+    public void reset() {
+      tokens = null;
+      upto = 0;
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (tokens == null) {
+        fillTokens();
+      }
+      //System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
+      if (upto == tokens.size()) {
+        //System.out.println("  END @ " + tokens.size());
+        return false;
+      }
+      final Token t = tokens.get(upto++);
+      //System.out.println("  return token=" + t);
+      clearAttributes();
+      termAtt.append(t.toString());
+      offsetAtt.setOffset(t.startOffset(), t.endOffset());
+      posIncrAtt.setPositionIncrement(t.getPositionIncrement());
+      posLengthAtt.setPositionLength(t.getPositionLength());
+      return true;
+    }
+
+    @Override
+    public void end() throws IOException {
+      super.end();
+      // NOTE: somewhat... hackish, but we need this to
+      // satisfy BTSTC:
+      final int lastOffset;
+      if (tokens != null && !tokens.isEmpty()) {
+        lastOffset = tokens.get(tokens.size()-1).endOffset();
+      } else {
+        lastOffset = 0;
+      }
+      offsetAtt.setOffset(correctOffset(lastOffset),
+                          correctOffset(inputLength));
+    }
+
+    private void fillTokens() throws IOException {
+      final StringBuilder sb = new StringBuilder();
+      final char[] buffer = new char[256];
+      while (true) {
+        final int count = input.read(buffer);
+        if (count == -1) {
+          break;
+        }
+        sb.append(buffer, 0, count);
+        //System.out.println("got count=" + count);
+      }
+      //System.out.println("fillTokens: " + sb);
+
+      inputLength = sb.length();
+
+      final String[] parts = sb.toString().split(" ");
+
+      tokens = new ArrayList<Token>();
+      int pos = 0;
+      int maxPos = -1;
+      int offset = 0;
+      //System.out.println("again");
+      for(String part : parts) {
+        final String[] overlapped = part.split("/");
+        boolean firstAtPos = true;
+        int minPosLength = Integer.MAX_VALUE;
+        for(String part2 : overlapped) {
+          final int colonIndex = part2.indexOf(':');
+          final String token;
+          final int posLength;
+          if (colonIndex != -1) {
+            token = part2.substring(0, colonIndex);
+            posLength = Integer.parseInt(part2.substring(1+colonIndex));
+          } else {
+            token = part2;
+            posLength = 1;
+          }
+          maxPos = Math.max(maxPos, pos + posLength);
+          minPosLength = Math.min(minPosLength, posLength);
+          final Token t = new Token(token, offset, offset + 2*posLength - 1);
+          t.setPositionLength(posLength);
+          t.setPositionIncrement(firstAtPos ? 1:0);
+          firstAtPos = false;
+          //System.out.println("  add token=" + t + " startOff=" + t.startOffset() + " endOff=" + t.endOffset());
+          tokens.add(t);
+        }
+        pos += minPosLength;
+        offset = 2 * pos;
+      }
+      assert maxPos <= pos: "input string mal-formed: posLength>1 tokens hang over the end";
+    }
+  }
+
+  public void testMockGraphTokenFilterBasic() throws Exception {
+
+    for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+            final TokenStream t2 = new MockGraphTokenFilter(random, t);
+            return new TokenStreamComponents(t, t2);
+          }
+        };
+
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
+    }
+  }
+
+  public void testMockGraphTokenFilterOnGraphInput() throws Exception {
+    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+            final Tokenizer t = new GraphTokenizer(reader);
+            final TokenStream t2 = new MockGraphTokenFilter(random, t);
+            return new TokenStreamComponents(t, t2);
+          }
+        };
+
+      checkAnalysisConsistency(random, a, false, "a/x:3 c/y:2 d e f/z:4 g h i j k");
+    }
+  }
+
+  // Just deletes (leaving hole) token 'a':
+  private final static class RemoveATokens extends TokenFilter {
+    private int pendingPosInc;
+
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+    public RemoveATokens(TokenStream in) {
+      super(in);
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      pendingPosInc = 0;
+    }
+
+    @Override
+    public void end() throws IOException {
+      super.end();
+      posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      while (true) {
+        final boolean gotOne = input.incrementToken();
+        if (!gotOne) {
+          return false;
+        } else if (termAtt.toString().equals("a")) {
+          pendingPosInc += posIncAtt.getPositionIncrement();
+        } else {
+          posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
+          pendingPosInc = 0;
+          return true;
+        }
+      }
+    }
+  }
+
+  public void testMockGraphTokenFilterBeforeHoles() throws Exception {
+    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+            final TokenStream t2 = new MockGraphTokenFilter(random, t);
+            final TokenStream t3 = new RemoveATokens(t2);
+            return new TokenStreamComponents(t, t3);
+          }
+        };
+
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
+      checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
+    }
+  }
+
+  public void testMockGraphTokenFilterAfterHoles() throws Exception {
+    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+            final TokenStream t2 = new RemoveATokens(t);
+            final TokenStream t3 = new MockGraphTokenFilter(random, t2);
+            return new TokenStreamComponents(t, t3);
+          }
+        };
+
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
+      checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
+    }
+  }
+
+  public void testMockGraphTokenFilterRandom() throws Exception {
+    for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+            final TokenStream t2 = new MockGraphTokenFilter(random, t);
+            return new TokenStreamComponents(t, t2);
+          }
+        };
+
+      checkRandomData(random, a, 5, atLeast(1000));
+    }
+  }
+
+  // Two MockGraphTokenFilters
+  public void testDoubleMockGraphTokenFilterRandom() throws Exception {
+    for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+            final TokenStream t1 = new MockGraphTokenFilter(random, t);
+            final TokenStream t2 = new MockGraphTokenFilter(random, t1);
+            return new TokenStreamComponents(t, t2);
+          }
+        };
+
+      checkRandomData(random, a, 5, atLeast(1000));
+    }
+  }
+
+  public void testMockGraphTokenFilterBeforeHolesRandom() throws Exception {
+    for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+            final TokenStream t1 = new MockGraphTokenFilter(random, t);
+            final TokenStream t2 = new MockHoleInjectingTokenFilter(random, t1);
+            return new TokenStreamComponents(t, t2);
+          }
+        };
+
+      checkRandomData(random, a, 5, atLeast(1000));
+    }
+  }
+
+  public void testMockGraphTokenFilterAfterHolesRandom() throws Exception {
+    for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+            final TokenStream t1 = new MockHoleInjectingTokenFilter(random, t);
+            final TokenStream t2 = new MockGraphTokenFilter(random, t1);
+            return new TokenStreamComponents(t, t2);
+          }
+        };
+
+      checkRandomData(random, a, 5, atLeast(1000));
+    }
+  }
+}
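
To make the graph mini-language above concrete: per fillTokens, every token is a single character, a token at position pos gets startOffset 2*pos, and endOffset is startOffset + 2*posLength - 1. So "a/x:3 b c" is equivalent to the following canned stream (a sketch using the CannedTokenStream varargs constructor also added in this commit):

Token a = new Token("a", 0, 1); a.setPositionIncrement(1); a.setPositionLength(1);
Token x = new Token("x", 0, 5); x.setPositionIncrement(0); x.setPositionLength(3); // overlaps 'a' through 'c'
Token b = new Token("b", 2, 3); b.setPositionIncrement(1); b.setPositionLength(1);
Token c = new Token("c", 4, 5); c.setPositionIncrement(1); c.setPositionLength(1);
TokenStream ts = new CannedTokenStream(a, x, b, c);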
@@ -59,6 +59,7 @@ public class TestPositionIncrement extends LuceneTestCase {
     @Override
     public TokenStreamComponents createComponents(String fieldName, Reader reader) {
       return new TokenStreamComponents(new Tokenizer(reader) {
+        // TODO: use CannedTokenStream
         private final String[] TOKENS = {"1", "2", "3", "4", "5"};
        private final int[] INCREMENTS = {0, 2, 1, 0, 1};
        private int i = 0;
@@ -0,0 +1,78 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestRollingBuffer extends LuceneTestCase {
+
+  private static class Position implements RollingBuffer.Resettable {
+    public int pos;
+
+    @Override
+    public void reset() {
+      pos = -1;
+    }
+  }
+
+  public void test() {
+
+    final RollingBuffer<Position> buffer = new RollingBuffer<Position>() {
+      @Override
+      protected Position newInstance() {
+        final Position pos = new Position();
+        pos.pos = -1;
+        return pos;
+      }
+    };
+
+    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+
+      int freeBeforePos = 0;
+      final int maxPos = atLeast(10000);
+      final FixedBitSet posSet = new FixedBitSet(maxPos + 1000);
+      int posUpto = 0;
+      while (freeBeforePos < maxPos) {
+        if (random.nextInt(4) == 1) {
+          final int limit = rarely() ? 1000 : 20;
+          final int inc = random.nextInt(limit);
+          final int pos = freeBeforePos + inc;
+          posUpto = Math.max(posUpto, pos);
+          if (VERBOSE) {
+            System.out.println("  check pos=" + pos + " posUpto=" + posUpto);
+          }
+          final Position posData = buffer.get(pos);
+          if (!posSet.getAndSet(pos)) {
+            assertEquals(-1, posData.pos);
+            posData.pos = pos;
+          } else {
+            assertEquals(pos, posData.pos);
+          }
+        } else {
+          if (posUpto > freeBeforePos) {
+            freeBeforePos += random.nextInt(posUpto - freeBeforePos);
+          }
+          if (VERBOSE) {
+            System.out.println("  freeBeforePos=" + freeBeforePos);
+          }
+          buffer.freeBefore(freeBeforePos);
+        }
+      }
+
+      buffer.reset();
+    }
+  }
+}
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 
 /**
  * TokenStream from a canned list of Tokens.
@@ -32,10 +33,11 @@ public final class CannedTokenStream extends TokenStream {
   private int upto = 0;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
 
-  public CannedTokenStream(Token[] tokens) {
+  public CannedTokenStream(Token... tokens) {
     this.tokens = tokens;
   }
 
@@ -49,6 +51,7 @@ public final class CannedTokenStream extends TokenStream {
       termAtt.setEmpty();
       termAtt.append(token.toString());
       posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+      posLengthAtt.setPositionLength(token.getPositionLength());
       offsetAtt.setOffset(token.startOffset(), token.endOffset());
       payloadAtt.setPayload(token.getPayload());
       return true;
@@ -0,0 +1,318 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.RollingBuffer;
+import org.apache.lucene.util._TestUtil;
+
+// TODO: sometimes remove tokens too...?
+
+/** Randomly inserts overlapped (posInc=0) tokens with
+ *  posLength sometimes > 1.  The chain must have
+ *  an OffsetAttribute.  */
+
+public final class MockGraphTokenFilter extends TokenFilter {
+
+  private static boolean DEBUG = false;
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+  private final long seed;
+  private Random random;
+
+  // Don't init to -1 (caller must first call reset):
+  private int inputPos;
+  private int outputPos;
+  // Don't init to -1 (caller must first call reset):
+  private int lastOutputPos;
+  private boolean end;
+
+  private final class Position implements RollingBuffer.Resettable {
+    final List<AttributeSource.State> states = new ArrayList<AttributeSource.State>();
+    int nextRead;
+
+    // Any token leaving from this position should have this startOffset:
+    int startOffset = -1;
+
+    // Any token arriving to this position should have this endOffset:
+    int endOffset = -1;
+
+    @Override
+    public void reset() {
+      states.clear();
+      nextRead = 0;
+      startOffset = -1;
+      endOffset = -1;
+    }
+
+    public void captureState() throws IOException {
+      assert startOffset == offsetAtt.startOffset();
+      states.add(MockGraphTokenFilter.this.captureState());
+    }
+  }
+
+  private final RollingBuffer<Position> positions = new RollingBuffer<Position>() {
+    @Override
+    protected Position newInstance() {
+      return new Position();
+    }
+  };
+
+  public MockGraphTokenFilter(Random random, TokenStream input) {
+    super(input);
+    seed = random.nextLong();
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    end = false;
+    positions.reset();
+    // NOTE: must be "deterministically random" because
+    // BaseTokenStreamTestCase pulls tokens twice on the
+    // same input and asserts they are the same:
+    this.random = new Random(seed);
+    inputPos = -1;
+    outputPos = 0;
+    lastOutputPos = -1;
+  }
+
+  private enum TOKEN_POS {SAME, NEXT, END};
+
+  private TOKEN_POS nextInputToken() throws IOException {
+    assert !end;
+    if (DEBUG) {
+      System.out.println("  call input.incr");
+    }
+    final boolean result = input.incrementToken();
+    if (result) {
+      final int posInc = posIncAtt.getPositionIncrement();
+      final int posLength = posLengthAtt.getPositionLength();
+
+      // NOTE: when posLength > 1, we have a hole... we
+      // don't allow injected tokens to start or end
+      // "inside" a hole, so we don't need to make up
+      // offsets inside it
+
+      assert inputPos != -1 || posInc > 0;
+      inputPos += posInc;
+      if (DEBUG) {
+        System.out.println("  got token term=" + termAtt + " posLength=" + posLength + " posInc=" + posInc + " inputPos=" + inputPos);
+      }
+      final Position posData = positions.get(inputPos);
+      if (posInc == 0) {
+        assert posData.startOffset == offsetAtt.startOffset();
+      } else {
+        assert posData.startOffset == -1;
+        posData.startOffset = offsetAtt.startOffset();
+        if (DEBUG) {
+          System.out.println("    record startOffset[" + inputPos + "]=" + posData.startOffset);
+        }
+      }
+
+      final Position posEndData = positions.get(inputPos + posLength);
+      if (posEndData.endOffset == -1) {
+        // First time we are seeing a token that
+        // arrives to this position: record the
+        // endOffset
+        posEndData.endOffset = offsetAtt.endOffset();
+        if (DEBUG) {
+          System.out.println("    record endOffset[" + (inputPos+posLength) + "]=" + posEndData.endOffset);
+        }
+      } else {
+        // We've already seen a token arriving there;
+        // make sure its endOffset is the same (NOTE:
+        // some tokenizers, eg WDF, will fail
+        // this...):
+        assert posEndData.endOffset == offsetAtt.endOffset(): "posEndData.endOffset=" + posEndData.endOffset + " vs offsetAtt.endOffset()=" + offsetAtt.endOffset();
+      }
+      if (posInc == 0) {
+        return TOKEN_POS.SAME;
+      } else {
+        return TOKEN_POS.NEXT;
+      }
+    } else {
+      if (DEBUG) {
+        System.out.println("  got END");
+      }
+      return TOKEN_POS.END;
+    }
+  }
+
+  private void pushOutputPos() {
+    posIncAtt.setPositionIncrement(outputPos - lastOutputPos);
+    if (DEBUG) {
+      System.out.println("  pushOutputPos: set posInc=" + posIncAtt.getPositionIncrement());
+    }
+    lastOutputPos = outputPos;
+    positions.freeBefore(outputPos);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+
+    if (DEBUG) {
+      System.out.println("MockGraphTF.incr inputPos=" + inputPos + " outputPos=" + outputPos);
+    }
+
+    while (true) {
+      final Position posData = positions.get(outputPos);
+      if (posData.nextRead < posData.states.size()) {
+        // Serve up all buffered tokens from this position:
+        if (DEBUG) {
+          System.out.println("  restore buffered nextRead=" + posData.nextRead + " vs " + posData.states.size());
+        }
+        restoreState(posData.states.get(posData.nextRead++));
+        if (DEBUG) {
+          System.out.println("    term=" + termAtt + " outputPos=" + outputPos);
+        }
+        pushOutputPos();
+        return true;
+      }
+
+      boolean tokenPending = false;
+
+      final int prevInputPos = inputPos;
+
+      if (inputPos == -1 || inputPos == outputPos) {
+        // We've used up the buffered tokens; pull the next
+        // input token:
+        if (end) {
+          return false;
+        }
+        final TOKEN_POS result = nextInputToken();
+        if (result == TOKEN_POS.SAME) {
+          return true;
+        } else if (result == TOKEN_POS.NEXT) {
+          tokenPending = true;
+        } else {
+          // NOTE: we don't set end=true here... because we
+          // are immediately passing through "the end" to
+          // caller (return false), and caller must not call
+          // us again:
+          return false;
+        }
+      } else {
+        assert inputPos > outputPos;
+        if (DEBUG) {
+          System.out.println("  done @ outputPos=" + outputPos);
+        }
+      }
+
+      // We're done (above) serving up all tokens leaving
+      // from the same position; now maybe insert a token.
+      // Note that we may insert more than one token leaving
+      // from this position.  We only inject tokens at
+      // positions where we've seen at least one input token
+      // (ie, we cannot inject inside holes):
+
+      if (prevInputPos != -1 && positions.get(outputPos).startOffset != -1 && random.nextInt(7) == 5) {
+        if (DEBUG) {
+          System.out.println("  inject @ outputPos=" + outputPos);
+        }
+
+        if (tokenPending) {
+          positions.get(inputPos).captureState();
+        }
+        final int posLength = _TestUtil.nextInt(random, 1, 5);
+        final Position posEndData = positions.get(outputPos + posLength);
+
+        // Pull enough tokens until we discover what our
+        // endOffset should be:
+        while (!end && posEndData.endOffset == -1 && inputPos <= (outputPos + posLength)) {
+          if (DEBUG) {
+            System.out.println("  lookahead [endPos=" + (outputPos + posLength) + "]...");
+          }
+          final TOKEN_POS result = nextInputToken();
+          if (result != TOKEN_POS.END) {
+            positions.get(inputPos).captureState();
+          } else {
+            end = true;
+            if (DEBUG) {
+              System.out.println("    force end lookahead");
+            }
+            break;
+          }
+        }
+
+        // TODO: really, here, on hitting end-of-tokens,
+        // we'd like to know the ending "posInc", and allow
+        // our token to extend up until that.  But: a
+        // TokenFilter is not allowed to call end() from
+        // within its incrementToken, so we can't do that.
+        // It may have been better if the ending
+        // posInc/offsets were set when incrementToken
+        // returned false (ie, without having to call the
+        // special end method):
+
+        if (posEndData.endOffset != -1) {
+          assert posEndData.endOffset != -1;
+          clearAttributes();
+          posLengthAtt.setPositionLength(posLength);
+          termAtt.append(_TestUtil.randomUnicodeString(random));
+          pushOutputPos();
+          offsetAtt.setOffset(positions.get(outputPos).startOffset,
+                              positions.get(outputPos + posLength).endOffset);
+          if (DEBUG) {
+            System.out.println("  inject: outputPos=" + outputPos + " startOffset=" + offsetAtt.startOffset() +
+                               " endOffset=" + offsetAtt.endOffset() +
+                               " posLength=" + posLengthAtt.getPositionLength());
+          }
+          // TODO: set TypeAtt too?
+          return true;
+
+        } else {
+          // Either, we hit the end of the tokens (ie, our
+          // attempted posLength is too long because it
+          // hangs out over the end), or, our attempted
+          // posLength ended in the middle of a hole; just
+          // skip injecting in these cases.  We will still
+          // test these cases by having a StopFilter after
+          // MockGraphTokenFilter...
+        }
+
+      } else if (tokenPending) {
+        outputPos = inputPos;
+        if (DEBUG) {
+          System.out.println("  pass-through");
+        }
+        pushOutputPos();
+        return true;
+      } else {
+        // We are skipping over a hole (posInc > 1) from our input:
+        outputPos++;
+        if (DEBUG) {
+          System.out.println("  incr outputPos=" + outputPos);
+        }
+      }
+    }
+  }
+}
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Random;
+
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util._TestUtil;
+
+// Randomly injects holes:
+public final class MockHoleInjectingTokenFilter extends TokenFilter {
+
+  private final long randomSeed;
+  private Random random;
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+  public MockHoleInjectingTokenFilter(Random random, TokenStream in) {
+    super(in);
+    randomSeed = random.nextLong();
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    random = new Random(randomSeed);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      final int posInc = posIncAtt.getPositionIncrement();
+      if (posInc > 0 && random.nextInt(5) == 3) {
+        posIncAtt.setPositionIncrement(posInc + _TestUtil.nextInt(random, 1, 5));
+        // TODO: should we tweak offsets...?
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  // TODO: end?
+}
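
A hole is nothing more than a position increment greater than one. As an illustration (the offsets below are assumed for a one-char-per-token stream, not taken from this patch), "a b c" after a two-position hole is injected before "c" is equivalent to:

Token a = new Token("a", 0, 1);                             // position 0
Token b = new Token("b", 2, 3);                             // position 1
Token c = new Token("c", 4, 5); c.setPositionIncrement(3);  // position 4: the jump of 3 leaves a 2-position hole
TokenStream withHole = new CannedTokenStream(a, b, c);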
@@ -0,0 +1,130 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO: probably move this to core at some point (eg,
+// cutover kuromoji, synfilter, LookaheadTokenFilter)
+
+/** Acts like forever growing T[], but internally uses a
+ *  circular buffer to reuse instances of T.
+ *
+ *  @lucene.internal */
+public abstract class RollingBuffer<T extends RollingBuffer.Resettable> {
+
+  public static interface Resettable {
+    public void reset();
+  }
+
+  @SuppressWarnings("unchecked") private T[] buffer = (T[]) new RollingBuffer.Resettable[8];
+
+  // Next array index to write to:
+  private int nextWrite;
+
+  // Next position to write:
+  private int nextPos;
+
+  // How many valid Position are held in the
+  // array:
+  private int count;
+
+  public RollingBuffer() {
+    for(int idx=0;idx<buffer.length;idx++) {
+      buffer[idx] = newInstance();
+    }
+  }
+
+  protected abstract T newInstance();
+
+  public void reset() {
+    nextWrite--;
+    while (count > 0) {
+      if (nextWrite == -1) {
+        nextWrite = buffer.length - 1;
+      }
+      buffer[nextWrite--].reset();
+      count--;
+    }
+    nextWrite = 0;
+    nextPos = 0;
+    count = 0;
+  }
+
+  // For assert:
+  private boolean inBounds(int pos) {
+    return pos < nextPos && pos >= nextPos - count;
+  }
+
+  private int getIndex(int pos) {
+    int index = nextWrite - (nextPos - pos);
+    if (index < 0) {
+      index += buffer.length;
+    }
+    return index;
+  }
+
+  /** Get T instance for this absolute position;
+   *  this is allowed to be arbitrarily far "in the
+   *  future" but cannot be before the last freeBefore. */
+  public T get(int pos) {
+    //System.out.println("RA.get pos=" + pos + " nextPos=" + nextPos + " nextWrite=" + nextWrite + " count=" + count);
+    while (pos >= nextPos) {
+      if (count == buffer.length) {
+        @SuppressWarnings("unchecked") T[] newBuffer = (T[]) new Resettable[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+        //System.out.println("  grow length=" + newBuffer.length);
+        System.arraycopy(buffer, nextWrite, newBuffer, 0, buffer.length-nextWrite);
+        System.arraycopy(buffer, 0, newBuffer, buffer.length-nextWrite, nextWrite);
+        for(int i=buffer.length;i<newBuffer.length;i++) {
+          newBuffer[i] = newInstance();
+        }
+        nextWrite = buffer.length;
+        buffer = newBuffer;
+      }
+      if (nextWrite == buffer.length) {
+        nextWrite = 0;
+      }
+      // Should have already been reset:
+      nextWrite++;
+      nextPos++;
+      count++;
+    }
+    assert inBounds(pos);
+    final int index = getIndex(pos);
+    //System.out.println("  pos=" + pos + " nextPos=" + nextPos + " -> index=" + index);
+    //assert buffer[index].pos == pos;
+    return buffer[index];
+  }
+
+  public void freeBefore(int pos) {
+    final int toFree = count - (nextPos - pos);
+    assert toFree >= 0;
+    assert toFree <= count: "toFree=" + toFree + " count=" + count;
+    int index = nextWrite - count;
+    if (index < 0) {
+      index += buffer.length;
+    }
+    for(int i=0;i<toFree;i++) {
+      if (index == buffer.length) {
+        index = 0;
+      }
+      //System.out.println("  fb idx=" + index);
+      buffer[index].reset();
+      index++;
+    }
+    count -= toFree;
+  }
+}
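
The subclassing pattern, as used by MockGraphTokenFilter and TestRollingBuffer above: newInstance() supplies pooled instances, get(pos) grows the logical array forward on demand, and freeBefore(pos) recycles everything behind the reader. A minimal sketch, reusing the Position class from TestRollingBuffer:

final RollingBuffer<Position> positions = new RollingBuffer<Position>() {
  @Override
  protected Position newInstance() {
    return new Position();  // Position implements RollingBuffer.Resettable
  }
};
Position p = positions.get(42);  // rotates/grows the circular buffer to reach position 42
positions.freeBefore(40);        // resets and reuses every slot for positions < 40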
@@ -1,17 +1,5 @@
 package org.apache.lucene.analysis.core;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.util.Version;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.Arrays;
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -29,6 +17,20 @@ import java.util.Arrays;
  * limitations under the License.
  */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockGraphTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
 
 public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
 
 public void testHugeDoc() throws IOException {
@@ -247,4 +249,18 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
   public void testRandomHugeStrings() throws Exception {
     checkRandomData(random, new StandardAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
   }
+
+  // Adds random graph after:
+  public void testRandomHugeStringsGraphAfter() throws Exception {
+    checkRandomData(random,
+                    new Analyzer() {
+                      @Override
+                      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+                        Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+                        TokenStream tokenStream = new MockGraphTokenFilter(random, tokenizer);
+                        return new TokenStreamComponents(tokenizer, tokenStream);
+                      }
+                    },
+                    200*RANDOM_MULTIPLIER, 8192);
+  }
 }
@@ -33,6 +33,8 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.MockGraphTokenFilter;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.util.CharsRef;
@@ -430,6 +432,57 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     }
   }
 
+  // Adds MockGraphTokenFilter before SynFilter:
+  public void testRandom2GraphBefore() throws Exception {
+    final int numIters = atLeast(10);
+    for (int i = 0; i < numIters; i++) {
+      b = new SynonymMap.Builder(random.nextBoolean());
+      final int numEntries = atLeast(10);
+      for (int j = 0; j < numEntries; j++) {
+        add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
+      }
+      final SynonymMap map = b.build();
+      final boolean ignoreCase = random.nextBoolean();
+
+      final Analyzer analyzer = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+          TokenStream graph = new MockGraphTokenFilter(random, tokenizer);
+          return new TokenStreamComponents(tokenizer, new SynonymFilter(graph, map, ignoreCase));
+        }
+      };
+
+      checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
+    }
+  }
+
+  // Adds MockGraphTokenFilter after SynFilter:
+  public void testRandom2GraphAfter() throws Exception {
+    final int numIters = atLeast(10);
+    for (int i = 0; i < numIters; i++) {
+      b = new SynonymMap.Builder(random.nextBoolean());
+      final int numEntries = atLeast(10);
+      for (int j = 0; j < numEntries; j++) {
+        add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
+      }
+      final SynonymMap map = b.build();
+      final boolean ignoreCase = random.nextBoolean();
+
+      final Analyzer analyzer = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+          TokenStream syns = new SynonymFilter(tokenizer, map, ignoreCase);
+          TokenStream graph = new MockGraphTokenFilter(random, syns);
+          return new TokenStreamComponents(tokenizer, graph);
+        }
+      };
+
+      checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
+    }
+  }
+
   public void testEmptyTerm() throws IOException {
     final int numIters = atLeast(10);
     for (int i = 0; i < numIters; i++) {
@@ -662,7 +715,6 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     final boolean keepOrig = false;
     // b hangs off the end (no input token under it):
     add("a", "a b", keepOrig);
-    final SynonymMap map = b.build();
     tokensIn = new MockTokenizer(new StringReader("a"),
                                  MockTokenizer.WHITESPACE,
                                  true);
@@ -673,8 +725,8 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     tokensIn.close();
 
     tokensOut = new SynonymFilter(tokensIn,
                                   b.build(),
                                   true);
     termAtt = tokensOut.addAttribute(CharTermAttribute.class);
     posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
     offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
@@ -26,6 +26,7 @@ import java.io.StringReader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockGraphTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
@@ -191,6 +192,20 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
     checkRandomData(random, analyzerNoPunct, 200*RANDOM_MULTIPLIER, 8192);
   }
 
+  public void testRandomHugeStringsMockGraphAfter() throws Exception {
+    // Randomly inject graph tokens after JapaneseTokenizer:
+    checkRandomData(random,
+                    new Analyzer() {
+                      @Override
+                      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+                        Tokenizer tokenizer = new JapaneseTokenizer(reader, readDict(), false, Mode.SEARCH);
+                        TokenStream graph = new MockGraphTokenFilter(random, tokenizer);
+                        return new TokenStreamComponents(tokenizer, graph);
+                      }
+                    },
+                    200*RANDOM_MULTIPLIER, 8192);
+  }
+
   public void testLargeDocReliability() throws Exception {
     for (int i = 0; i < 100; i++) {
       String s = _TestUtil.randomUnicodeString(random, 10000);