From d55447b25e2885c58785776b9f829164512b0441 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Mon, 9 Apr 2012 13:24:23 +0000
Subject: [PATCH 01/40]

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311219 13f79535-47bb-0310-9956-ffa450edef68

From 6311f71de604bc2dda824855fab7834274278b05 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Mon, 9 Apr 2012 13:25:28 +0000
Subject: [PATCH 02/40] LUCENE-3969: commit current state

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311220 13f79535-47bb-0310-9956-ffa450edef68
---
 .../apache/lucene/analysis/MockAnalyzer.java  |   5 +-
 .../lucene/analysis/MockCharFilter.java       |   4 +-
 .../MockFixedLengthPayloadFilter.java         |   3 +
 .../lucene/analysis/MockTokenFilter.java      |   8 +-
 .../analysis/core/KeywordTokenizer.java       |   9 +
 .../analysis/path/PathHierarchyTokenizer.java |  27 +-
 .../path/ReversePathHierarchyTokenizer.java   |  18 +-
 .../analysis/pattern/PatternTokenizer.java    |   4 +
 .../analysis/position/PositionFilter.java     |   3 +
 .../analysis/snowball/SnowballFilter.java     |   2 +-
 .../analysis/core/TestRandomChains.java       | 527 +++++++++++++++---
 .../analysis/snowball/TestSnowball.java       |  16 +-
 12 files changed, 526 insertions(+), 100 deletions(-)

diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
index 642b28f87b1..b1ab2597176 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
@@ -76,7 +76,7 @@ public final class MockAnalyzer extends Analyzer {
    * MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true}).
    */
   public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
-    this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false);
+    this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true);
   }
 
   /**
@@ -93,7 +93,8 @@ public final class MockAnalyzer extends Analyzer {
   public TokenStreamComponents createComponents(String fieldName, Reader reader) {
     MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength);
     tokenizer.setEnableChecks(enableChecks);
-    TokenFilter filt = new MockTokenFilter(tokenizer, filter, enablePositionIncrements);
+    MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
+    filt.setEnablePositionIncrements(enablePositionIncrements);
     return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
   }
 
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
index a488c4be3d5..5a11b97964b 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
@@ -34,7 +34,9 @@ public class MockCharFilter extends CharStream {
     // TODO: instead of fixed remainder... maybe a fixed
     // random seed?
     this.remainder = remainder;
-    assert remainder >= 0 && remainder < 10 : "invalid parameter";
+    if (remainder < 0 || remainder >= 10) {
+      throw new IllegalArgumentException("invalid remainder parameter (must be 0..9): " + remainder);
+    }
   }
 
   // for testing only, uses a remainder of 0
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java
index 74e233924ee..bbe2f37fa58 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockFixedLengthPayloadFilter.java
@@ -34,6 +34,9 @@ public final class MockFixedLengthPayloadFilter extends TokenFilter {
 
   public MockFixedLengthPayloadFilter(Random random, TokenStream in, int length) {
     super(in);
+    if (length < 0) {
+      throw new IllegalArgumentException("length must be >= 0");
+    }
     this.random = random;
     this.bytes = new byte[length];
     this.payload = new Payload(bytes);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java
index 97863a40bd3..efc7633f6ce 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java
@@ -55,7 +55,7 @@ public final class MockTokenFilter extends TokenFilter {
       makeString("with"))));
 
   private final CharacterRunAutomaton filter;
-  private boolean enablePositionIncrements = false;
+  private boolean enablePositionIncrements = true;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@@ -67,14 +67,16 @@
    * @param filter DFA representing the terms that should be removed.
    * @param enablePositionIncrements true if the removal should accumulate position increments.
*/ - public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter, boolean enablePositionIncrements) { + public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter) { super(input); this.filter = filter; - this.enablePositionIncrements = enablePositionIncrements; } @Override public boolean incrementToken() throws IOException { + // TODO: fix me when posInc=false, to work like FilteringTokenFilter in that case and not return + // initial token with posInc=0 ever + // return the first non-stop word found int skippedPositions = 0; while (input.incrementToken()) { diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java index c9d73ef9669..44ee0842872 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java @@ -43,16 +43,25 @@ public final class KeywordTokenizer extends Tokenizer { public KeywordTokenizer(Reader input, int bufferSize) { super(input); + if (bufferSize <= 0) { + throw new IllegalArgumentException("bufferSize must be > 0"); + } termAtt.resizeBuffer(bufferSize); } public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) { super(source, input); + if (bufferSize <= 0) { + throw new IllegalArgumentException("bufferSize must be > 0"); + } termAtt.resizeBuffer(bufferSize); } public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) { super(factory, input); + if (bufferSize <= 0) { + throw new IllegalArgumentException("bufferSize must be > 0"); + } termAtt.resizeBuffer(bufferSize); } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java index 26b5b1d3a28..c4450f4878d 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java @@ -65,6 +65,12 @@ public class PathHierarchyTokenizer extends Tokenizer { public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) { super(input); + if (bufferSize < 0) { + throw new IllegalArgumentException("bufferSize cannot be negative"); + } + if (skip < 0) { + throw new IllegalArgumentException("skip cannot be negative"); + } termAtt.resizeBuffer(bufferSize); this.delimiter = delimiter; @@ -85,10 +91,11 @@ public class PathHierarchyTokenizer extends Tokenizer { private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); private int startPosition = 0; - private int finalOffset = 0; private int skipped = 0; private boolean endDelimiter = false; private StringBuilder resultToken; + + private int charsRead = 0; @Override @@ -112,12 +119,13 @@ public class PathHierarchyTokenizer extends Tokenizer { while (true) { int c = input.read(); - if( c < 0 ){ + if (c >= 0) { + charsRead++; + } else { if( skipped > skip ) { length += resultToken.length(); termAtt.setLength(length); - finalOffset = correctOffset(startPosition + length); - offsetAtt.setOffset(correctOffset(startPosition), finalOffset); + offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + length)); if( added ){ 
           resultToken.setLength(0);
           resultToken.append(termAtt.buffer(), 0, length);
@@ -125,7 +133,6 @@ public class PathHierarchyTokenizer extends Tokenizer {
           return added;
         }
         else{
-          finalOffset = correctOffset(startPosition + length);
           return false;
         }
       }
@@ -168,8 +175,7 @@ public class PathHierarchyTokenizer extends Tokenizer {
     }
     length += resultToken.length();
     termAtt.setLength(length);
-    finalOffset = correctOffset(startPosition + length);
-    offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
+    offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition+length));
     resultToken.setLength(0);
     resultToken.append(termAtt.buffer(), 0, length);
     return true;
@@ -178,14 +184,15 @@ public class PathHierarchyTokenizer extends Tokenizer {
   @Override
   public final void end() {
     // set final offset
+    int finalOffset = correctOffset(charsRead);
     offsetAtt.setOffset(finalOffset, finalOffset);
   }
 
   @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
+  public void reset() throws IOException {
+    super.reset();
     resultToken.setLength(0);
-    finalOffset = 0;
+    charsRead = 0;
     endDelimiter = false;
     skipped = 0;
   }
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
index fc8a6831742..759c48c7cd6 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
@@ -77,6 +77,13 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
 
   public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
     super(input);
+    if (bufferSize < 0) {
+      throw new IllegalArgumentException("bufferSize cannot be negative");
+    }
+    if (skip < 0) {
+      // nocommit: not quite right here: see line 84... if skip > numTokensFound we always get a NegativeArraySizeException? needs fixing!
+ throw new IllegalArgumentException("skip cannot be negative"); + } termAtt.resizeBuffer(bufferSize); this.delimiter = delimiter; this.replacement = replacement; @@ -137,7 +144,11 @@ public class ReversePathHierarchyTokenizer extends Tokenizer { } resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0); resultToken.setLength(0); - endPosition = delimiterPositions.get(delimitersCount-1 - skip); + int idx = delimitersCount-1 - skip; + if (idx >= 0) { + // otherwise its ok, because we will skip and return false + endPosition = delimiterPositions.get(idx); + } finalOffset = correctOffset(length); posAtt.setPositionIncrement(1); } @@ -163,10 +174,11 @@ public class ReversePathHierarchyTokenizer extends Tokenizer { } @Override - public void reset(Reader input) throws IOException { - super.reset(input); + public void reset() throws IOException { + super.reset(); resultToken.setLength(0); finalOffset = 0; + endPosition = 0; skipped = 0; delimitersCount = -1; delimiterPositions.clear(); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java index 3d43d17dced..6aca0c5edd8 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java @@ -71,6 +71,10 @@ public final class PatternTokenizer extends Tokenizer { this.group = group; fillBuffer(str, input); matcher = pattern.matcher(str); + // confusingly group count depends ENTIRELY on the pattern but is only accessible via matcher + if (group >= 0 && group > matcher.groupCount()) { + throw new IllegalArgumentException("invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups"); + } index = 0; } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java index 97f5fefbbb1..04737ed0cd2 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java @@ -57,6 +57,9 @@ public final class PositionFilter extends TokenFilter { */ public PositionFilter(final TokenStream input, final int positionIncrement) { super(input); + if (positionIncrement < 0) { + throw new IllegalArgumentException("positionIncrement may not be negative"); + } this.positionIncrement = positionIncrement; } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java index c69d4707bb4..7a2639e70f3 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java @@ -67,7 +67,7 @@ public final class SnowballFilter extends TokenFilter { Class.forName("org.tartarus.snowball.ext." 
+ name + "Stemmer").asSubclass(SnowballProgram.class); stemmer = stemClass.newInstance(); } catch (Exception e) { - throw new RuntimeException(e.toString()); + throw new IllegalArgumentException("Invalid stemmer class specified: " + name, e); } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index b40022a7384..d9759ef709e 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -18,17 +18,26 @@ package org.apache.lucene.analysis.core; */ import java.io.File; +import java.io.InputStream; import java.io.Reader; import java.io.StringReader; import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Modifier; import java.net.URL; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.Enumeration; +import java.util.HashSet; import java.util.List; import java.util.Random; +import java.util.Set; +import java.util.Map; +import java.util.IdentityHashMap; +import java.util.regex.Pattern; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; @@ -36,67 +45,113 @@ import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.EmptyTokenizer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.charfilter.NormalizeCharMap; +import org.apache.lucene.analysis.commongrams.CommonGramsFilter; +import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; +import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter; +import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; +import org.apache.lucene.analysis.hunspell.HunspellDictionary; +import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest; +import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; import org.apache.lucene.analysis.ngram.NGramTokenFilter; import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.apache.lucene.analysis.payloads.IdentityEncoder; +import org.apache.lucene.analysis.payloads.PayloadEncoder; +import org.apache.lucene.analysis.snowball.TestSnowball; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.analysis.util.CharArrayMap; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.AttributeSource.AttributeFactory; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.Rethrow; import org.apache.lucene.util.Version; +import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.junit.AfterClass; import org.junit.BeforeClass; +import org.tartarus.snowball.SnowballProgram; +import 
org.xml.sax.InputSource; /** tests random analysis chains */ public class TestRandomChains extends BaseTokenStreamTestCase { - static List> tokenizers; - static List> tokenfilters; - static List> charfilters; + static List> tokenizers; + static List> tokenfilters; + static List> charfilters; @BeforeClass public static void beforeClass() throws Exception { List> analysisClasses = new ArrayList>(); getClassesForPackage("org.apache.lucene.analysis", analysisClasses); - tokenizers = new ArrayList>(); - tokenfilters = new ArrayList>(); - charfilters = new ArrayList>(); - for (Class c : analysisClasses) { - // don't waste time with abstract classes or deprecated known-buggy ones + tokenizers = new ArrayList>(); + tokenfilters = new ArrayList>(); + charfilters = new ArrayList>(); + for (final Class c : analysisClasses) { final int modifiers = c.getModifiers(); - if (Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers) - || c.getAnnotation(Deprecated.class) != null - || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface() - // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt - || c.equals(EmptyTokenizer.class) - // doesn't actual reset itself! - || c.equals(CachingTokenFilter.class) - // broken! - || c.equals(NGramTokenizer.class) - // broken! - || c.equals(NGramTokenFilter.class) - // broken! - || c.equals(EdgeNGramTokenizer.class) - // broken! - || c.equals(EdgeNGramTokenFilter.class)) { + if ( + // don't waste time with abstract classes or deprecated known-buggy ones + Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers) + || c.isAnnotationPresent(Deprecated.class) + || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface() + || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharStream.class.isAssignableFrom(c)) + // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt + || c == EmptyTokenizer.class + // doesn't actual reset itself! + || c == CachingTokenFilter.class + // doesn't consume whole stream! + || c == LimitTokenCountFilter.class + // broken! + || c == NGramTokenizer.class + // broken! + || c == NGramTokenFilter.class + // broken! + || c == EdgeNGramTokenizer.class + // broken! 
+ || c == EdgeNGramTokenFilter.class + ) { continue; } - if (Tokenizer.class.isAssignableFrom(c)) { - tokenizers.add(c.asSubclass(Tokenizer.class)); - } else if (TokenFilter.class.isAssignableFrom(c)) { - tokenfilters.add(c.asSubclass(TokenFilter.class)); - } else if (CharStream.class.isAssignableFrom(c)) { - charfilters.add(c.asSubclass(CharStream.class)); + for (final Constructor ctor : c.getConstructors()) { + // don't test deprecated ctors, they likely have known bugs: + if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) { + continue; + } + if (Tokenizer.class.isAssignableFrom(c)) { + assertTrue(ctor.toGenericString() + " has unsupported parameter types", + allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); + tokenizers.add(castConstructor(Tokenizer.class, ctor)); + } else if (TokenFilter.class.isAssignableFrom(c)) { + assertTrue(ctor.toGenericString() + " has unsupported parameter types", + allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); + tokenfilters.add(castConstructor(TokenFilter.class, ctor)); + } else if (CharStream.class.isAssignableFrom(c)) { + assertTrue(ctor.toGenericString() + " has unsupported parameter types", + allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); + charfilters.add(castConstructor(CharStream.class, ctor)); + } else { + fail("Cannot get here"); + } } } - final Comparator> classComp = new Comparator>() { + + final Comparator> ctorComp = new Comparator>() { @Override - public int compare(Class arg0, Class arg1) { - return arg0.getName().compareTo(arg1.getName()); + public int compare(Constructor arg0, Constructor arg1) { + return arg0.toGenericString().compareTo(arg1.toGenericString()); } }; - Collections.sort(tokenizers, classComp); - Collections.sort(tokenfilters, classComp); - Collections.sort(charfilters, classComp); + Collections.sort(tokenizers, ctorComp); + Collections.sort(tokenfilters, ctorComp); + Collections.sort(charfilters, ctorComp); + if (VERBOSE) { System.out.println("tokenizers = " + tokenizers); System.out.println("tokenfilters = " + tokenfilters); @@ -111,6 +166,304 @@ public class TestRandomChains extends BaseTokenStreamTestCase { charfilters = null; } + /** Hack to work around the stupidness of Oracle's strict Java backwards compatibility. + * {@code Class#getConstructors()} should return unmodifiable {@code List>} not array! */ + @SuppressWarnings("unchecked") + private static Constructor castConstructor(Class instanceClazz, Constructor ctor) { + return (Constructor) ctor; + } + + private static interface ArgProducer { + Object create(Random random); + } + + private static final Map,ArgProducer> argProducers = new IdentityHashMap,ArgProducer>() {{ + put(int.class, new ArgProducer() { + @Override public Object create(Random random) { + // TODO: could cause huge ram usage to use full int range for some filters + // (e.g. 
allocate enormous arrays) + // return Integer.valueOf(random.nextInt()); + return Integer.valueOf(_TestUtil.nextInt(random, -100, 100)); + } + }); + put(char.class, new ArgProducer() { + @Override public Object create(Random random) { + return Character.valueOf((char)random.nextInt(65536)); + } + }); + put(float.class, new ArgProducer() { + @Override public Object create(Random random) { + return Float.valueOf(random.nextFloat()); + } + }); + put(boolean.class, new ArgProducer() { + @Override public Object create(Random random) { + return Boolean.valueOf(random.nextBoolean()); + } + }); + put(byte.class, new ArgProducer() { + @Override public Object create(Random random) { + byte bytes[] = new byte[1]; + random.nextBytes(bytes); + return Byte.valueOf(bytes[0]); + } + }); + put(byte[].class, new ArgProducer() { + @Override public Object create(Random random) { + byte bytes[] = new byte[random.nextInt(256)]; + random.nextBytes(bytes); + return bytes; + } + }); + put(Random.class, new ArgProducer() { + @Override public Object create(Random random) { + return new Random(random.nextLong()); + } + }); + put(Version.class, new ArgProducer() { + @Override public Object create(Random random) { + // we expect bugs in emulating old versions + return TEST_VERSION_CURRENT; + } + }); + put(Set.class, new ArgProducer() { + @Override public Object create(Random random) { + // TypeTokenFilter + Set set = new HashSet(); + int num = random.nextInt(5); + for (int i = 0; i < num; i++) { + set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]); + } + return set; + } + }); + put(Collection.class, new ArgProducer() { + @Override public Object create(Random random) { + // CapitalizationFilter + Collection col = new ArrayList(); + int num = random.nextInt(5); + for (int i = 0; i < num; i++) { + col.add(_TestUtil.randomSimpleString(random).toCharArray()); + } + return col; + } + }); + put(CharArraySet.class, new ArgProducer() { + @Override public Object create(Random random) { + int num = random.nextInt(10); + CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, num, random.nextBoolean()); + for (int i = 0; i < num; i++) { + // TODO: make nastier + set.add(_TestUtil.randomSimpleString(random)); + } + return set; + } + }); + put(Pattern.class, new ArgProducer() { + @Override public Object create(Random random) { + // TODO: don't want to make the exponentially slow ones Dawid documents + // in TestPatternReplaceFilter, so dont use truly random patterns (for now) + return Pattern.compile("a"); + } + }); + put(PayloadEncoder.class, new ArgProducer() { + @Override public Object create(Random random) { + return new IdentityEncoder(); // the other encoders will throw exceptions if tokens arent numbers? 
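+        // (IdentityEncoder wraps the raw term bytes untouched; the numeric encoders
+        // such as FloatEncoder parse the token text, so random terms would fail there)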
+ } + }); + put(HunspellDictionary.class, new ArgProducer() { + @Override public Object create(Random random) { + // TODO: make nastier + InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff"); + InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic"); + try { + return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT); + } catch (Exception ex) { + throw new RuntimeException(ex); + } + } + }); + put(HyphenationTree.class, new ArgProducer() { + @Override public Object create(Random random) { + // TODO: make nastier + try { + InputSource is = new InputSource(TestCompoundWordTokenFilter.class.getResource("da_UTF8.xml").toExternalForm()); + HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); + return hyphenator; + } catch (Exception ex) { + Rethrow.rethrow(ex); + return null; // unreachable code + } + } + }); + put(SnowballProgram.class, new ArgProducer() { + @Override public Object create(Random random) { + try { + String lang = TestSnowball.SNOWBALL_LANGS[random.nextInt(TestSnowball.SNOWBALL_LANGS.length)]; + Class clazz = Class.forName("org.tartarus.snowball.ext." + lang + "Stemmer").asSubclass(SnowballProgram.class); + return clazz.newInstance(); + } catch (Exception ex) { + Rethrow.rethrow(ex); + return null; // unreachable code + } + } + }); + put(String.class, new ArgProducer() { + @Override public Object create(Random random) { + // TODO: make nastier + if (random.nextBoolean()) { + // a token type + return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]; + } else { + return _TestUtil.randomSimpleString(random); + } + } + }); + put(NormalizeCharMap.class, new ArgProducer() { + @Override public Object create(Random random) { + NormalizeCharMap map = new NormalizeCharMap(); + // we can't add duplicate keys, or NormalizeCharMap gets angry + Set keys = new HashSet(); + int num = random.nextInt(5); + for (int i = 0; i < num; i++) { + String key = _TestUtil.randomSimpleString(random); + if (!keys.contains(key)) { + map.add(key,_TestUtil.randomSimpleString(random)); + keys.add(key); + } + } + return map; + } + }); + put(CharacterRunAutomaton.class, new ArgProducer() { + @Override public Object create(Random random) { + // TODO: could probably use a purely random automaton + switch(random.nextInt(5)) { + case 0: return MockTokenizer.KEYWORD; + case 1: return MockTokenizer.SIMPLE; + case 2: return MockTokenizer.WHITESPACE; + case 3: return MockTokenFilter.EMPTY_STOPSET; + default: return MockTokenFilter.ENGLISH_STOPSET; + } + } + }); + put(CharArrayMap.class, new ArgProducer() { + @Override public Object create(Random random) { + int num = random.nextInt(10); + CharArrayMap map = new CharArrayMap(TEST_VERSION_CURRENT, num, random.nextBoolean()); + for (int i = 0; i < num; i++) { + // TODO: make nastier + map.put(_TestUtil.randomSimpleString(random), _TestUtil.randomSimpleString(random)); + } + return map; + } + }); + put(SynonymMap.class, new ArgProducer() { + @Override public Object create(Random random) { + SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean()); + final int numEntries = atLeast(10); + for (int j = 0; j < numEntries; j++) { + addSyn(b, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean()); + } + try { + return b.build(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private void addSyn(SynonymMap.Builder b, String input, String output, boolean keepOrig) { + 
b.add(new CharsRef(input.replaceAll(" +", "\u0000")), + new CharsRef(output.replaceAll(" +", "\u0000")), + keepOrig); + } + + private String randomNonEmptyString(Random random) { + while(true) { + final String s = _TestUtil.randomUnicodeString(random).trim(); + if (s.length() != 0 && s.indexOf('\u0000') == -1) { + return s; + } + } + } + }); + }}; + + static final Set> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs; + static { + allowedTokenizerArgs = Collections.newSetFromMap(new IdentityHashMap,Boolean>()); + allowedTokenizerArgs.addAll(argProducers.keySet()); + allowedTokenizerArgs.add(Reader.class); + allowedTokenizerArgs.add(AttributeFactory.class); + allowedTokenizerArgs.add(AttributeSource.class); + + allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap,Boolean>()); + allowedTokenFilterArgs.addAll(argProducers.keySet()); + allowedTokenFilterArgs.add(TokenStream.class); + allowedTokenFilterArgs.add(CommonGramsFilter.class); + + allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap,Boolean>()); + allowedCharFilterArgs.addAll(argProducers.keySet()); + allowedCharFilterArgs.add(Reader.class); + allowedCharFilterArgs.add(CharStream.class); + } + + @SuppressWarnings("unchecked") + static T createRandomArg(Random random, Class paramType) { + final ArgProducer producer = argProducers.get(paramType); + assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer); + return (T) producer.create(random); + } + + static Object[] newTokenizerArgs(Random random, Reader reader, Class[] paramTypes) { + Object[] args = new Object[paramTypes.length]; + for (int i = 0; i < args.length; i++) { + Class paramType = paramTypes[i]; + if (paramType == Reader.class) { + args[i] = reader; + } else if (paramType == AttributeFactory.class) { + // TODO: maybe the collator one...??? 
+ args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY; + } else if (paramType == AttributeSource.class) { + args[i] = null; // this always gives IAE: fine + } else { + args[i] = createRandomArg(random, paramType); + } + } + return args; + } + + static Object[] newCharFilterArgs(Random random, Reader reader, Class[] paramTypes) { + Object[] args = new Object[paramTypes.length]; + for (int i = 0; i < args.length; i++) { + Class paramType = paramTypes[i]; + if (paramType == Reader.class) { + args[i] = reader; + } else if (paramType == CharStream.class) { + args[i] = CharReader.get(reader); + } else { + args[i] = createRandomArg(random, paramType); + } + } + return args; + } + + static Object[] newFilterArgs(Random random, TokenStream stream, Class[] paramTypes) { + Object[] args = new Object[paramTypes.length]; + for (int i = 0; i < args.length; i++) { + Class paramType = paramTypes[i]; + if (paramType == TokenStream.class) { + args[i] = stream; + } else if (paramType == CommonGramsFilter.class) { + // CommonGramsQueryFilter takes this one explicitly + args[i] = new CommonGramsFilter(TEST_VERSION_CURRENT, stream, createRandomArg(random, CharArraySet.class)); + } else { + args[i] = createRandomArg(random, paramType); + } + } + return args; + } + static class MockRandomAnalyzer extends Analyzer { final long seed; @@ -123,6 +476,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase { Random random = new Random(seed); TokenizerSpec tokenizerspec = newTokenizer(random, reader); TokenFilterSpec filterspec = newFilterChain(random, tokenizerspec.tokenizer); + //System.out.println("seed=" + seed + ",tokenizerSpec=" + tokenizerspec.toString); + //System.out.println("seed=" + seed + ",tokenfilterSpec=" + filterspec.toString); return new TokenStreamComponents(tokenizerspec.tokenizer, filterspec.stream); } @@ -130,6 +485,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { protected Reader initReader(Reader reader) { Random random = new Random(seed); CharFilterSpec charfilterspec = newCharFilterChain(random, reader); + //System.out.println("seed=" + seed + ",charFilterSpec=" + charfilterspec.toString); return charfilterspec.reader; } @@ -159,20 +515,27 @@ public class TestRandomChains extends BaseTokenStreamTestCase { boolean success = false; while (!success) { try { - // TODO: check Reader+Version,Version+Reader too - // also look for other variants and handle them special - int idx = random.nextInt(tokenizers.size()); - try { - Constructor c = tokenizers.get(idx).getConstructor(Version.class, Reader.class); - spec.tokenizer = c.newInstance(TEST_VERSION_CURRENT, reader); - } catch (NoSuchMethodException e) { - Constructor c = tokenizers.get(idx).getConstructor(Reader.class); - spec.tokenizer = c.newInstance(reader); - } - spec.toString = tokenizers.get(idx).toString(); + final Constructor ctor = tokenizers.get(random.nextInt(tokenizers.size())); + final Object args[] = newTokenizerArgs(random, reader, ctor.getParameterTypes()); + spec.tokenizer = ctor.newInstance(args); + spec.toString = ctor.getDeclaringClass().getName() + ("(" + Arrays.toString(args) + ")"); success = true; - } catch (Exception e) { - // ignore + } catch (InvocationTargetException ite) { + final Throwable cause = ite.getCause(); + if (cause instanceof IllegalArgumentException || + cause instanceof UnsupportedOperationException) { + // thats ok, ignore + if (VERBOSE) { + System.err.println("Ignoring IAE/UOE from ctor:"); + cause.printStackTrace(System.err); + } + } else { + Rethrow.rethrow(cause); + } + } 
catch (IllegalAccessException iae) { + Rethrow.rethrow(iae); + } catch (InstantiationException ie) { + Rethrow.rethrow(ie); } } return spec; @@ -187,23 +550,32 @@ public class TestRandomChains extends BaseTokenStreamTestCase { boolean success = false; while (!success) { try { - // TODO: also look for other variants and handle them special - int idx = random.nextInt(charfilters.size()); - try { - Constructor c = charfilters.get(idx).getConstructor(Reader.class); - spec.reader = c.newInstance(spec.reader); - } catch (NoSuchMethodException e) { - Constructor c = charfilters.get(idx).getConstructor(CharStream.class); - spec.reader = c.newInstance(CharReader.get(spec.reader)); - } + final Constructor ctor = charfilters.get(random.nextInt(charfilters.size())); + final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes()); + spec.reader = ctor.newInstance(args); if (descr.length() > 0) { descr.append(","); } - descr.append(charfilters.get(idx).toString()); + descr.append(ctor.getDeclaringClass().getName()); + descr.append("(" + Arrays.toString(args) + ")"); success = true; - } catch (Exception e) { - // ignore + } catch (InvocationTargetException ite) { + final Throwable cause = ite.getCause(); + if (cause instanceof IllegalArgumentException || + cause instanceof UnsupportedOperationException) { + // thats ok, ignore + if (VERBOSE) { + System.err.println("Ignoring IAE/UOE from ctor:"); + cause.printStackTrace(System.err); + } + } else { + Rethrow.rethrow(cause); + } + } catch (IllegalAccessException iae) { + Rethrow.rethrow(iae); + } catch (InstantiationException ie) { + Rethrow.rethrow(ie); } } } @@ -220,22 +592,31 @@ public class TestRandomChains extends BaseTokenStreamTestCase { boolean success = false; while (!success) { try { - // TODO: also look for other variants and handle them special - int idx = random.nextInt(tokenfilters.size()); - try { - Constructor c = tokenfilters.get(idx).getConstructor(Version.class, TokenStream.class); - spec.stream = c.newInstance(TEST_VERSION_CURRENT, spec.stream); - } catch (NoSuchMethodException e) { - Constructor c = tokenfilters.get(idx).getConstructor(TokenStream.class); - spec.stream = c.newInstance(spec.stream); - } + final Constructor ctor = tokenfilters.get(random.nextInt(tokenfilters.size())); + final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes()); + spec.stream = ctor.newInstance(args); if (descr.length() > 0) { descr.append(","); } - descr.append(tokenfilters.get(idx).toString()); + descr.append(ctor.getDeclaringClass().getName()); + descr.append("(" + Arrays.toString(args) + ")"); success = true; - } catch (Exception e) { - // ignore + } catch (InvocationTargetException ite) { + final Throwable cause = ite.getCause(); + if (cause instanceof IllegalArgumentException || + cause instanceof UnsupportedOperationException) { + // thats ok, ignore + if (VERBOSE) { + System.err.println("Ignoring IAE/UOE from ctor:"); + cause.printStackTrace(System.err); + } + } else { + Rethrow.rethrow(cause); + } + } catch (IllegalAccessException iae) { + Rethrow.rethrow(iae); + } catch (InstantiationException ie) { + Rethrow.rethrow(ie); } } } @@ -263,7 +644,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { int numIterations = atLeast(20); for (int i = 0; i < numIterations; i++) { MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong()); - if (VERBOSE) { + if (true || VERBOSE) { System.out.println("Creating random analyzer:" + a); } try { diff --git 
a/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java index 36bc26233a1..7791fb44e67 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java @@ -142,14 +142,16 @@ public class TestSnowball extends BaseTokenStreamTestCase { } } + /** for testing purposes ONLY */ + public static String SNOWBALL_LANGS[] = { + "Armenian", "Basque", "Catalan", "Danish", "Dutch", "English", + "Finnish", "French", "German2", "German", "Hungarian", "Irish", + "Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese", + "Romanian", "Russian", "Spanish", "Swedish", "Turkish" + }; + public void testEmptyTerm() throws IOException { - String langs[] = { - "Armenian", "Basque", "Catalan", "Danish", "Dutch", "English", - "Finnish", "French", "German2", "German", "Hungarian", "Irish", - "Italian", "Kp", "Lovins", "Norwegian", "Porter", "Portuguese", - "Romanian", "Russian", "Spanish", "Swedish", "Turkish" - }; - for (final String lang : langs) { + for (final String lang : SNOWBALL_LANGS) { Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { From f63af6afe58bad12e45d14a69c2f1d324318b7e8 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 9 Apr 2012 13:44:18 +0000 Subject: [PATCH 03/40] LUCENE-3969: don't be this evil yet for type char git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311235 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/analysis/core/TestRandomChains.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index d9759ef709e..8bec640b6b9 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -188,7 +188,14 @@ public class TestRandomChains extends BaseTokenStreamTestCase { }); put(char.class, new ArgProducer() { @Override public Object create(Random random) { - return Character.valueOf((char)random.nextInt(65536)); + // nocommit: fix any filters that care to throw IAE instead. 
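+        // (a lone char in U+D800-U+DFFF is an unpaired surrogate, not a valid
+        // code point by itself, so for now don't generate any chars in that range)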
+        // return Character.valueOf((char)random.nextInt(65536));
+        while(true) {
+          char c = (char)random.nextInt(65536);
+          if (c < '\uD800' || c > '\uDFFF') {
+            return Character.valueOf(c);
+          }
+        }
       }
     });
     put(float.class, new ArgProducer() {

From 24f8a9e627acfffe1caf5c7a60c3c2068dbf4e71 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Mon, 9 Apr 2012 14:16:35 +0000
Subject: [PATCH 04/40] LUCENE-3969: disable PositionFilter for now

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311241 13f79535-47bb-0310-9956-ffa450edef68
---
 .../test/org/apache/lucene/analysis/core/TestRandomChains.java | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 8bec640b6b9..79db9cedec7 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -64,6 +64,7 @@ import org.apache.lucene.analysis.ngram.NGramTokenFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
 import org.apache.lucene.analysis.payloads.IdentityEncoder;
 import org.apache.lucene.analysis.payloads.PayloadEncoder;
+import org.apache.lucene.analysis.position.PositionFilter;
 import org.apache.lucene.analysis.snowball.TestSnowball;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.synonym.SynonymMap;
@@ -106,6 +107,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         || c == EmptyTokenizer.class
         // doesn't actually reset itself!
         || c == CachingTokenFilter.class
+        // nocommit: corrupts graphs (offset consistency check)
+        || c == PositionFilter.class
         // doesn't consume whole stream!
         || c == LimitTokenCountFilter.class
         // broken!
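The instantiation loop these patches keep refining boils down to one retry idiom: reflectively invoke a randomly chosen constructor, and treat an IllegalArgumentException or UnsupportedOperationException thrown by the component as "these random arguments were invalid, draw again" rather than as a test failure. A minimal self-contained sketch of that idiom, using plain reflection only (the names below are illustrative, not the patch's exact code):

import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Random;

public class RandomCtorSketch {

  /** Supplies argument values matching a constructor's parameter types. */
  interface ArgBuilder {
    Object[] build(Class<?>[] paramTypes);
  }

  /** Draws random constructors until one accepts the args generated for it. */
  static <T> T newRandomInstance(Random random,
                                 List<Constructor<? extends T>> ctors,
                                 ArgBuilder argBuilder) throws Exception {
    while (true) {
      Constructor<? extends T> ctor = ctors.get(random.nextInt(ctors.size()));
      Object[] args = argBuilder.build(ctor.getParameterTypes());
      try {
        return ctor.newInstance(args);
      } catch (InvocationTargetException ite) {
        Throwable cause = ite.getCause();
        if (!(cause instanceof IllegalArgumentException
            || cause instanceof UnsupportedOperationException)) {
          throw ite; // a real bug in the component: surface it
        }
        // the random args were simply invalid for this component: retry
      }
    }
  }
}

Anything else escaping a constructor is rethrown, which is exactly why the surrounding patches keep replacing asserts and raw RuntimeExceptions with explicit IllegalArgumentExceptions for bad parameters.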
From ac393486e0e6d5a74b88cd6f98881dac15146db2 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 9 Apr 2012 14:31:25 +0000 Subject: [PATCH 05/40] LUCENE-3969: don't allow negative subword params, Hyphenation relies upon this to filter out what appear to be bogus hyphenation points git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311257 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/compound/CompoundWordTokenFilterBase.java | 9 +++++++++ .../compound/HyphenationCompoundWordTokenFilter.java | 2 ++ 2 files changed, 11 insertions(+) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java index 3b3fae9ca76..909ef5ef1a2 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java @@ -82,8 +82,17 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { super(input); this.tokens=new LinkedList(); + if (minWordSize < 0) { + throw new IllegalArgumentException("minWordSize cannot be negative"); + } this.minWordSize=minWordSize; + if (minSubwordSize < 0) { + throw new IllegalArgumentException("minSubwordSize cannot be negative"); + } this.minSubwordSize=minSubwordSize; + if (maxSubwordSize < 0) { + throw new IllegalArgumentException("maxSubwordSize cannot be negative"); + } this.maxSubwordSize=maxSubwordSize; this.onlyLongestMatch=onlyLongestMatch; this.dictionary = dictionary; diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java index 935c607c3de..a71352db1f7 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java @@ -191,6 +191,8 @@ public class HyphenationCompoundWordTokenFilter extends // we only put subwords to the token stream // that are longer than minPartSize if (partLength < this.minSubwordSize) { + // nocommit/BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the + // calculation above, and we rely upon minSubwordSize being >=0 to filter them out... 
continue; } From 214ab39f68c7e6fbd92048af5e15a5cabc2ab5dc Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Mon, 9 Apr 2012 15:15:11 +0000 Subject: [PATCH 06/40] LUCENE-3969: Minor cleanups and code consistency git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311278 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/core/TestRandomChains.java | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 79db9cedec7..9b9b630d882 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -289,7 +289,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase { try { return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT); } catch (Exception ex) { - throw new RuntimeException(ex); + Rethrow.rethrow(ex); + return null; // unreachable code } } }); @@ -377,8 +378,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } try { return b.build(); - } catch (Exception e) { - throw new RuntimeException(e); + } catch (Exception ex) { + Rethrow.rethrow(ex); + return null; // unreachable code } } @@ -410,6 +412,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap,Boolean>()); allowedTokenFilterArgs.addAll(argProducers.keySet()); allowedTokenFilterArgs.add(TokenStream.class); + // TODO: fix this one, thats broken: allowedTokenFilterArgs.add(CommonGramsFilter.class); allowedCharFilterArgs = Collections.newSetFromMap(new IdentityHashMap,Boolean>()); @@ -419,7 +422,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } @SuppressWarnings("unchecked") - static T createRandomArg(Random random, Class paramType) { + static T newRandomArg(Random random, Class paramType) { final ArgProducer producer = argProducers.get(paramType); assertNotNull("No producer for arguments of type " + paramType.getName() + " found", producer); return (T) producer.create(random); @@ -435,9 +438,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase { // TODO: maybe the collator one...??? 
args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY; } else if (paramType == AttributeSource.class) { - args[i] = null; // this always gives IAE: fine + args[i] = new AttributeSource(); } else { - args[i] = createRandomArg(random, paramType); + args[i] = newRandomArg(random, paramType); } } return args; @@ -452,7 +455,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } else if (paramType == CharStream.class) { args[i] = CharReader.get(reader); } else { - args[i] = createRandomArg(random, paramType); + args[i] = newRandomArg(random, paramType); } } return args; @@ -465,10 +468,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase { if (paramType == TokenStream.class) { args[i] = stream; } else if (paramType == CommonGramsFilter.class) { - // CommonGramsQueryFilter takes this one explicitly - args[i] = new CommonGramsFilter(TEST_VERSION_CURRENT, stream, createRandomArg(random, CharArraySet.class)); + // TODO: fix this one, thats broken: CommonGramsQueryFilter takes this one explicitly + args[i] = new CommonGramsFilter(TEST_VERSION_CURRENT, stream, newRandomArg(random, CharArraySet.class)); } else { - args[i] = createRandomArg(random, paramType); + args[i] = newRandomArg(random, paramType); } } return args; From 102ece7710eab541afa68c7614151f361a3692fa Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Mon, 9 Apr 2012 15:32:08 +0000 Subject: [PATCH 07/40] LUCENE-3969: More cleanups git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311282 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/core/TestRandomChains.java | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 9b9b630d882..31fb5f24797 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -88,6 +88,29 @@ public class TestRandomChains extends BaseTokenStreamTestCase { static List> tokenfilters; static List> charfilters; + // TODO: fix those and remove + private static final Set> brokenComponents = Collections.newSetFromMap(new IdentityHashMap,Boolean>()); + static { + Collections.>addAll(brokenComponents, + // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt + EmptyTokenizer.class, + // doesn't actual reset itself! + CachingTokenFilter.class, + // nocommit: corrumpts graphs (offset consistency check) + PositionFilter.class, + // doesn't consume whole stream! + LimitTokenCountFilter.class, + // broken! + NGramTokenizer.class, + // broken! + NGramTokenFilter.class, + // broken! + EdgeNGramTokenizer.class, + // broken! + EdgeNGramTokenFilter.class + ); + } + @BeforeClass public static void beforeClass() throws Exception { List> analysisClasses = new ArrayList>(); @@ -103,22 +126,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { || c.isAnnotationPresent(Deprecated.class) || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface() || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharStream.class.isAssignableFrom(c)) - // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt - || c == EmptyTokenizer.class - // doesn't actual reset itself! 
- || c == CachingTokenFilter.class - // nocommit: corrumpts graphs (offset consistency check) - || c == PositionFilter.class - // doesn't consume whole stream! - || c == LimitTokenCountFilter.class - // broken! - || c == NGramTokenizer.class - // broken! - || c == NGramTokenFilter.class - // broken! - || c == EdgeNGramTokenizer.class - // broken! - || c == EdgeNGramTokenFilter.class + || brokenComponents.contains(c) ) { continue; } @@ -657,7 +665,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { int numIterations = atLeast(20); for (int i = 0; i < numIterations; i++) { MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong()); - if (true || VERBOSE) { + if (VERBOSE) { System.out.println("Creating random analyzer:" + a); } try { From d76a03214c7c9e9c16395649effd2356d0f03dd6 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Mon, 9 Apr 2012 16:00:41 +0000 Subject: [PATCH 08/40] LUCENE-3969: add missing IAE to WikipediaTokenizer ctor git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311294 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/wikipedia/WikipediaTokenizer.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java index c495bdd11a3..c5ba3a0567a 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java @@ -177,6 +177,12 @@ public final class WikipediaTokenizer extends Tokenizer { } private void init(int tokenOutput, Set untokenizedTypes) { + // TODO: cutover to enum + if (tokenOutput != TOKENS_ONLY && + tokenOutput != UNTOKENIZED_ONLY && + tokenOutput != BOTH) { + throw new IllegalArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH"); + } this.tokenOutput = tokenOutput; this.untokenizedTypes = untokenizedTypes; } From 4456273922144d9b856cf885ff7fc2b797d37f02 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Mon, 9 Apr 2012 16:47:56 +0000 Subject: [PATCH 09/40] LUCENE-3969: fix PatternTokenizer to not consume chars from the input Reader if it throws IAE git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311318 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/analysis/pattern/PatternTokenizer.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java index 6aca0c5edd8..bc80391c95e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java @@ -69,12 +69,17 @@ public final class PatternTokenizer extends Tokenizer { super(input); this.pattern = pattern; this.group = group; - fillBuffer(str, input); - matcher = pattern.matcher(str); + + // Use "" instead of str so don't consume chars + // (fillBuffer) from the input on throwing IAE below: + matcher = pattern.matcher(""); + // confusingly group count depends ENTIRELY on the pattern but is only accessible via matcher if (group >= 0 && group > matcher.groupCount()) { throw new IllegalArgumentException("invalid group specified: pattern 
only has: " + matcher.groupCount() + " capturing groups"); } + fillBuffer(str, input); + matcher.reset(str); index = 0; } From bd8bdb08b3a3ae6b2c0bc84548b5a12891ebf4e8 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Mon, 9 Apr 2012 16:52:14 +0000 Subject: [PATCH 10/40] LUCENE-3969: Remove code duplication git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311320 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/core/TestRandomChains.java | 184 ++++++++---------- 1 file changed, 76 insertions(+), 108 deletions(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 31fb5f24797..d49e1c001e6 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -184,6 +184,35 @@ public class TestRandomChains extends BaseTokenStreamTestCase { return (Constructor) ctor; } + private static void getClassesForPackage(String pckgname, List> classes) throws Exception { + final ClassLoader cld = TestRandomChains.class.getClassLoader(); + final String path = pckgname.replace('.', '/'); + final Enumeration resources = cld.getResources(path); + while (resources.hasMoreElements()) { + final File directory = new File(resources.nextElement().toURI()); + if (directory.exists()) { + String[] files = directory.list(); + for (String file : files) { + if (new File(directory, file).isDirectory()) { + // recurse + String subPackage = pckgname + "." + file; + getClassesForPackage(subPackage, classes); + } + if (file.endsWith(".class")) { + String clazzName = file.substring(0, file.length() - 6); + // exclude Test classes that happen to be in these packages. + // class.ForName'ing some of them can cause trouble. + if (!clazzName.endsWith("Test") && !clazzName.startsWith("Test")) { + // Don't run static initializers, as we won't use most of them. + // Java will do that automatically once accessed/instantiated. + classes.add(Class.forName(pckgname + '.' 
+ clazzName, false, cld)); + } + } + } + } + } + } + private static interface ArgProducer { Object create(Random random); } @@ -497,8 +526,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase { Random random = new Random(seed); TokenizerSpec tokenizerspec = newTokenizer(random, reader); TokenFilterSpec filterspec = newFilterChain(random, tokenizerspec.tokenizer); - //System.out.println("seed=" + seed + ",tokenizerSpec=" + tokenizerspec.toString); - //System.out.println("seed=" + seed + ",tokenfilterSpec=" + filterspec.toString); return new TokenStreamComponents(tokenizerspec.tokenizer, filterspec.stream); } @@ -506,7 +533,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase { protected Reader initReader(Reader reader) { Random random = new Random(seed); CharFilterSpec charfilterspec = newCharFilterChain(random, reader); - //System.out.println("seed=" + seed + ",charFilterSpec=" + charfilterspec.toString); return charfilterspec.reader; } @@ -530,34 +556,46 @@ public class TestRandomChains extends BaseTokenStreamTestCase { return sb.toString(); } + private T createComponent(Constructor ctor, Object[] args, StringBuilder descr) { + try { + final T instance = ctor.newInstance(args); + if (descr.length() > 0) { + descr.append(","); + } + descr.append(ctor.getDeclaringClass().getName()); + String params = Arrays.toString(args); + params = params.substring(1, params.length()-1); + descr.append("(").append(params).append(")"); + return instance; + } catch (InvocationTargetException ite) { + final Throwable cause = ite.getCause(); + if (cause instanceof IllegalArgumentException || + cause instanceof UnsupportedOperationException) { + // thats ok, ignore + if (VERBOSE) { + System.err.println("Ignoring IAE/UOE from ctor:"); + cause.printStackTrace(System.err); + } + } else { + Rethrow.rethrow(cause); + } + } catch (IllegalAccessException iae) { + Rethrow.rethrow(iae); + } catch (InstantiationException ie) { + Rethrow.rethrow(ie); + } + return null; // no success + } + // create a new random tokenizer from classpath private TokenizerSpec newTokenizer(Random random, Reader reader) { TokenizerSpec spec = new TokenizerSpec(); - boolean success = false; - while (!success) { - try { - final Constructor ctor = tokenizers.get(random.nextInt(tokenizers.size())); - final Object args[] = newTokenizerArgs(random, reader, ctor.getParameterTypes()); - spec.tokenizer = ctor.newInstance(args); - spec.toString = ctor.getDeclaringClass().getName() + ("(" + Arrays.toString(args) + ")"); - success = true; - } catch (InvocationTargetException ite) { - final Throwable cause = ite.getCause(); - if (cause instanceof IllegalArgumentException || - cause instanceof UnsupportedOperationException) { - // thats ok, ignore - if (VERBOSE) { - System.err.println("Ignoring IAE/UOE from ctor:"); - cause.printStackTrace(System.err); - } - } else { - Rethrow.rethrow(cause); - } - } catch (IllegalAccessException iae) { - Rethrow.rethrow(iae); - } catch (InstantiationException ie) { - Rethrow.rethrow(ie); - } + while (spec.tokenizer == null) { + final Constructor ctor = tokenizers.get(random.nextInt(tokenizers.size())); + final StringBuilder descr = new StringBuilder(); + final Object args[] = newTokenizerArgs(random, reader, ctor.getParameterTypes()); + spec.tokenizer = createComponent(ctor, args, descr); + spec.toString = descr.toString(); } return spec; } @@ -570,33 +608,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase { for (int i = 0; i < numFilters; i++) { boolean success = false; while 
(!success) { - try { - final Constructor ctor = charfilters.get(random.nextInt(charfilters.size())); - final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes()); - spec.reader = ctor.newInstance(args); - - if (descr.length() > 0) { - descr.append(","); - } - descr.append(ctor.getDeclaringClass().getName()); - descr.append("(" + Arrays.toString(args) + ")"); + final Constructor ctor = charfilters.get(random.nextInt(charfilters.size())); + final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes()); + reader = createComponent(ctor, args, descr); + if (reader != null) { success = true; - } catch (InvocationTargetException ite) { - final Throwable cause = ite.getCause(); - if (cause instanceof IllegalArgumentException || - cause instanceof UnsupportedOperationException) { - // thats ok, ignore - if (VERBOSE) { - System.err.println("Ignoring IAE/UOE from ctor:"); - cause.printStackTrace(System.err); - } - } else { - Rethrow.rethrow(cause); - } - } catch (IllegalAccessException iae) { - Rethrow.rethrow(iae); - } catch (InstantiationException ie) { - Rethrow.rethrow(ie); + spec.reader = reader; } } } @@ -612,32 +629,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase { for (int i = 0; i < numFilters; i++) { boolean success = false; while (!success) { - try { - final Constructor ctor = tokenfilters.get(random.nextInt(tokenfilters.size())); - final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes()); - spec.stream = ctor.newInstance(args); - if (descr.length() > 0) { - descr.append(","); - } - descr.append(ctor.getDeclaringClass().getName()); - descr.append("(" + Arrays.toString(args) + ")"); + final Constructor ctor = tokenfilters.get(random.nextInt(tokenfilters.size())); + final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes()); + final TokenFilter flt = createComponent(ctor, args, descr); + if (flt != null) { success = true; - } catch (InvocationTargetException ite) { - final Throwable cause = ite.getCause(); - if (cause instanceof IllegalArgumentException || - cause instanceof UnsupportedOperationException) { - // thats ok, ignore - if (VERBOSE) { - System.err.println("Ignoring IAE/UOE from ctor:"); - cause.printStackTrace(System.err); - } - } else { - Rethrow.rethrow(cause); - } - } catch (IllegalAccessException iae) { - Rethrow.rethrow(iae); - } catch (InstantiationException ie) { - Rethrow.rethrow(ie); + spec.stream = flt; } } } @@ -676,33 +673,4 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } } } - - private static void getClassesForPackage(String pckgname, List> classes) throws Exception { - final ClassLoader cld = TestRandomChains.class.getClassLoader(); - final String path = pckgname.replace('.', '/'); - final Enumeration resources = cld.getResources(path); - while (resources.hasMoreElements()) { - final File directory = new File(resources.nextElement().toURI()); - if (directory.exists()) { - String[] files = directory.list(); - for (String file : files) { - if (new File(directory, file).isDirectory()) { - // recurse - String subPackage = pckgname + "." + file; - getClassesForPackage(subPackage, classes); - } - if (file.endsWith(".class")) { - String clazzName = file.substring(0, file.length() - 6); - // exclude Test classes that happen to be in these packages. - // class.ForName'ing some of them can cause trouble. 
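(The getClassesForPackage body being deleted here was re-added near the top of the file by this same patch. The scanning idea it implements reduces to the following self-contained sketch, which only handles exploded directories on the classpath, not jars; the class name PackageScanSketch is illustrative, not part of the patch.)

    import java.io.File;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.Enumeration;
    import java.util.List;

    public class PackageScanSketch {
      // Collect every class under pkg, loading with initialize=false so that
      // static initializers never run for classes we may never instantiate.
      public static List<Class<?>> classesIn(String pkg) throws Exception {
        final ClassLoader cld = PackageScanSketch.class.getClassLoader();
        final List<Class<?>> classes = new ArrayList<Class<?>>();
        final Enumeration<URL> resources = cld.getResources(pkg.replace('.', '/'));
        while (resources.hasMoreElements()) {
          final File directory = new File(resources.nextElement().toURI());
          final String[] files = directory.list();
          if (files == null) {
            continue; // not an exploded directory
          }
          for (String file : files) {
            if (new File(directory, file).isDirectory()) {
              classes.addAll(classesIn(pkg + '.' + file)); // recurse into subpackages
            } else if (file.endsWith(".class")) {
              final String name = file.substring(0, file.length() - ".class".length());
              classes.add(Class.forName(pkg + '.' + name, false, cld));
            }
          }
        }
        return classes;
      }
    }
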
- if (!clazzName.endsWith("Test") && !clazzName.startsWith("Test")) { - // Don't run static initializers, as we won't use most of them. - // Java will do that automatically once accessed/instantiated. - classes.add(Class.forName(pckgname + '.' + clazzName, false, cld)); - } - } - } - } - } - } } From eae8e8159dd5443d2c95370016cbbb889f235da0 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Mon, 9 Apr 2012 16:56:35 +0000 Subject: [PATCH 11/40] LUCENE-3969: Remove useless success variable git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311322 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/analysis/core/TestRandomChains.java | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index d49e1c001e6..e09178320ae 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -606,14 +606,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase { StringBuilder descr = new StringBuilder(); int numFilters = random.nextInt(3); for (int i = 0; i < numFilters; i++) { - boolean success = false; - while (!success) { + while (true) { final Constructor ctor = charfilters.get(random.nextInt(charfilters.size())); final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes()); reader = createComponent(ctor, args, descr); if (reader != null) { - success = true; spec.reader = reader; + break; } } } @@ -627,14 +626,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase { StringBuilder descr = new StringBuilder(); int numFilters = random.nextInt(5); for (int i = 0; i < numFilters; i++) { - boolean success = false; - while (!success) { + while (true) { final Constructor ctor = tokenfilters.get(random.nextInt(tokenfilters.size())); final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes()); final TokenFilter flt = createComponent(ctor, args, descr); if (flt != null) { - success = true; spec.stream = flt; + break; } } } From 79baa1f682aa481a1c49c05cc306631e8ecb5dd4 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Mon, 9 Apr 2012 17:08:19 +0000 Subject: [PATCH 12/40] LUCENE-3969: Remove unneeded wildcards git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311331 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/analysis/core/TestRandomChains.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index e09178320ae..975c56c758f 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -180,8 +180,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase { /** Hack to work around the stupidness of Oracle's strict Java backwards compatibility. * {@code Class#getConstructors()} should return unmodifiable {@code List>} not array! 
*/ @SuppressWarnings("unchecked") - private static Constructor castConstructor(Class instanceClazz, Constructor ctor) { - return (Constructor) ctor; + private static Constructor castConstructor(Class instanceClazz, Constructor ctor) { + return (Constructor) ctor; } private static void getClassesForPackage(String pckgname, List> classes) throws Exception { @@ -556,7 +556,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { return sb.toString(); } - private T createComponent(Constructor ctor, Object[] args, StringBuilder descr) { + private T createComponent(Constructor ctor, Object[] args, StringBuilder descr) { try { final T instance = ctor.newInstance(args); if (descr.length() > 0) { From 2a01acc0e8af338ae4b8b9d68dde67656a9bfe2f Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 9 Apr 2012 17:21:46 +0000 Subject: [PATCH 13/40] LUCENE-3969: don't use scary attsource ctor yet, and always print the analyzer for now git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311339 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/analysis/core/TestRandomChains.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 975c56c758f..fc93f3bc83b 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -475,7 +475,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase { // TODO: maybe the collator one...??? args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY; } else if (paramType == AttributeSource.class) { - args[i] = new AttributeSource(); + // nocommit: args[i] = new AttributeSource(); + // this is currently too scary to deal with! 
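+ // (Presumably "scary" because a stage built on a foreign AttributeSource
+ // does not share attributes with the rest of the randomly assembled chain,
+ // so downstream checks could silently run against default values; a null
+ // argument makes such ctors fail fast with the IAE that createComponent
+ // already treats as "retry with another component".)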
+ args[i] = null; // force IAE } else { args[i] = newRandomArg(random, paramType); } @@ -660,7 +662,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase { int numIterations = atLeast(20); for (int i = 0; i < numIterations; i++) { MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong()); - if (VERBOSE) { + // nocommit: wrap the uncaught handler with our own that prints the analyzer + if (true || VERBOSE) { System.out.println("Creating random analyzer:" + a); } try { From f41576a306bd0db5c3874565062840a0a163c374 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 9 Apr 2012 17:32:39 +0000 Subject: [PATCH 14/40] LUCENE-3969: don't get caught by tokenizers that consume in ctor and throw IAE or UOE ever again git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311351 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/core/TestRandomChains.java | 78 ++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index fc93f3bc83b..32919819441 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis.core; */ import java.io.File; +import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringReader; @@ -25,6 +26,7 @@ import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Modifier; import java.net.URL; +import java.nio.CharBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -595,8 +597,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase { while (spec.tokenizer == null) { final Constructor ctor = tokenizers.get(random.nextInt(tokenizers.size())); final StringBuilder descr = new StringBuilder(); - final Object args[] = newTokenizerArgs(random, reader, ctor.getParameterTypes()); + CheckThatYouDidntReadAnythingReaderWrapper wrapper = new CheckThatYouDidntReadAnythingReaderWrapper(reader); + final Object args[] = newTokenizerArgs(random, wrapper, ctor.getParameterTypes()); spec.tokenizer = createComponent(ctor, args, descr); + if (spec.tokenizer == null) { + assert wrapper.readSomething == false; + } spec.toString = descr.toString(); } return spec; @@ -643,6 +649,76 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } } + // wants charfilter to be a filterreader... 
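+ // (The wrapper below records whether any characters were ever pulled
+ // through it, so when a tokenizer ctor fails with IAE/UOE the retry loop
+ // can assert that the failed ctor did not already consume input.)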
+ static class CheckThatYouDidntReadAnythingReaderWrapper extends CharStream { + boolean readSomething; + CharStream in; + + CheckThatYouDidntReadAnythingReaderWrapper(Reader in) { + this.in = CharReader.get(in); + } + + @Override + public int correctOffset(int currentOff) { + return in.correctOffset(currentOff); + } + + @Override + public void close() throws IOException { + in.close(); + } + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + readSomething = true; + return in.read(cbuf, off, len); + } + + @Override + public int read() throws IOException { + readSomething = true; + return in.read(); + } + + @Override + public int read(CharBuffer target) throws IOException { + readSomething = true; + return in.read(target); + } + + @Override + public void mark(int readAheadLimit) throws IOException { + in.mark(readAheadLimit); + } + + @Override + public boolean markSupported() { + return in.markSupported(); + } + + @Override + public int read(char[] cbuf) throws IOException { + readSomething = true; + return in.read(cbuf); + } + + @Override + public boolean ready() throws IOException { + return in.ready(); + } + + @Override + public void reset() throws IOException { + in.reset(); + } + + @Override + public long skip(long n) throws IOException { + readSomething = true; + return in.skip(n); + } + } + static class TokenizerSpec { Tokenizer tokenizer; String toString; From f6f8e38cfa1aa0d82d935bb4f9022393ed7276c0 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Mon, 9 Apr 2012 17:53:27 +0000 Subject: [PATCH 15/40] LUCENE-3969: Simplify the crazy Reader wrapper git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311358 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/core/TestRandomChains.java | 51 ++++--------------- 1 file changed, 10 insertions(+), 41 deletions(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 32919819441..482c1bc864e 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.charfilter.CharFilter; import org.apache.lucene.analysis.charfilter.NormalizeCharMap; import org.apache.lucene.analysis.commongrams.CommonGramsFilter; import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; @@ -597,11 +598,11 @@ public class TestRandomChains extends BaseTokenStreamTestCase { while (spec.tokenizer == null) { final Constructor ctor = tokenizers.get(random.nextInt(tokenizers.size())); final StringBuilder descr = new StringBuilder(); - CheckThatYouDidntReadAnythingReaderWrapper wrapper = new CheckThatYouDidntReadAnythingReaderWrapper(reader); + final CheckThatYouDidntReadAnythingReaderWrapper wrapper = new CheckThatYouDidntReadAnythingReaderWrapper(reader); final Object args[] = newTokenizerArgs(random, wrapper, ctor.getParameterTypes()); spec.tokenizer = createComponent(ctor, args, descr); if (spec.tokenizer == null) { - assert wrapper.readSomething == false; + assertFalse(ctor.getDeclaringClass().getName() + " has read something in ctor but failed with UOE/IAE", wrapper.readSomething); } 
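The same read-detection trick, outside Lucene's CharStream hierarchy, needs nothing more than java.io. A minimal sketch (ReadDetectingReader is an illustrative name; skip() is overridden because skipping also consumes input):

    import java.io.FilterReader;
    import java.io.IOException;
    import java.io.Reader;

    final class ReadDetectingReader extends FilterReader {
      boolean readSomething = false;

      ReadDetectingReader(Reader in) {
        super(in);
      }

      @Override
      public int read() throws IOException {
        readSomething = true; // single-char reads count too
        return super.read();
      }

      @Override
      public int read(char[] cbuf, int off, int len) throws IOException {
        readSomething = true;
        return super.read(cbuf, off, len);
      }

      @Override
      public long skip(long n) throws IOException {
        readSomething = true; // skipping discards chars, i.e. consumes input
        return super.skip(n);
      }
    }
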
spec.toString = descr.toString(); } @@ -649,73 +650,41 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } } - // wants charfilter to be a filterreader... - static class CheckThatYouDidntReadAnythingReaderWrapper extends CharStream { + static final class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter { boolean readSomething; - CharStream in; CheckThatYouDidntReadAnythingReaderWrapper(Reader in) { - this.in = CharReader.get(in); - } - - @Override - public int correctOffset(int currentOff) { - return in.correctOffset(currentOff); - } - - @Override - public void close() throws IOException { - in.close(); + super(CharReader.get(in)); } @Override public int read(char[] cbuf, int off, int len) throws IOException { readSomething = true; - return in.read(cbuf, off, len); + return super.read(cbuf, off, len); } @Override public int read() throws IOException { readSomething = true; - return in.read(); + return super.read(); } @Override public int read(CharBuffer target) throws IOException { readSomething = true; - return in.read(target); - } - - @Override - public void mark(int readAheadLimit) throws IOException { - in.mark(readAheadLimit); - } - - @Override - public boolean markSupported() { - return in.markSupported(); + return super.read(target); } @Override public int read(char[] cbuf) throws IOException { readSomething = true; - return in.read(cbuf); - } - - @Override - public boolean ready() throws IOException { - return in.ready(); - } - - @Override - public void reset() throws IOException { - in.reset(); + return super.read(cbuf); } @Override public long skip(long n) throws IOException { readSomething = true; - return in.skip(n); + return super.skip(n); } } From ad5c89b1b15d662fedf32604d70d27077a0d884a Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Mon, 9 Apr 2012 19:05:47 +0000 Subject: [PATCH 16/40] LUCENE-3969: validate after each analysis stage; tentatively add posLen to ShingleFilter git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311373 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/BaseTokenStreamTestCase.java | 2 +- .../lucene/analysis/LookaheadTokenFilter.java | 4 +- .../analysis/ValidatingTokenFilter.java | 117 ++++++++++++++++++ .../analysis/shingle/ShingleFilter.java | 6 +- .../analysis/core/TestRandomChains.java | 25 +++- 5 files changed, 147 insertions(+), 7 deletions(-) create mode 100644 lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index d8fbd15d328..d0f4b2b81d9 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -222,7 +222,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1); } } - assertFalse("TokenStream has more tokens than expected", ts.incrementToken()); + assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken()); ts.end(); if (finalOffset != null) { assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset()); diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
b/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java index 298ab96fe8f..9515ae94004 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java @@ -151,7 +151,7 @@ public abstract class LookaheadTokenFilter posToStartOffset = new HashMap(); + private final Map posToEndOffset = new HashMap(); + + // nocommit must be more careful here? check hasAttribute first...? + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + private final String name; + + /** The name arg is used to identify this stage when + * throwing exceptions (useful if you have more than one + * instance in your chain). */ + public ValidatingTokenFilter(TokenStream in, String name) { + super(in); + this.name = name; + } + + @Override + public boolean incrementToken() throws IOException { + if (!input.incrementToken()) { + return false; + } + + pos += posIncAtt.getPositionIncrement(); + if (pos == -1) { + throw new IllegalStateException("first posInc must be > 0"); + } + + final int startOffset = offsetAtt.startOffset(); + final int endOffset = offsetAtt.endOffset(); + + final int posLen = posLenAtt.getPositionLength(); + if (!posToStartOffset.containsKey(pos)) { + // First time we've seen a token leaving from this position: + posToStartOffset.put(pos, startOffset); + System.out.println(" + s " + pos + " -> " + startOffset); + } else { + // We've seen a token leaving from this position + // before; verify the startOffset is the same: + System.out.println(" + vs " + pos + " -> " + startOffset); + final int oldStartOffset = posToStartOffset.get(pos); + if (oldStartOffset != startOffset) { + throw new IllegalStateException(name + ": inconsistent startOffset as pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt); + } + } + + final int endPos = pos + posLen; + + if (!posToEndOffset.containsKey(endPos)) { + // First time we've seen a token arriving to this position: + posToEndOffset.put(endPos, endOffset); + System.out.println(" + e " + endPos + " -> " + endOffset); + } else { + // We've seen a token arriving to this position + // before; verify the endOffset is the same: + System.out.println(" + ve " + endPos + " -> " + endOffset); + final int oldEndOffset = posToEndOffset.get(endPos); + if (oldEndOffset != endOffset) { + throw new IllegalStateException(name + ": inconsistent endOffset as pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt); + } + } + + return true; + } + + // TODO: end? (what to validate?) 
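The offset bookkeeping above can be read in isolation from the TokenStream plumbing. A self-contained sketch of the invariant it enforces (GraphOffsetCheck is an illustrative name): all tokens leaving a position must agree on startOffset, and all tokens arriving at position pos + posLen must agree on endOffset.

    import java.util.HashMap;
    import java.util.Map;

    final class GraphOffsetCheck {
      private final Map<Integer,Integer> startByPos = new HashMap<Integer,Integer>();
      private final Map<Integer,Integer> endByPos = new HashMap<Integer,Integer>();

      void accept(int pos, int posLen, int startOffset, int endOffset) {
        // put() returns the previous mapping, so one call both records the
        // first sighting and exposes any conflicting earlier sighting:
        Integer prevStart = startByPos.put(pos, startOffset);
        if (prevStart != null && prevStart.intValue() != startOffset) {
          throw new IllegalStateException("inconsistent startOffset at pos=" + pos);
        }
        Integer prevEnd = endByPos.put(pos + posLen, endOffset);
        if (prevEnd != null && prevEnd.intValue() != endOffset) {
          throw new IllegalStateException("inconsistent endOffset at pos=" + (pos + posLen));
        }
      }
    }
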
+ + @Override + public void reset() throws IOException { + super.reset(); + pos = -1; + posToStartOffset.clear(); + posToEndOffset.clear(); + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java index 464bde05bcc..8ff920a4600 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java @@ -23,9 +23,10 @@ import java.util.LinkedList; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeSource; @@ -150,6 +151,7 @@ public final class ShingleFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); @@ -319,6 +321,8 @@ public final class ShingleFilter extends TokenFilter { noShingleOutput = false; } offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset()); + // nocommit is this right!? i'm just guessing... 
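+ // (The guess matches the attribute's contract: a shingle glued together
+ // from builtGramSize adjacent single-position tokens starts where its
+ // first token starts and ends where its last token ends, so it spans
+ // builtGramSize positions.)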
+ posLenAtt.setPositionLength(builtGramSize); isOutputHere = true; gramSize.advance(); tokenAvailable = true; diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 482c1bc864e..477e0bc16cd 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -34,11 +34,11 @@ import java.util.Collections; import java.util.Comparator; import java.util.Enumeration; import java.util.HashSet; +import java.util.IdentityHashMap; import java.util.List; +import java.util.Map; import java.util.Random; import java.util.Set; -import java.util.Map; -import java.util.IdentityHashMap; import java.util.regex.Pattern; import org.apache.lucene.analysis.Analyzer; @@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ValidatingTokenFilter; import org.apache.lucene.analysis.charfilter.CharFilter; import org.apache.lucene.analysis.charfilter.NormalizeCharMap; import org.apache.lucene.analysis.commongrams.CommonGramsFilter; @@ -73,8 +74,8 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.util.CharArrayMap; import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource.AttributeFactory; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.Rethrow; import org.apache.lucene.util.Version; @@ -133,6 +134,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase { ) { continue; } + + if (c == ValidatingTokenFilter.class) { + // We insert this one ourselves after each stage... 
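+ // (newFilterChain below wraps every randomly chosen stage in a
+ // ValidatingTokenFilter already, so picking it at random here would only
+ // re-check what the forced wrappers check)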
+ continue; + } + for (final Constructor ctor : c.getConstructors()) { // don't test deprecated ctors, they likely have known bugs: if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) { @@ -635,6 +642,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase { StringBuilder descr = new StringBuilder(); int numFilters = random.nextInt(5); for (int i = 0; i < numFilters; i++) { + + // Insert ValidatingTF after each stage so we can + // catch problems right after the TF that "caused" + // them: + spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i); + while (true) { final Constructor ctor = tokenfilters.get(random.nextInt(tokenfilters.size())); final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes()); @@ -645,6 +658,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } } } + + // Insert ValidatingTF after each stage so we can + // catch problems right after the TF that "caused" + // them: + spec.stream = new ValidatingTokenFilter(spec.stream, "last stage"); + spec.toString = descr.toString(); return spec; } From 11a65763d0b708183e3cfcf17453ddacf55e724c Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Mon, 9 Apr 2012 19:45:16 +0000 Subject: [PATCH 17/40] LUCENE-3969: remove nocommit git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311400 13f79535-47bb-0310-9956-ffa450edef68 --- .../java/org/apache/lucene/analysis/shingle/ShingleFilter.java | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java index 8ff920a4600..50e7ab59840 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java @@ -321,7 +321,6 @@ public final class ShingleFilter extends TokenFilter { noShingleOutput = false; } offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset()); - // nocommit is this right!? i'm just guessing... posLenAtt.setPositionLength(builtGramSize); isOutputHere = true; gramSize.advance(); From 3e098abaedf532b12f429e885828cee6f3799615 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Mon, 9 Apr 2012 20:00:50 +0000 Subject: [PATCH 18/40] LUCENE-3969: ValidatingTokenFilter shouldn't create new atts git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311405 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/ValidatingTokenFilter.java | 97 +++++++++++-------- .../analysis/core/TestRandomChains.java | 10 +- 2 files changed, 63 insertions(+), 44 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java index 264999cdc9b..fe98feb3116 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.Attribute; // nocommit better name...? 
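The fix in the next hunk hinges on the difference between the two lookup methods: addAttribute() silently creates a default instance when none exists, which would let the validator check attributes that no stage in the chain actually populates, while getAttribute() refuses with an IllegalArgumentException, hence the hasAttribute guard. A hypothetical demonstration (the class name AttrLookupDemo is mine):

    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.util.AttributeSource;

    public class AttrLookupDemo {
      public static void main(String[] args) {
        AttributeSource src = new AttributeSource();
        System.out.println(src.hasAttribute(OffsetAttribute.class)); // false
        // getAttribute(OffsetAttribute.class) would throw IllegalArgumentException here
        src.addAttribute(OffsetAttribute.class); // silently creates a default instance
        System.out.println(src.getAttribute(OffsetAttribute.class).startOffset()); // 0
      }
    }
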
@@ -41,14 +42,22 @@ public final class ValidatingTokenFilter extends TokenFilter { private final Map posToStartOffset = new HashMap(); private final Map posToEndOffset = new HashMap(); - // nocommit must be more careful here? check hasAttribute first...? - private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAtt = getAttrIfExists(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLenAtt = getAttrIfExists(PositionLengthAttribute.class); + private final OffsetAttribute offsetAtt = getAttrIfExists(OffsetAttribute.class); + private final CharTermAttribute termAtt = getAttrIfExists(CharTermAttribute.class); private final String name; + // Returns null if the attr wasn't already added + private A getAttrIfExists(Class att) { + if (hasAttribute(att)) { + return getAttribute(att); + } else { + return null; + } + } + /** The name arg is used to identify this stage when * throwing exceptions (useful if you have more than one * instance in your chain). */ @@ -63,49 +72,61 @@ public final class ValidatingTokenFilter extends TokenFilter { return false; } - pos += posIncAtt.getPositionIncrement(); - if (pos == -1) { - throw new IllegalStateException("first posInc must be > 0"); - } + if (posIncAtt != null && offsetAtt != null) { - final int startOffset = offsetAtt.startOffset(); - final int endOffset = offsetAtt.endOffset(); - - final int posLen = posLenAtt.getPositionLength(); - if (!posToStartOffset.containsKey(pos)) { - // First time we've seen a token leaving from this position: - posToStartOffset.put(pos, startOffset); - System.out.println(" + s " + pos + " -> " + startOffset); - } else { - // We've seen a token leaving from this position - // before; verify the startOffset is the same: - System.out.println(" + vs " + pos + " -> " + startOffset); - final int oldStartOffset = posToStartOffset.get(pos); - if (oldStartOffset != startOffset) { - throw new IllegalStateException(name + ": inconsistent startOffset as pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt); + pos += posIncAtt.getPositionIncrement(); + if (pos == -1) { + throw new IllegalStateException("first posInc must be > 0"); } - } - final int endPos = pos + posLen; + final int startOffset = offsetAtt.startOffset(); + final int endOffset = offsetAtt.endOffset(); - if (!posToEndOffset.containsKey(endPos)) { - // First time we've seen a token arriving to this position: - posToEndOffset.put(endPos, endOffset); - System.out.println(" + e " + endPos + " -> " + endOffset); - } else { - // We've seen a token arriving to this position - // before; verify the endOffset is the same: - System.out.println(" + ve " + endPos + " -> " + endOffset); - final int oldEndOffset = posToEndOffset.get(endPos); - if (oldEndOffset != endOffset) { - throw new IllegalStateException(name + ": inconsistent endOffset as pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt); + final int posLen = posLenAtt == null ? 
1 : posLenAtt.getPositionLength(); + + if (!posToStartOffset.containsKey(pos)) { + // First time we've seen a token leaving from this position: + posToStartOffset.put(pos, startOffset); + System.out.println(" + s " + pos + " -> " + startOffset); + } else { + // We've seen a token leaving from this position + // before; verify the startOffset is the same: + System.out.println(" + vs " + pos + " -> " + startOffset); + final int oldStartOffset = posToStartOffset.get(pos); + if (oldStartOffset != startOffset) { + throw new IllegalStateException(name + ": inconsistent startOffset as pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt); + } + } + + final int endPos = pos + posLen; + + if (!posToEndOffset.containsKey(endPos)) { + // First time we've seen a token arriving to this position: + posToEndOffset.put(endPos, endOffset); + System.out.println(" + e " + endPos + " -> " + endOffset); + } else { + // We've seen a token arriving to this position + // before; verify the endOffset is the same: + System.out.println(" + ve " + endPos + " -> " + endOffset); + final int oldEndOffset = posToEndOffset.get(endPos); + if (oldEndOffset != endOffset) { + throw new IllegalStateException(name + ": inconsistent endOffset as pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt); + } } } return true; } - // TODO: end? (what to validate?) + @Override + public void end() throws IOException { + super.end(); + + // TODO: what else to validate + + // nocommit check that endOffset is >= max(endOffset) + // we've seen + } @Override public void reset() throws IOException { diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 477e0bc16cd..4f348f57626 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -111,7 +111,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase { // broken! EdgeNGramTokenizer.class, // broken! - EdgeNGramTokenFilter.class + EdgeNGramTokenFilter.class, + // Not broken: we forcefully add this, so we shouldn't + // also randomly pick it: + ValidatingTokenFilter.class ); } @@ -135,11 +138,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase { continue; } - if (c == ValidatingTokenFilter.class) { - // We insert this one ourselves after each stage... 
- continue; - } - for (final Constructor ctor : c.getConstructors()) { // don't test deprecated ctors, they likely have known bugs: if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) { From 9e98ec186cb042546bb98172327585f60b17ce2f Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Mon, 9 Apr 2012 20:04:55 +0000 Subject: [PATCH 19/40] LUCENE-3969: check that startOffset <= endOffset; comment out sops git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311406 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/ValidatingTokenFilter.java | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java index fe98feb3116..984f8b5b696 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java @@ -82,19 +82,23 @@ public final class ValidatingTokenFilter extends TokenFilter { final int startOffset = offsetAtt.startOffset(); final int endOffset = offsetAtt.endOffset(); + if (endOffset < startOffset) { + throw new IllegalStateException(name + ": startOffset=" + startOffset + " is > endOffset=" + endOffset + " pos=" + pos + "; token=" + termAtt); + } + final int posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength(); if (!posToStartOffset.containsKey(pos)) { // First time we've seen a token leaving from this position: posToStartOffset.put(pos, startOffset); - System.out.println(" + s " + pos + " -> " + startOffset); + //System.out.println(" + s " + pos + " -> " + startOffset); } else { // We've seen a token leaving from this position // before; verify the startOffset is the same: - System.out.println(" + vs " + pos + " -> " + startOffset); + //System.out.println(" + vs " + pos + " -> " + startOffset); final int oldStartOffset = posToStartOffset.get(pos); if (oldStartOffset != startOffset) { - throw new IllegalStateException(name + ": inconsistent startOffset as pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt); + throw new IllegalStateException(name + ": inconsistent startOffset at pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt); } } @@ -103,14 +107,14 @@ public final class ValidatingTokenFilter extends TokenFilter { if (!posToEndOffset.containsKey(endPos)) { // First time we've seen a token arriving to this position: posToEndOffset.put(endPos, endOffset); - System.out.println(" + e " + endPos + " -> " + endOffset); + //System.out.println(" + e " + endPos + " -> " + endOffset); } else { // We've seen a token arriving to this position // before; verify the endOffset is the same: - System.out.println(" + ve " + endPos + " -> " + endOffset); + //System.out.println(" + ve " + endPos + " -> " + endOffset); final int oldEndOffset = posToEndOffset.get(endPos); if (oldEndOffset != endOffset) { - throw new IllegalStateException(name + ": inconsistent endOffset as pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt); + throw new IllegalStateException(name + ": inconsistent endOffset at pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt); } } } From a764c0d021cbc35ca035808292ce8d86078783c3 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 10 Apr 2012 10:28:24 +0000 Subject: [PATCH 20/40] 
LUCENE-3969: add whitespace to analyzer description git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311667 13f79535-47bb-0310-9956-ffa450edef68 --- .../test/org/apache/lucene/analysis/core/TestRandomChains.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 4f348f57626..4bdd65b9abb 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -569,9 +569,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase { private T createComponent(Constructor ctor, Object[] args, StringBuilder descr) { try { final T instance = ctor.newInstance(args); + /* if (descr.length() > 0) { descr.append(","); } + */ + descr.append("\n "); descr.append(ctor.getDeclaringClass().getName()); String params = Arrays.toString(args); params = params.substring(1, params.length()-1); From 3706fbc5b0483a9e455d7c0c5b23df1dcbe4f138 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Tue, 10 Apr 2012 13:50:03 +0000 Subject: [PATCH 21/40] Fix ShingleFilter reuse, some minor changes to testcase for speed and consistency git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311724 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/shingle/ShingleFilter.java | 2 ++ .../lucene/analysis/core/TestRandomChains.java | 15 +++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java index 50e7ab59840..d0b8e055352 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java @@ -439,6 +439,8 @@ public final class ShingleFilter extends TokenFilter { super.reset(); gramSize.reset(); inputWindow.clear(); + nextInputStreamToken = null; + isNextInputStreamToken = false; numFillerTokensToInsert = 0; isOutputHere = false; noShingleOutput = true; diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 4bdd65b9abb..777f7e7b10a 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -130,17 +130,17 @@ public class TestRandomChains extends BaseTokenStreamTestCase { if ( // don't waste time with abstract classes or deprecated known-buggy ones Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers) - || c.isAnnotationPresent(Deprecated.class) || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface() - || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharStream.class.isAssignableFrom(c)) || brokenComponents.contains(c) + || c.isAnnotationPresent(Deprecated.class) + || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharStream.class.isAssignableFrom(c)) ) { continue; } for (final Constructor ctor : c.getConstructors()) { - // don't test deprecated ctors, they likely have 
known bugs: - if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) { + // don't test synthetic or deprecated ctors, they likely have known bugs: + if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class)) { continue; } if (Tokenizer.class.isAssignableFrom(c)) { @@ -258,9 +258,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase { }); put(byte.class, new ArgProducer() { @Override public Object create(Random random) { - byte bytes[] = new byte[1]; - random.nextBytes(bytes); - return Byte.valueOf(bytes[0]); + // this wraps to negative when casting to byte + return Byte.valueOf((byte) random.nextInt(256)); } }); put(byte[].class, new ArgProducer() { @@ -671,7 +670,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } static final class CheckThatYouDidntReadAnythingReaderWrapper extends CharFilter { - boolean readSomething; + boolean readSomething = false; CheckThatYouDidntReadAnythingReaderWrapper(Reader in) { super(CharReader.get(in)); From d4b5405533cc89c12d9c72eaa924e2acf1a1ec07 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 10 Apr 2012 14:09:15 +0000 Subject: [PATCH 22/40] LUCENE-3969: check offsets even if posIncrAtt doesnt yet exist (and vice versa), and check that offsets are non-negative git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311734 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/ValidatingTokenFilter.java | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java index 984f8b5b696..9f81f7266cc 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java @@ -72,21 +72,35 @@ public final class ValidatingTokenFilter extends TokenFilter { return false; } - if (posIncAtt != null && offsetAtt != null) { - + int startOffset = 0; + int endOffset = 0; + int posLen = 0; + + if (posIncAtt != null) { pos += posIncAtt.getPositionIncrement(); if (pos == -1) { throw new IllegalStateException("first posInc must be > 0"); } + } + + if (offsetAtt != null) { + startOffset = offsetAtt.startOffset(); + endOffset = offsetAtt.endOffset(); - final int startOffset = offsetAtt.startOffset(); - final int endOffset = offsetAtt.endOffset(); - + if (startOffset < 0) { + throw new IllegalStateException(name + ": startOffset=" + startOffset + " is < 0"); + } + if (endOffset < 0) { + throw new IllegalStateException(name + ": endOffset=" + endOffset + " is < 0"); + } if (endOffset < startOffset) { throw new IllegalStateException(name + ": startOffset=" + startOffset + " is > endOffset=" + endOffset + " pos=" + pos + "; token=" + termAtt); } - - final int posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength(); + } + + posLen = posLenAtt == null ? 
1 : posLenAtt.getPositionLength(); + + if (offsetAtt != null && posIncAtt != null) { if (!posToStartOffset.containsKey(pos)) { // First time we've seen a token leaving from this position: From 8966429dab94fc5c45f9e67737d0f20d2eca42ed Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 10 Apr 2012 14:19:09 +0000 Subject: [PATCH 23/40] LUCENE-3969: disable these for now so we can work on the other issues git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311748 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/analysis/core/TestRandomChains.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 777f7e7b10a..2270c571a14 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -56,6 +56,7 @@ import org.apache.lucene.analysis.ValidatingTokenFilter; import org.apache.lucene.analysis.charfilter.CharFilter; import org.apache.lucene.analysis.charfilter.NormalizeCharMap; import org.apache.lucene.analysis.commongrams.CommonGramsFilter; +import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter; import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter; import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; @@ -66,6 +67,8 @@ import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; import org.apache.lucene.analysis.ngram.NGramTokenFilter; import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.apache.lucene.analysis.path.PathHierarchyTokenizer; +import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; import org.apache.lucene.analysis.payloads.IdentityEncoder; import org.apache.lucene.analysis.payloads.PayloadEncoder; import org.apache.lucene.analysis.position.PositionFilter; @@ -112,6 +115,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase { EdgeNGramTokenizer.class, // broken! 
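+ // (both rules share the prefix "t", so the longer one forces the filter
+ // to buffer and look ahead before committing to a match; presumably that
+ // lookahead path is where the final-offset accounting goes wrong)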
EdgeNGramTokenFilter.class, + // fix these 4 to use 'real positions' and not stack the way they do: + // if you want that use positionfilter + PathHierarchyTokenizer.class, + ReversePathHierarchyTokenizer.class, + HyphenationCompoundWordTokenFilter.class, + DictionaryCompoundWordTokenFilter.class, // Not broken: we forcefully add this, so we shouldn't // also randomly pick it: ValidatingTokenFilter.class From f97ac2d0cb9b1a374c2af1b1f9f8b1eeeb720401 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 10 Apr 2012 14:38:39 +0000 Subject: [PATCH 24/40] LUCENE-3969: add failing test case for MappingCharFilter wrong final offset git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311761 13f79535-47bb-0310-9956-ffa450edef68 --- .../charfilter/TestMappingCharFilter.java | 24 +++++++++++++++++++ .../analysis/core/TestRandomChains.java | 5 +++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java index 9740bafb847..2e86a977f52 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java @@ -190,4 +190,28 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase { int numRounds = RANDOM_MULTIPLIER * 10000; checkRandomData(random, analyzer, numRounds); } + + // nocommit: wrong final offset, fix this! + public void testFinalOffsetSpecialCase() throws Exception { + final NormalizeCharMap map = new NormalizeCharMap(); + map.add("t", ""); + // even though this below rule has no effect, the test passes if you remove it!! 
+ map.add("tmakdbl", "c"); + + Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, tokenizer); + } + + @Override + protected Reader initReader(Reader reader) { + return new MappingCharFilter(map, CharReader.get(reader)); + } + }; + + String text = "gzw f quaxot"; + checkAnalysisConsistency(random, analyzer, false, text); + } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 2270c571a14..aef40acc9a4 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -389,11 +389,14 @@ public class TestRandomChains extends BaseTokenStreamTestCase { // we can't add duplicate keys, or NormalizeCharMap gets angry Set keys = new HashSet(); int num = random.nextInt(5); + //System.out.println("NormalizeCharMap="); for (int i = 0; i < num; i++) { String key = _TestUtil.randomSimpleString(random); if (!keys.contains(key)) { - map.add(key,_TestUtil.randomSimpleString(random)); + String value = _TestUtil.randomSimpleString(random); + map.add(key, value); keys.add(key); + //System.out.println("mapping: '" + key + "' => '" + value + "'"); } } return map; From 6563a58a2a6822d41b159b5654eed9853659e222 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 10 Apr 2012 14:49:36 +0000 Subject: [PATCH 25/40] LUCENE-3969: add new random test for MappingCharFilter (sometimes fails, due to same final offset bug) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311765 13f79535-47bb-0310-9956-ffa450edef68 --- .../charfilter/TestMappingCharFilter.java | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java index 2e86a977f52..56efa87b1f5 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java @@ -19,6 +19,8 @@ package org.apache.lucene.analysis.charfilter; import java.io.Reader; import java.io.StringReader; +import java.util.HashSet; +import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; @@ -27,6 +29,7 @@ import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.util._TestUtil; public class TestMappingCharFilter extends BaseTokenStreamTestCase { @@ -214,4 +217,43 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase { String text = "gzw f quaxot"; checkAnalysisConsistency(random, analyzer, false, text); } + + // nocommit: this is intended to fail until we fix bugs + public void testRandomMaps() throws Exception { + for (int i = 0; i < 100; i++) { + final NormalizeCharMap map = randomMap(); + Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, 
Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, tokenizer); + } + + @Override + protected Reader initReader(Reader reader) { + return new MappingCharFilter(map, CharReader.get(reader)); + } + }; + int numRounds = RANDOM_MULTIPLIER * 100; + checkRandomData(random, analyzer, numRounds); + } + } + + private NormalizeCharMap randomMap() { + NormalizeCharMap map = new NormalizeCharMap(); + // we can't add duplicate keys, or NormalizeCharMap gets angry + Set keys = new HashSet(); + int num = random.nextInt(5); + //System.out.println("NormalizeCharMap="); + for (int i = 0; i < num; i++) { + String key = _TestUtil.randomSimpleString(random); + if (!keys.contains(key)) { + String value = _TestUtil.randomSimpleString(random); + map.add(key, value); + keys.add(key); + //System.out.println("mapping: '" + key + "' => '" + value + "'"); + } + } + return map; + } } From b67e7a0a9ba8d6e1f0f15abf5e103a9a71a9d907 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 10 Apr 2012 16:54:54 +0000 Subject: [PATCH 26/40] LUCENE-3969: make full offset checking optional and disable for the known (buggy) offenders git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311864 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/BaseTokenStreamTestCase.java | 70 +++++++--- .../analysis/ValidatingTokenFilter.java | 20 ++- .../charfilter/TestMappingCharFilter.java | 3 + .../analysis/core/TestRandomChains.java | 125 +++++++++++------- .../miscellaneous/TestTrimFilter.java | 6 +- .../TestWordDelimiterFilter.java | 56 ++++++-- .../ngram/EdgeNGramTokenFilterTest.java | 12 +- .../ngram/EdgeNGramTokenizerTest.java | 10 +- .../analysis/ngram/NGramTokenFilterTest.java | 5 +- .../analysis/ngram/NGramTokenizerTest.java | 10 +- 10 files changed, 223 insertions(+), 94 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index d0f4b2b81d9..a9989ac6845 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -100,7 +100,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { } } - public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException { + // offsetsAreCorrect also validates: + // - graph offsets are correct (all tokens leaving from + // pos X have the same startOffset; all tokens + // arriving to pos Y have the same endOffset) + // - offsets only move forwards (startOffset >= + // lastStartOffset) + public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, + boolean offsetsAreCorrect) throws IOException { assertNotNull(output); CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class); @@ -137,6 +144,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { ts.reset(); int pos = -1; + int lastStartOffset = 0; for (int i = 0; i < output.length; i++) { // extra safety to enforce, that the state is not preserved and also assign bogus values ts.clearAttributes(); @@ 
-176,7 +184,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { endOffset <= finalOffset.intValue()); } - if (posLengthAtt != null && posIncrAtt != null) { + if (offsetsAreCorrect) { + assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset); + lastStartOffset = offsetAtt.startOffset(); + } + + if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) { // Validate offset consistency in the graph, ie // all tokens leaving from a certain pos have the // same startOffset, and all tokens arriving to a @@ -233,6 +246,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { ts.close(); } + public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException { + assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true); + } + public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException { assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset); } @@ -280,6 +297,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException { assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length()); } + + public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException { + assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect); + } public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException { assertAnalyzesTo(a, input, output, null, null, null, null, null); @@ -342,12 +363,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */ public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException { - checkRandomData(random, a, iterations, 20, false); + checkRandomData(random, a, iterations, 20, false, true); } - + /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */ public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException { - checkRandomData(random, a, iterations, maxWordLength, false); + checkRandomData(random, a, iterations, maxWordLength, false, true); } /** @@ -355,7 +376,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { * @param simple true if only ascii strings will be used (try to avoid) */ public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException { - checkRandomData(random, a, iterations, 20, simple); + checkRandomData(random, a, 
iterations, 20, simple, true); } static class AnalysisThread extends Thread { @@ -364,13 +385,15 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { final Random random; final Analyzer a; final boolean simple; + final boolean offsetsAreCorrect; - AnalysisThread(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) { + AnalysisThread(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) { this.random = random; this.a = a; this.iterations = iterations; this.maxWordLength = maxWordLength; this.simple = simple; + this.offsetsAreCorrect = offsetsAreCorrect; } @Override @@ -378,7 +401,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { try { // see the part in checkRandomData where it replays the same text again // to verify reproducability/reuse: hopefully this would catch thread hazards. - checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple); + checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple, offsetsAreCorrect); } catch (IOException e) { Rethrow.rethrow(e); } @@ -386,12 +409,16 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { }; public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException { - checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple); + checkRandomData(random, a, iterations, maxWordLength, simple, true); + } + + public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException { + checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple, offsetsAreCorrect); // now test with multiple threads int numThreads = _TestUtil.nextInt(random, 4, 8); Thread threads[] = new Thread[numThreads]; for (int i = 0; i < threads.length; i++) { - threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, maxWordLength, simple); + threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, maxWordLength, simple, offsetsAreCorrect); } for (int i = 0; i < threads.length; i++) { threads[i].start(); @@ -405,7 +432,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { } } - private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException { + private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) throws IOException { final LineFileDocs docs = new LineFileDocs(random); @@ -437,7 +464,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { } try { - checkAnalysisConsistency(random, a, useCharFilter, text); + checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect); } catch (Throwable t) { // TODO: really we should pass a random seed to // checkAnalysisConsistency then print it here too: @@ -477,6 +504,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { } public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException { + checkAnalysisConsistency(random, a, useCharFilter, text, true); + } + + public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean 
offsetsAreCorrect) throws IOException { if (VERBOSE) { System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text); @@ -616,7 +647,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { types.toArray(new String[types.size()]), toIntArray(positions), toIntArray(positionLengths), - text.length()); + text.length(), + offsetsAreCorrect); } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) { // offset + pos + type assertTokenStreamContents(ts, @@ -626,7 +658,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { types.toArray(new String[types.size()]), toIntArray(positions), null, - text.length()); + text.length(), + offsetsAreCorrect); } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) { // offset + pos + posLength assertTokenStreamContents(ts, @@ -636,7 +669,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { null, toIntArray(positions), toIntArray(positionLengths), - text.length()); + text.length(), + offsetsAreCorrect); } else if (posIncAtt != null && offsetAtt != null) { // offset + pos assertTokenStreamContents(ts, @@ -646,7 +680,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { null, toIntArray(positions), null, - text.length()); + text.length(), + offsetsAreCorrect); } else if (offsetAtt != null) { // offset assertTokenStreamContents(ts, @@ -656,7 +691,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { null, null, null, - text.length()); + text.length(), + offsetsAreCorrect); } else { // terms only assertTokenStreamContents(ts, diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java index 9f81f7266cc..976f0ff950e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java @@ -27,7 +27,11 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.util.Attribute; -// nocommit better name...? +// nocommit rename to OffsetsXXXTF? ie we only validate +// offsets (now anyway...) + +// TODO: also make a DebuggingTokenFilter, that just prints +// all att values that come through it... // nocommit BTSTC should just append this to the chain // instead of checking itself: @@ -37,6 +41,7 @@ import org.apache.lucene.util.Attribute; public final class ValidatingTokenFilter extends TokenFilter { private int pos; + private int lastStartOffset; // Maps position to the start/end offset: private final Map posToStartOffset = new HashMap(); @@ -46,6 +51,7 @@ public final class ValidatingTokenFilter extends TokenFilter { private final PositionLengthAttribute posLenAtt = getAttrIfExists(PositionLengthAttribute.class); private final OffsetAttribute offsetAtt = getAttrIfExists(OffsetAttribute.class); private final CharTermAttribute termAtt = getAttrIfExists(CharTermAttribute.class); + private final boolean offsetsAreCorrect; private final String name; @@ -61,9 +67,10 @@ public final class ValidatingTokenFilter extends TokenFilter { /** The name arg is used to identify this stage when * throwing exceptions (useful if you have more than one * instance in your chain). 
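 *
 * <p>(Editor's sketch, not part of the patch: only the constructor below is real,
 * the chain around it is hypothetical. A test whose chain contains a known
 * offsets offender might wrap a stage as
 * {@code new ValidatingTokenFilter(new WordDelimiterFilter(tokenizer, flags, protWords), "stage 1", false)},
 * so the position-increment checks still run while the offset checks, which
 * such a filter cannot satisfy, are skipped.)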
*/ - public ValidatingTokenFilter(TokenStream in, String name) { + public ValidatingTokenFilter(TokenStream in, String name, boolean offsetsAreCorrect) { super(in); this.name = name; + this.offsetsAreCorrect = offsetsAreCorrect; } @Override @@ -82,6 +89,8 @@ public final class ValidatingTokenFilter extends TokenFilter { throw new IllegalStateException("first posInc must be > 0"); } } + + // System.out.println(" got token=" + termAtt + " pos=" + pos); if (offsetAtt != null) { startOffset = offsetAtt.startOffset(); @@ -96,11 +105,15 @@ public final class ValidatingTokenFilter extends TokenFilter { if (endOffset < startOffset) { throw new IllegalStateException(name + ": startOffset=" + startOffset + " is > endOffset=" + endOffset + " pos=" + pos + "; token=" + termAtt); } + if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) { + throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset); + } + lastStartOffset = offsetAtt.startOffset(); } posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength(); - if (offsetAtt != null && posIncAtt != null) { + if (offsetAtt != null && posIncAtt != null && offsetsAreCorrect) { if (!posToStartOffset.containsKey(pos)) { // First time we've seen a token leaving from this position: @@ -152,5 +165,6 @@ public final class ValidatingTokenFilter extends TokenFilter { pos = -1; posToStartOffset.clear(); posToEndOffset.clear(); + lastStartOffset = 0; } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java index 56efa87b1f5..71986253cee 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.util._TestUtil; +import org.junit.Ignore; public class TestMappingCharFilter extends BaseTokenStreamTestCase { @@ -195,6 +196,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase { } // nocommit: wrong final offset, fix this! 
+ @Ignore public void testFinalOffsetSpecialCase() throws Exception { final NormalizeCharMap map = new NormalizeCharMap(); map.add("t", ""); @@ -219,6 +221,7 @@ } // nocommit: this is intended to fail until we fix bugs + @Ignore public void testRandomMaps() throws Exception { for (int i = 0; i < 100; i++) { final NormalizeCharMap map = randomMap(); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index aef40acc9a4..7034834665a 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; import org.apache.lucene.analysis.ValidatingTokenFilter; import org.apache.lucene.analysis.charfilter.CharFilter; import org.apache.lucene.analysis.charfilter.NormalizeCharMap; @@ -63,6 +64,8 @@ import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; import org.apache.lucene.analysis.hunspell.HunspellDictionary; import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest; import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; +import org.apache.lucene.analysis.miscellaneous.TrimFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; import org.apache.lucene.analysis.ngram.NGramTokenFilter; @@ -91,42 +94,54 @@ import org.xml.sax.InputSource; /** tests random analysis chains */ public class TestRandomChains extends BaseTokenStreamTestCase { + static List> tokenizers; static List> tokenfilters; static List> charfilters; - + // TODO: fix those and remove private static final Set> brokenComponents = Collections.newSetFromMap(new IdentityHashMap,Boolean>()); static { + // nocommit can we promote some of these to be only + // offsets offenders? Collections.>addAll(brokenComponents, - // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt - EmptyTokenizer.class, - // doesn't actual reset itself! - CachingTokenFilter.class, - // nocommit: corrumpts graphs (offset consistency check) - PositionFilter.class, - // doesn't consume whole stream! - LimitTokenCountFilter.class, - // broken! - NGramTokenizer.class, - // broken! - NGramTokenFilter.class, - // broken! - EdgeNGramTokenizer.class, - // broken! - EdgeNGramTokenFilter.class, - // fix these 4 to use 'real positions' and not stack the way they do: - // if you want that use positionfilter - PathHierarchyTokenizer.class, - ReversePathHierarchyTokenizer.class, - HyphenationCompoundWordTokenFilter.class, - DictionaryCompoundWordTokenFilter.class, - // Not broken: we forcefully add this, so we shouldn't - // also randomly pick it: - ValidatingTokenFilter.class + // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt + EmptyTokenizer.class, + // doesn't actually reset itself! + CachingTokenFilter.class, + // doesn't consume whole stream! + LimitTokenCountFilter.class, + // Not broken: we forcefully add this, so we shouldn't + // also randomly pick it: + ValidatingTokenFilter.class, + // nocommit: randomly generate the Side enum param here; then promote to brokenOffsets? + EdgeNGramTokenizer.class, + // nocommit: randomly generate the Side enum param here; then promote to brokenOffsets? + EdgeNGramTokenFilter.class ); } - + + // TODO: also fix these and remove (maybe): + // Classes that don't produce consistent graph offsets: + private static final Set> brokenOffsetsComponents = Collections.newSetFromMap(new IdentityHashMap,Boolean>()); + static { + Collections.>addAll(brokenOffsetsComponents, + WordDelimiterFilter.class, + TrimFilter.class, + ReversePathHierarchyTokenizer.class, + PathHierarchyTokenizer.class, + HyphenationCompoundWordTokenFilter.class, + DictionaryCompoundWordTokenFilter.class, + // nocommit: corrupts graphs (offset consistency check): + PositionFilter.class, + // broken! + NGramTokenizer.class, + // broken! + NGramTokenFilter.class, + // nocommit it seems to mess up offsets!? + WikipediaTokenizer.class + ); + } @BeforeClass public static void beforeClass() throws Exception { List> analysisClasses = new ArrayList>(); @@ -146,7 +161,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase { ) { continue; } - for (final Constructor ctor : c.getConstructors()) { // don't test synthetic or deprecated ctors, they likely have known bugs: if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class)) { continue; } if (Tokenizer.class.isAssignableFrom(c)) { assertTrue(ctor.toGenericString() + " has unsupported parameter types", - allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); + allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); tokenizers.add(castConstructor(Tokenizer.class, ctor)); } else if (TokenFilter.class.isAssignableFrom(c)) { assertTrue(ctor.toGenericString() + " has unsupported parameter types", - allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); + allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); tokenfilters.add(castConstructor(TokenFilter.class, ctor)); } else if (CharStream.class.isAssignableFrom(c)) { assertTrue(ctor.toGenericString() + " has unsupported parameter types", - allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); + allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); charfilters.add(castConstructor(CharStream.class, ctor)); } else { fail("Cannot get here"); } } } - final Comparator> ctorComp = new Comparator>() { @Override public int compare(Constructor arg0, Constructor arg1) { @@ -179,28 +192,24 @@ Collections.sort(tokenizers, ctorComp); Collections.sort(tokenfilters, ctorComp); Collections.sort(charfilters, ctorComp); - if (VERBOSE) { System.out.println("tokenizers = " + tokenizers); System.out.println("tokenfilters = " + tokenfilters); System.out.println("charfilters = " + charfilters); } } - @AfterClass public static void afterClass() throws Exception { tokenizers = null; tokenfilters = null; charfilters = null; } - /** Hack to work around the stupidness of Oracle's strict Java backwards compatibility. * {@code Class#getConstructors()} should return unmodifiable {@code List>} not array!
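 *
 * <p>(Editor's note with a hypothetical call, not in the original javadoc:
 * {@code Constructor<TokenFilter> c = castConstructor(TokenFilter.class, ctor);}.
 * The unchecked cast is confined to this one helper, and the loop above only
 * casts after an {@code isAssignableFrom} check on the declaring class, so the
 * cast cannot lie about what the constructor builds.)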
*/ @SuppressWarnings("unchecked") private static Constructor castConstructor(Class instanceClazz, Constructor ctor) { return (Constructor) ctor; } - private static void getClassesForPackage(String pckgname, List> classes) throws Exception { final ClassLoader cld = TestRandomChains.class.getClassLoader(); final String path = pckgname.replace('.', '/'); @@ -541,13 +550,21 @@ public class TestRandomChains extends BaseTokenStreamTestCase { MockRandomAnalyzer(long seed) { this.seed = seed; } + + public boolean offsetsAreCorrect() { + // nocommit: can we not do the full chain here!? + Random random = new Random(seed); + TokenizerSpec tokenizerSpec = newTokenizer(random, new StringReader("")); + TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect); + return filterSpec.offsetsAreCorrect; + } @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Random random = new Random(seed); - TokenizerSpec tokenizerspec = newTokenizer(random, reader); - TokenFilterSpec filterspec = newFilterChain(random, tokenizerspec.tokenizer); - return new TokenStreamComponents(tokenizerspec.tokenizer, filterspec.stream); + TokenizerSpec tokenizerSpec = newTokenizer(random, reader); + TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect); + return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream); } @Override @@ -561,19 +578,21 @@ public class TestRandomChains extends BaseTokenStreamTestCase { public String toString() { Random random = new Random(seed); StringBuilder sb = new StringBuilder(); - CharFilterSpec charfilterSpec = newCharFilterChain(random, new StringReader("")); + CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader("")); sb.append("\ncharfilters="); - sb.append(charfilterSpec.toString); + sb.append(charFilterSpec.toString); // intentional: initReader gets its own separate random random = new Random(seed); - TokenizerSpec tokenizerSpec = newTokenizer(random, charfilterSpec.reader); + TokenizerSpec tokenizerSpec = newTokenizer(random, charFilterSpec.reader); sb.append("\n"); sb.append("tokenizer="); sb.append(tokenizerSpec.toString); - TokenFilterSpec tokenfilterSpec = newFilterChain(random, tokenizerSpec.tokenizer); + TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect); sb.append("\n"); sb.append("filters="); - sb.append(tokenfilterSpec.toString); + sb.append(tokenFilterSpec.toString); + sb.append("\n"); + sb.append("offsetsAreCorrect=" + tokenFilterSpec.offsetsAreCorrect); return sb.toString(); } @@ -620,6 +639,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase { final CheckThatYouDidntReadAnythingReaderWrapper wrapper = new CheckThatYouDidntReadAnythingReaderWrapper(reader); final Object args[] = newTokenizerArgs(random, wrapper, ctor.getParameterTypes()); spec.tokenizer = createComponent(ctor, args, descr); + if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) { + spec.offsetsAreCorrect = false; + } if (spec.tokenizer == null) { assertFalse(ctor.getDeclaringClass().getName() + " has read something in ctor but failed with UOE/IAE", wrapper.readSomething); } @@ -648,8 +670,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase { return spec; } - private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) { + private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean 
offsetsAreCorrect) { TokenFilterSpec spec = new TokenFilterSpec(); + spec.offsetsAreCorrect = offsetsAreCorrect; spec.stream = tokenizer; StringBuilder descr = new StringBuilder(); int numFilters = random.nextInt(5); @@ -658,13 +681,16 @@ public class TestRandomChains extends BaseTokenStreamTestCase { // Insert ValidatingTF after each stage so we can // catch problems right after the TF that "caused" // them: - spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i); + spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect); while (true) { final Constructor ctor = tokenfilters.get(random.nextInt(tokenfilters.size())); final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes()); final TokenFilter flt = createComponent(ctor, args, descr); if (flt != null) { + if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) { + spec.offsetsAreCorrect = false; + } spec.stream = flt; break; } @@ -674,7 +700,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { // Insert ValidatingTF after each stage so we can // catch problems right after the TF that "caused" // them: - spec.stream = new ValidatingTokenFilter(spec.stream, "last stage"); + spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect); spec.toString = descr.toString(); return spec; @@ -722,11 +748,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase { static class TokenizerSpec { Tokenizer tokenizer; String toString; + boolean offsetsAreCorrect = true; } static class TokenFilterSpec { TokenStream stream; String toString; + boolean offsetsAreCorrect = true; } static class CharFilterSpec { @@ -743,7 +771,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase { System.out.println("Creating random analyzer:" + a); } try { - checkRandomData(random, a, 1000); + checkRandomData(random, a, 1000, 20, false, + false /* We already validate our own offsets... 
*/); } catch (Throwable e) { System.err.println("Exception from random analyzer: " + a); throw e; diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java index 0179b94e353..e3e8813601e 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java @@ -65,7 +65,11 @@ public class TestTrimFilter extends BaseTokenStreamTestCase { new String[] { "a", "b", "c", "" }, new int[] { 1, 0, 1, 3 }, new int[] { 2, 1, 2, 3 }, - new int[] { 1, 1, 1, 1 }); + null, + new int[] { 1, 1, 1, 1 }, + null, + null, + false); } /** diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java index 754116c4f60..54e68ab77e8 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java @@ -72,14 +72,16 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { assertTokenStreamContents(wdf, new String[] { "foo", "bar", "foobar" }, new int[] { 5, 9, 5 }, - new int[] { 8, 12, 12 }); + new int[] { 8, 12, 12 }, + null, null, null, null, false); wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null); assertTokenStreamContents(wdf, new String[] { "foo", "bar", "foobar" }, new int[] { 5, 5, 5 }, - new int[] { 6, 6, 6 }); + new int[] { 6, 6, 6 }, + null, null, null, null, false); } @Test @@ -123,7 +125,8 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { assertTokenStreamContents(wdf, new String[] { "foo", "bar", "foobar"}, new int[] { 8, 12, 8 }, - new int[] { 11, 15, 15 }); + new int[] { 11, 15, 15 }, + null, null, null, null, false); } public void doSplit(final String input, String... output) throws Exception { @@ -230,18 +233,27 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 }, new int[] { 6, 13 }, - new int[] { 1, 1 }); + null, + new int[] { 1, 1 }, + null, + false); /* only in this case, posInc of 2 ?! */ assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" }, new int[] { 0, 9, 12, 9 }, new int[] { 6, 12, 13, 13 }, - new int[] { 1, 1, 1, 0 }); + null, + new int[] { 1, 1, 1, 0 }, + null, + false); assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, - new int[] { 1, 1, 1 }); + null, + new int[] { 1, 1, 1 }, + null, + false); /* analyzer that will consume tokens with large position increments */ Analyzer a2 = new Analyzer() { @@ -258,24 +270,36 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" }, new int[] { 0, 7, 16 }, new int[] { 6, 15, 20 }, - new int[] { 1, 10, 1 }); + null, + new int[] { 1, 10, 1 }, + null, + false); /* the "/" had a position increment of 10, where did it go?!?!! 
*/ assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 }, new int[] { 6, 13 }, - new int[] { 1, 11 }); + null, + new int[] { 1, 11 }, + null, + false); /* in this case, the increment of 10 from the "/" is carried over */ assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" }, new int[] { 0, 9, 12, 9 }, new int[] { 6, 12, 13, 13 }, - new int[] { 1, 11, 1, 0 }); + null, + new int[] { 1, 11, 1, 0 }, + null, + false); assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, - new int[] { 1, 11, 1 }); + null, + new int[] { 1, 11, 1 }, + null, + false); Analyzer a3 = new Analyzer() { @Override @@ -292,14 +316,20 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { new String[] { "lucene", "solr", "lucenesolr" }, new int[] { 0, 7, 0 }, new int[] { 6, 11, 11 }, - new int[] { 1, 1, 0 }); + null, + new int[] { 1, 1, 0 }, + null, + false); /* the stopword should add a gap here */ assertAnalyzesTo(a3, "the lucene.solr", new String[] { "lucene", "solr", "lucenesolr" }, new int[] { 4, 11, 4 }, new int[] { 10, 15, 15 }, - new int[] { 2, 1, 0 }); + null, + new int[] { 2, 1, 0 }, + null, + false); } /** blast some random strings through the analyzer */ @@ -322,7 +352,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords)); } }; - checkRandomData(random, a, 10000*RANDOM_MULTIPLIER); + checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false); } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java index e8e7f6cf4ad..adb887059fc 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java @@ -94,7 +94,15 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { public void testBackRangeOfNgrams() throws Exception { EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3); - assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}); + assertTokenStreamContents(tokenizer, + new String[]{"e","de","cde"}, + new int[]{4,3,2}, + new int[]{5,5,5}, + null, + null, + null, + null, + false); } public void testSmallTokenInStream() throws Exception { @@ -151,7 +159,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase { new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15)); } }; - checkRandomData(random, b, 10000*RANDOM_MULTIPLIER); + checkRandomData(random, b, 10000*RANDOM_MULTIPLIER, 20, false, false); } public void testEmptyTerm() throws Exception { diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java index 90611a1f2ec..158c603a91c 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java @@ -90,7 +90,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase { 
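  // (Editor's note, not in the original test, illustrating why the BACK
  // variants sit on the broken-offsets list: for "abcde" with gram sizes 1..3
  // the tokenizer below emits
  //   "e"   -> startOffset=4, endOffset=5
  //   "de"  -> startOffset=3, endOffset=5
  //   "cde" -> startOffset=2, endOffset=5
  // so each token starts before the previous one, exactly the "offsets go
  // backwards" condition the new checks reject; hence the trailing false
  // passed to the assertions and checkRandomData calls below.)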
public void testBackRangeOfNgrams() throws Exception { EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3); - assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, 5 /* abcde */); + assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, null, null, null, 5 /* abcde */, false); } public void testReset() throws Exception { @@ -109,8 +109,8 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase { return new TokenStreamComponents(tokenizer, tokenizer); } }; - checkRandomData(random, a, 10000*RANDOM_MULTIPLIER); - checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192); + checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false); + checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, false); Analyzer b = new Analyzer() { @Override @@ -119,7 +119,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase { return new TokenStreamComponents(tokenizer, tokenizer); } }; - checkRandomData(random, b, 10000*RANDOM_MULTIPLIER); - checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192); + checkRandomData(random, b, 10000*RANDOM_MULTIPLIER, 20, false, false); + checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192, false, false); } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java index 3375c027057..f5f3071e43f 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java @@ -77,7 +77,8 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { assertTokenStreamContents(filter, new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2}, - new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5} + new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}, + null, null, null, null, false ); } @@ -130,7 +131,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { new NGramTokenFilter(tokenizer, 2, 15)); } }; - checkRandomData(random, a, 10000*RANDOM_MULTIPLIER); + checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false); } public void testEmptyTerm() throws Exception { diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java index 9dd3c65723f..86a97828e6c 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java @@ -73,7 +73,11 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2}, new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}, - 5 /* abcde */ + null, + null, + null, + 5 /* abcde */, + false ); } @@ -98,7 +102,7 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { return new TokenStreamComponents(tokenizer, tokenizer); } }; - checkRandomData(random, a, 10000*RANDOM_MULTIPLIER); - checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192); + checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false); + checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, 
false); } } From ad994d8281d745a9758194d9ed1e38456e337828 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 10 Apr 2012 17:02:11 +0000 Subject: [PATCH 27/40] LUCENE-3969: promote edgeNgrams from 'totally broken list' to 'broken offsets list' git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311869 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/core/TestRandomChains.java | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 7034834665a..00190dd8ea3 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -113,11 +113,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { LimitTokenCountFilter.class, // Not broken: we forcefully add this, so we shouldn't // also randomly pick it: - ValidatingTokenFilter.class, - // nocommit: randomly generate the Side enum param here; then promote to brokenOffsets? - EdgeNGramTokenizer.class, - // nocommit: randomly generate the Side enum param here; then promote to brokenOffsets? - EdgeNGramTokenFilter.class + ValidatingTokenFilter.class ); } @@ -138,6 +134,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase { NGramTokenizer.class, // broken! NGramTokenFilter.class, + // broken! + EdgeNGramTokenizer.class, + // broken! + EdgeNGramTokenFilter.class, // nocommit it seems to mess up offsets!? WikipediaTokenizer.class ); @@ -356,6 +356,20 @@ public class TestRandomChains extends BaseTokenStreamTestCase { } } }); + put(EdgeNGramTokenizer.Side.class, new ArgProducer() { + @Override public Object create(Random random) { + return random.nextBoolean() + ? EdgeNGramTokenizer.Side.FRONT + : EdgeNGramTokenizer.Side.BACK; + } + }); + put(EdgeNGramTokenFilter.Side.class, new ArgProducer() { + @Override public Object create(Random random) { + return random.nextBoolean() + ? EdgeNGramTokenFilter.Side.FRONT + : EdgeNGramTokenFilter.Side.BACK; + } + }); put(HyphenationTree.class, new ArgProducer() { @Override public Object create(Random random) { // TODO: make nastier From c58dfd5516e47c2b19f7db1807eb82428817ccd7 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 10 Apr 2012 18:36:34 +0000 Subject: [PATCH 28/40] LUCENE-3969: demote the n-grams again (with explanation) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311915 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/core/TestRandomChains.java | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 00190dd8ea3..2dac8f3b8ba 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -113,7 +113,22 @@ public class TestRandomChains extends BaseTokenStreamTestCase { LimitTokenCountFilter.class, // Not broken: we forcefully add this, so we shouldn't // also randomly pick it: - ValidatingTokenFilter.class + ValidatingTokenFilter.class, + // NOTE: these by themselves won't cause any 'basic assertions' to fail. + // but see https://issues.apache.org/jira/browse/LUCENE-3920, if any + // tokenfilter that combines words (e.g. shingles) comes after them, + // this will create bogus offsets because their 'offsets go backwards', + // causing shingle or whatever to make a single token with a + // startOffset that's > its endOffset + // (see LUCENE-3738 for a list of other offenders here) + // broken! + NGramTokenizer.class, + // broken! + NGramTokenFilter.class, + // broken! + EdgeNGramTokenizer.class, + // broken! + EdgeNGramTokenFilter.class ); } @@ -130,14 +145,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase { DictionaryCompoundWordTokenFilter.class, // nocommit: corrupts graphs (offset consistency check): PositionFilter.class, - // broken! - NGramTokenizer.class, - // broken! - NGramTokenFilter.class, - // broken! - EdgeNGramTokenizer.class, - // broken! - EdgeNGramTokenFilter.class, // nocommit it seems to mess up offsets!? WikipediaTokenizer.class ); From 842a54c29054b25b011212af81bf55209740f0ff Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Tue, 10 Apr 2012 18:50:54 +0000 Subject: [PATCH 29/40] LUCENE-3969: revert Whitespace git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311920 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/core/TestRandomChains.java | 81 ++++++++++--------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 2dac8f3b8ba..3ba7ecb4638 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -105,30 +105,30 @@ public class TestRandomChains extends BaseTokenStreamTestCase { // nocommit can we promote some of these to be only // offsets offenders? Collections.>addAll(brokenComponents, - // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt - EmptyTokenizer.class, - // doesn't actually reset itself! - CachingTokenFilter.class, - // doesn't consume whole stream! - LimitTokenCountFilter.class, - // Not broken: we forcefully add this, so we shouldn't - // also randomly pick it: - ValidatingTokenFilter.class, - // NOTE: these by themselves won't cause any 'basic assertions' to fail. - // but see https://issues.apache.org/jira/browse/LUCENE-3920, if any - // tokenfilter that combines words (e.g. shingles) comes after them, - // this will create bogus offsets because their 'offsets go backwards', - // causing shingle or whatever to make a single token with a - // startOffset that's > its endOffset - // (see LUCENE-3738 for a list of other offenders here) - // broken! - NGramTokenizer.class, - // broken! - NGramTokenFilter.class, - // broken! - EdgeNGramTokenizer.class, - // broken! - EdgeNGramTokenFilter.class + // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt + EmptyTokenizer.class, + // doesn't actually reset itself! + CachingTokenFilter.class, + // doesn't consume whole stream! + LimitTokenCountFilter.class, + // Not broken: we forcefully add this, so we shouldn't + // also randomly pick it: + ValidatingTokenFilter.class, + // NOTE: these by themselves won't cause any 'basic assertions' to fail. + // but see https://issues.apache.org/jira/browse/LUCENE-3920, if any + // tokenfilter that combines words (e.g. shingles) comes after them, + // this will create bogus offsets because their 'offsets go backwards', + // causing shingle or whatever to make a single token with a + // startOffset that's > its endOffset + // (see LUCENE-3738 for a list of other offenders here) + // broken! + NGramTokenizer.class, + // broken! + NGramTokenFilter.class, + // broken! + EdgeNGramTokenizer.class, + // broken! + EdgeNGramTokenFilter.class ); } @@ -137,18 +137,19 @@ public class TestRandomChains extends BaseTokenStreamTestCase { private static final Set> brokenOffsetsComponents = Collections.newSetFromMap(new IdentityHashMap,Boolean>()); static { Collections.>addAll(brokenOffsetsComponents, - WordDelimiterFilter.class, - TrimFilter.class, - ReversePathHierarchyTokenizer.class, - PathHierarchyTokenizer.class, - HyphenationCompoundWordTokenFilter.class, - DictionaryCompoundWordTokenFilter.class, - // nocommit: corrupts graphs (offset consistency check): - PositionFilter.class, - // nocommit it seems to mess up offsets!? - WikipediaTokenizer.class - ); + WordDelimiterFilter.class, + TrimFilter.class, + ReversePathHierarchyTokenizer.class, + PathHierarchyTokenizer.class, + HyphenationCompoundWordTokenFilter.class, + DictionaryCompoundWordTokenFilter.class, + // nocommit: corrupts graphs (offset consistency check): + PositionFilter.class, + // nocommit it seems to mess up offsets!? + WikipediaTokenizer.class + ); } + @BeforeClass public static void beforeClass() throws Exception { List> analysisClasses = new ArrayList>(); @@ -168,6 +169,7 @@ ) { continue; } + for (final Constructor ctor : c.getConstructors()) { // don't test synthetic or deprecated ctors, they likely have known bugs: if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class)) { continue; } if (Tokenizer.class.isAssignableFrom(c)) { assertTrue(ctor.toGenericString() + " has unsupported parameter types", - allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); + allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); tokenizers.add(castConstructor(Tokenizer.class, ctor)); } else if (TokenFilter.class.isAssignableFrom(c)) { assertTrue(ctor.toGenericString() + " has unsupported parameter types", - allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); + allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); tokenfilters.add(castConstructor(TokenFilter.class, ctor)); } else if (CharStream.class.isAssignableFrom(c)) { assertTrue(ctor.toGenericString() + " has unsupported parameter types", - allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); + allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); charfilters.add(castConstructor(CharStream.class, ctor)); } else { fail("Cannot get here"); } } } + final Comparator> ctorComp = new Comparator>() { @Override public int compare(Constructor arg0, Constructor arg1) { @@ -205,12 +208,14 @@ System.out.println("charfilters = " + charfilters); } } + @AfterClass public static void afterClass() throws Exception { tokenizers = null; tokenfilters = null; charfilters = null; } + /** Hack to work around the stupidness of Oracle's strict Java backwards compatibility. * {@code Class#getConstructors()} should return unmodifiable {@code List>} not array!
*/ @SuppressWarnings("unchecked") From 0cf3c779c6b0f15a6de1d9b2b30b84e66ea6ee33 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 10 Apr 2012 19:20:04 +0000 Subject: [PATCH 30/40] LUCENE-3969: stop iterating random text if a thread hits a failure git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311938 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/BaseTokenStreamTestCase.java | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index a9989ac6845..ae5eef552ac 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -207,7 +207,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { // We've seen a token leaving from this position // before; verify the startOffset is the same: //System.out.println(" + vs " + pos + " -> " + startOffset); - assertEquals(posToStartOffset.get(pos).intValue(), startOffset); + assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset); } final int endPos = pos + posLength; @@ -220,7 +220,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { // We've seen a token arriving to this position // before; verify the endOffset is the same: //System.out.println(" + ve " + endPos + " -> " + endOffset); - assertEquals(posToEndOffset.get(endPos).intValue(), endOffset); + assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToEndOffset.get(endPos).intValue(), endOffset); } } } @@ -386,6 +386,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { final Analyzer a; final boolean simple; final boolean offsetsAreCorrect; + public boolean failed; AnalysisThread(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) { this.random = random; @@ -398,12 +399,16 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { @Override public void run() { + boolean success = false; try { // see the part in checkRandomData where it replays the same text again // to verify reproducability/reuse: hopefully this would catch thread hazards. 
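// (Editor's aside on the failed flag added above, sketch only, not patch
// content: the success/finally idiom records a failure for any throwable,
//   boolean success = false;
//   try { doWork(); success = true; } finally { failed = !success; }
// so unchecked exceptions, which the IOException catch below would never
// see, still mark the thread as failed before it dies.)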
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple, offsetsAreCorrect); + success = true; } catch (IOException e) { Rethrow.rethrow(e); + } finally { + failed = !success; } } }; @@ -416,7 +421,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple, offsetsAreCorrect); // now test with multiple threads int numThreads = _TestUtil.nextInt(random, 4, 8); - Thread threads[] = new Thread[numThreads]; + AnalysisThread threads[] = new AnalysisThread[numThreads]; for (int i = 0; i < threads.length; i++) { threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, maxWordLength, simple, offsetsAreCorrect); } for (int i = 0; i < threads.length; i++) { threads[i].start(); @@ -430,6 +435,11 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { throw new RuntimeException(e); } } + for (int i = 0; i < threads.length; i++) { + if (threads[i].failed) { + throw new RuntimeException("some thread(s) failed"); + } + } } private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) throws IOException { From 6954ba241085944c77365194d59a05f883f1bf35 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 10 Apr 2012 19:31:01 +0000 Subject: [PATCH 31/40] LUCENE-3969: fix BaseTokenStreamTestCase to do the same work in multiple threads that it did in a single thread, so it really shouldn't fail from another thread unless you have an actual thread problem git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311950 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/BaseTokenStreamTestCase.java | 19 ++++++++++++------- .../analysis/core/TestRandomChains.java | 3 +-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index ae5eef552ac..10161e0ab38 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -382,17 +382,19 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { static class AnalysisThread extends Thread { final int iterations; final int maxWordLength; - final Random random; + final long seed; final Analyzer a; + final boolean useCharFilter; final boolean simple; final boolean offsetsAreCorrect; public boolean failed; - AnalysisThread(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) { - this.random = random; + AnalysisThread(long seed, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) { + this.seed = seed; this.a = a; this.iterations = iterations; this.maxWordLength = maxWordLength; + this.useCharFilter = useCharFilter; this.simple = simple; this.offsetsAreCorrect = offsetsAreCorrect; } @@ -403,7 +405,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { try { // see the part in checkRandomData where it replays the same text again // to verify reproducability/reuse: hopefully this would catch thread hazards.
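// (Editor's aside, a sketch of the pattern PATCH 31 completes just below,
// not new patch content: the single-threaded pass and every AnalysisThread
// now consume one shared seed,
//   long seed = random.nextLong();
//   checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
//   threads[i] = new AnalysisThread(seed, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
// so all of them replay the identical pseudo-random text sequence, and a
// failure that appears only with threads points at shared mutable state in
// the analyzer rather than at unlucky random data.)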
- checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple, offsetsAreCorrect); + checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect); success = true; } catch (IOException e) { Rethrow.rethrow(e); @@ -418,12 +420,15 @@ } public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException { - checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple, offsetsAreCorrect); - // now test with multiple threads + long seed = random.nextLong(); + boolean useCharFilter = random.nextBoolean(); + checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect); + // now test with multiple threads: note we do the EXACT same thing we did before in each thread, + // so this should only really fail from another thread if it's an actual thread problem int numThreads = _TestUtil.nextInt(random, 4, 8); AnalysisThread threads[] = new AnalysisThread[numThreads]; for (int i = 0; i < threads.length; i++) { - threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, maxWordLength, simple, offsetsAreCorrect); + threads[i] = new AnalysisThread(seed, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect); } for (int i = 0; i < threads.length; i++) { threads[i].start(); diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 3ba7ecb4638..d6a8c4267bf 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -792,8 +792,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { int numIterations = atLeast(20); for (int i = 0; i < numIterations; i++) { MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong()); - // nocommit: wrap the uncaught handler with our own that prints the analyzer - if (true || VERBOSE) { + if (VERBOSE) { System.out.println("Creating random analyzer:" + a); } try { From 64631a4309e5aba5b5b21e626f47b3a0811619f1 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 10 Apr 2012 19:37:35 +0000 Subject: [PATCH 32/40] LUCENE-3969: fix this filter to reset its seed... how far you peek ahead could cause some producer to fail differently....
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311953 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/MockRandomLookaheadTokenFilter.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockRandomLookaheadTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockRandomLookaheadTokenFilter.java index e47551b28ec..44215e724f7 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockRandomLookaheadTokenFilter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockRandomLookaheadTokenFilter.java @@ -31,10 +31,12 @@ public final class MockRandomLookaheadTokenFilter extends LookaheadTokenFilter Date: Wed, 11 Apr 2012 12:16:31 +0000 Subject: [PATCH 33/40] LUCENE-3969: when outputting a bigram token, mark posLen=2 to note that it spans two tokens git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1324727 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/analysis/commongrams/CommonGramsFilter.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java index 8232b88c2bf..9798464f938 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java @@ -16,6 +16,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.Version; @@ -54,6 +55,7 @@ public final class CommonGramsFilter extends TokenFilter { private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class); private int lastStartOffset; private boolean lastWasCommon; @@ -166,6 +168,7 @@ public final class CommonGramsFilter extends TokenFilter { buffer.getChars(0, length, termText, 0); termAttribute.setLength(length); posIncAttribute.setPositionIncrement(0); + posLenAttribute.setPositionLength(2); // bigram offsetAttribute.setOffset(lastStartOffset, endOffset); typeAttribute.setType(GRAM_TYPE); buffer.setLength(0); From bf2549a27b9fdda9685d6eda1c181e1a1a60c27e Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 11 Apr 2012 12:23:15 +0000 Subject: [PATCH 34/40] LUCENE-3969: add hack for MockGraph's asserts git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1324734 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/analysis/core/TestRandomChains.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java 
b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index d6a8c4267bf..e319e5f821a 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -47,6 +47,7 @@ import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.EmptyTokenizer; +import org.apache.lucene.analysis.MockGraphTokenFilter; import org.apache.lucene.analysis.MockTokenFilter; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; @@ -711,6 +712,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase { while (true) { final Constructor ctor = tokenfilters.get(random.nextInt(tokenfilters.size())); + + // nocommit/hack: MockGraph has assertions that will trip if it follows + // an offsets violator. so we cant use it after e.g. wikipediatokenizer + if (ctor.getDeclaringClass().equals(MockGraphTokenFilter.class) && !spec.offsetsAreCorrect) { + continue; + } + final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes()); final TokenFilter flt = createComponent(ctor, args, descr); if (flt != null) { From 69fafd4791caa513be70e1f1f61665714c58b52f Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 11 Apr 2012 13:05:22 +0000 Subject: [PATCH 35/40] LUCENE-3969: clear this in reset() git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1324747 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/analysis/path/PathHierarchyTokenizer.java | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java index c4450f4878d..37557755d53 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java @@ -195,5 +195,6 @@ public class PathHierarchyTokenizer extends Tokenizer { charsRead = 0; endDelimiter = false; skipped = 0; + startPosition = 0; } } From 14928d42c69c4afa00cb738c3f922fa36f759593 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 11 Apr 2012 13:08:10 +0000 Subject: [PATCH 36/40] LUCENE-3969: add hack for MockLookahead's asserts git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1324749 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/analysis/core/TestRandomChains.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index e319e5f821a..491a1942574 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -48,6 +48,7 @@ import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.EmptyTokenizer; import org.apache.lucene.analysis.MockGraphTokenFilter; +import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter; import org.apache.lucene.analysis.MockTokenFilter; import 
From 14928d42c69c4afa00cb738c3f922fa36f759593 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Wed, 11 Apr 2012 13:08:10 +0000
Subject: [PATCH 36/40] LUCENE-3969: add hack for MockLookahead's asserts

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1324749 13f79535-47bb-0310-9956-ffa450edef68
---
 .../apache/lucene/analysis/core/TestRandomChains.java | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index e319e5f821a..491a1942574 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -48,6 +48,7 @@ import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.EmptyTokenizer;
 import org.apache.lucene.analysis.MockGraphTokenFilter;
+import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
@@ -713,9 +714,11 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       while (true) {
         final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));

-        // nocommit/hack: MockGraph has assertions that will trip if it follows
-        // an offsets violator. so we cant use it after e.g. wikipediatokenizer
-        if (ctor.getDeclaringClass().equals(MockGraphTokenFilter.class) && !spec.offsetsAreCorrect) {
+        // nocommit/hack: MockGraph/MockLookahead has assertions that will trip if they follow
+        // an offsets violator. so we cant use them after e.g. wikipediatokenizer
+        if (!spec.offsetsAreCorrect &&
+            (ctor.getDeclaringClass().equals(MockGraphTokenFilter.class)
+            || ctor.getDeclaringClass().equals(MockRandomLookaheadTokenFilter.class))) {
           continue;
         }
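Aside: the skip works because the test holds Constructor objects, and Constructor.getDeclaringClass() recovers which component class a constructor builds, so individual filters can be vetoed before instantiation. A self-contained sketch of that reflection detail (all names hypothetical):

    import java.lang.reflect.Constructor;
    import java.util.List;
    import java.util.Random;

    public class CtorPicker {
      // draws constructors at random, retrying until a non-vetoed class is hit
      static Constructor<?> pick(Random random, List<Constructor<?>> ctors, Class<?>... vetoed) {
        while (true) {
          Constructor<?> ctor = ctors.get(random.nextInt(ctors.size()));
          boolean skip = false;
          for (Class<?> c : vetoed) {
            if (ctor.getDeclaringClass().equals(c)) { // Class equality is reference identity
              skip = true;
              break;
            }
          }
          if (!skip) {
            return ctor;
          }
        }
      }
    }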
From 974ea5ee34bcb3adc2fabc0174ba0a4f9062c036 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Wed, 11 Apr 2012 13:15:33 +0000
Subject: [PATCH 37/40] LUCENE-3969: add mappingcharfilter to broken list until its bug is fixed

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1324751 13f79535-47bb-0310-9956-ffa450edef68
---
 .../org/apache/lucene/analysis/core/TestRandomChains.java | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 491a1942574..80e4a40f364 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -57,6 +57,7 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
 import org.apache.lucene.analysis.ValidatingTokenFilter;
 import org.apache.lucene.analysis.charfilter.CharFilter;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
 import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
@@ -130,7 +131,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       // broken!
       EdgeNGramTokenizer.class,
       // broken!
-      EdgeNGramTokenFilter.class
+      EdgeNGramTokenFilter.class,
+      // nocommit: remove this class after we fix its finalOffset bug
+      MappingCharFilter.class
     );
   }

From 5475644b59318e2016f000c993de08a0bcf317a1 Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Wed, 11 Apr 2012 14:20:35 +0000
Subject: [PATCH 38/40] LUCENE-3969: add comment

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1324777 13f79535-47bb-0310-9956-ffa450edef68
---
 .../org/apache/lucene/analysis/BaseTokenStreamTestCase.java | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index 10161e0ab38..6978b77506b 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -387,6 +387,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     final boolean useCharFilter;
     final boolean simple;
     final boolean offsetsAreCorrect;
+
+    // NOTE: not volatile because we don't want the tests to
+    // add memory barriers (ie alter how threads
+    // interact)... so this is just "best effort":
     public boolean failed;

     AnalysisThread(long seed, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) {
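Aside: the comment added above records a deliberate concurrency trade-off worth spelling out. Declaring failed volatile would insert memory barriers and thereby change the thread interleavings the random test is trying to exercise; leaving it plain keeps the test faithful, and the flag can still be read reliably once the threads are joined, because Thread.join() establishes a happens-before edge. A compressed, self-contained sketch of that idea (class and method names hypothetical):

    public class BestEffortFlagDemo extends Thread {
      boolean failed; // intentionally NOT volatile: no extra memory barriers

      @Override
      public void run() {
        try {
          // random analysis work would go here
        } catch (Throwable t) {
          failed = true; // best-effort signal while other threads still run
        }
      }

      public static void main(String[] args) throws InterruptedException {
        BestEffortFlagDemo[] workers = new BestEffortFlagDemo[4];
        for (int i = 0; i < workers.length; i++) {
          (workers[i] = new BestEffortFlagDemo()).start();
        }
        for (BestEffortFlagDemo w : workers) {
          w.join(); // join() gives happens-before: reads of failed below are safe
        }
        for (BestEffortFlagDemo w : workers) {
          if (w.failed) {
            throw new IllegalStateException("a worker failed");
          }
        }
      }
    }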
From c845af549702f8bb4e44b3066aff0a1652482f29 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Wed, 11 Apr 2012 16:01:07 +0000
Subject: [PATCH 39/40] LUCENE-3969: clean up nocommits

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1324834 13f79535-47bb-0310-9956-ffa450edef68
---
 .../analysis/ValidatingTokenFilter.java             |  6 ++--
 .../HyphenationCompoundWordTokenFilter.java         |  2 +-
 .../path/ReversePathHierarchyTokenizer.java         |  1 -
 .../charfilter/TestMappingCharFilter.java           |  6 ++--
 .../analysis/core/TestRandomChains.java             | 28 +++++++++++--------
 5 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
index 976f0ff950e..f213545511c 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
@@ -27,13 +27,13 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.util.Attribute;

-// nocommit rename to OffsetsXXXTF? ie we only validate
+// TODO: rename to OffsetsXXXTF? ie we only validate
 // offsets (now anyway...)

 // TODO: also make a DebuggingTokenFilter, that just prints
 // all att values that come through it...

-// nocommit BTSTC should just append this to the chain
+// TODO: BTSTC should just append this to the chain
 // instead of checking itself:

 /** A TokenFilter that checks consistency of the tokens (eg
@@ -155,7 +155,7 @@ public final class ValidatingTokenFilter extends TokenFilter {

     // TODO: what else to validate

-    // nocommit check that endOffset is >= max(endOffset)
+    // TODO: check that endOffset is >= max(endOffset)
     // we've seen
   }

diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
index a71352db1f7..71d317b0cc5 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
@@ -191,7 +191,7 @@ public class HyphenationCompoundWordTokenFilter extends
         // we only put subwords to the token stream
         // that are longer than minPartSize
         if (partLength < this.minSubwordSize) {
-          // nocommit/BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
+          // BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
           // calculation above, and we rely upon minSubwordSize being >=0 to filter them out...
           continue;
         }

diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
index 759c48c7cd6..97593c6377e 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
@@ -81,7 +81,6 @@ public class ReversePathHierarchyTokenizer extends Tokenizer {
       throw new IllegalArgumentException("bufferSize cannot be negative");
     }
     if (skip < 0) {
-      // nocommit: not quite right right here: see line 84... if skip > numTokensFound we always get a NegativeArrayException? needs fixing!
       throw new IllegalArgumentException("skip cannot be negative");
    }
     termAtt.resizeBuffer(bufferSize);
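Aside: the ReversePathHierarchyTokenizer hunk above shows the convention these cleanups standardize on: validate constructor arguments eagerly and throw IllegalArgumentException, because an assert simply vanishes when assertions are disabled. A generic sketch of the idiom (hypothetical tokenizer, not the real class):

    import java.io.Reader;

    import org.apache.lucene.analysis.Tokenizer;

    public final class CheckedTokenizer extends Tokenizer {
      public CheckedTokenizer(Reader input, int bufferSize, int skip) {
        super(input);
        // fail fast with IAE; unlike an assert, this also fires in production
        if (bufferSize < 0) {
          throw new IllegalArgumentException("bufferSize cannot be negative");
        }
        if (skip < 0) {
          throw new IllegalArgumentException("skip cannot be negative");
        }
      }

      @Override
      public boolean incrementToken() {
        return false; // real tokenization logic elided
      }
    }

(The removed nocommit also records a hole that outlives this cleanup: arguments can pass these static checks and still be invalid relative to the input, e.g. a skip larger than the number of tokens actually found.)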
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
index 71986253cee..fa77b400079 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
@@ -195,8 +195,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
     checkRandomData(random, analyzer, numRounds);
   }

-  // nocommit: wrong final offset, fix this!
-  @Ignore
+  @Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
   public void testFinalOffsetSpecialCase() throws Exception {
     final NormalizeCharMap map = new NormalizeCharMap();
     map.add("t", "");
@@ -220,8 +219,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
     checkAnalysisConsistency(random, analyzer, false, text);
   }

-  // nocommit: this is intended to fail until we fix bugs
-  @Ignore
+  @Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
   public void testRandomMaps() throws Exception {
     for (int i = 0; i < 100; i++) {
       final NormalizeCharMap map = randomMap();

diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 80e4a40f364..46c856374c2 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -81,6 +81,7 @@ import org.apache.lucene.analysis.position.PositionFilter;
 import org.apache.lucene.analysis.snowball.TestSnowball;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.th.ThaiWordFilter;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.AttributeSource.AttributeFactory;
@@ -105,7 +106,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
   // TODO: fix those and remove
   private static final Set<Class<?>> brokenComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
   static {
-    // nocommit can we promote some of these to be only
+    // TODO: can we promote some of these to be only
     // offsets offenders?
     Collections.<Class<?>>addAll(brokenComponents,
       // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
@@ -132,7 +133,11 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       EdgeNGramTokenizer.class,
       // broken!
       EdgeNGramTokenFilter.class,
-      // nocommit: remove this class after we fix its finalOffset bug
+      // broken!
+      WordDelimiterFilter.class,
+      // broken!
+      TrimFilter.class,
+      // TODO: remove this class after we fix its finalOffset bug
       MappingCharFilter.class
     );
   }
@@ -142,16 +147,16 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
   private static final Set<Class<?>> brokenOffsetsComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
   static {
     Collections.<Class<?>>addAll(brokenOffsetsComponents,
-      WordDelimiterFilter.class,
-      TrimFilter.class,
       ReversePathHierarchyTokenizer.class,
       PathHierarchyTokenizer.class,
       HyphenationCompoundWordTokenFilter.class,
       DictionaryCompoundWordTokenFilter.class,
-      // nocommit: corrumpts graphs (offset consistency check):
+      // TODO: corrumpts graphs (offset consistency check):
       PositionFilter.class,
-      // nocommit it seems to mess up offsets!?
-      WikipediaTokenizer.class
+      // TODO: it seems to mess up offsets!?
+      WikipediaTokenizer.class,
+      // TODO: doesn't handle graph inputs
+      ThaiWordFilter.class
     );
   }
@@ -271,7 +276,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
     });
     put(char.class, new ArgProducer() {
       @Override public Object create(Random random) {
-        // nocommit: fix any filters that care to throw IAE instead.
+        // TODO: fix any filters that care to throw IAE instead.
+        // also add a unicode validating filter to validate termAtt?
         // return Character.valueOf((char)random.nextInt(65536));
         while(true) {
           char c = (char)random.nextInt(65536);
@@ -534,7 +540,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
         // TODO: maybe the collator one...???
         args[i] = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
       } else if (paramType == AttributeSource.class) {
-        // nocommit: args[i] = new AttributeSource();
+        // TODO: args[i] = new AttributeSource();
         // this is currently too scary to deal with!
         args[i] = null; // force IAE
       } else {
@@ -583,7 +589,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
   }

   public boolean offsetsAreCorrect() {
-    // nocommit: can we not do the full chain here!?
+    // TODO: can we not do the full chain here!?
     Random random = new Random(seed);
     TokenizerSpec tokenizerSpec = newTokenizer(random, new StringReader(""));
     TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
@@ -717,7 +723,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       while (true) {
         final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));

-        // nocommit/hack: MockGraph/MockLookahead has assertions that will trip if they follow
+        // hack: MockGraph/MockLookahead has assertions that will trip if they follow
         // an offsets violator. so we cant use them after e.g. wikipediatokenizer
         if (!spec.offsetsAreCorrect &&
             (ctor.getDeclaringClass().equals(MockGraphTokenFilter.class)

From a1c1ac512b9a26c1c4ebc86d5cd9b0a453056a18 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Wed, 11 Apr 2012 19:30:25 +0000
Subject: [PATCH 40/40] LUCENE-3969: this filter currently doesnt handle graph inputs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1324930 13f79535-47bb-0310-9956-ffa450edef68
---
 .../org/apache/lucene/analysis/core/TestRandomChains.java | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 46c856374c2..016b1077c13 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -59,6 +59,7 @@ import org.apache.lucene.analysis.ValidatingTokenFilter;
 import org.apache.lucene.analysis.charfilter.CharFilter;
 import org.apache.lucene.analysis.charfilter.MappingCharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.cjk.CJKBigramFilter;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
 import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
 import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
@@ -156,7 +157,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       // TODO: it seems to mess up offsets!?
       WikipediaTokenizer.class,
       // TODO: doesn't handle graph inputs
-      ThaiWordFilter.class
+      ThaiWordFilter.class,
+      // TODO: doesn't handle graph inputs
+      CJKBigramFilter.class
    );
  }
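Aside: the broken/brokenOffsets lists these last patches keep growing are identity-based sets built with Collections.newSetFromMap over an IdentityHashMap, so membership is a reference comparison on the Class object. A self-contained sketch of that pattern with placeholder entries (the real test lists analysis components):

    import java.util.Collections;
    import java.util.IdentityHashMap;
    import java.util.Set;

    public class ExclusionList {
      // identity-based Set<Class<?>>: contains() compares Class references
      private static final Set<Class<?>> BROKEN =
          Collections.newSetFromMap(new IdentityHashMap<Class<?>, Boolean>());

      static {
        BROKEN.add(StringBuilder.class); // placeholder entry
      }

      public static boolean isBroken(Class<?> c) {
        return BROKEN.contains(c);
      }

      public static void main(String[] args) {
        System.out.println(isBroken(StringBuilder.class)); // true
        System.out.println(isBroken(String.class));        // false
      }
    }

Since each Class object is unique per class loader, identity comparison is both exact and cheap here, which is why the test prefers an IdentityHashMap-backed set over a plain HashSet.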