From a03e38d5d05008aaef969a200071c03a1d6cb991 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 26 Apr 2013 11:03:26 +0000 Subject: [PATCH] LUCENE-4955: Fix NGramTokenizer and NGramTokenFilter, and remove them from TestRandomChains' exclusion list. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1476135 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 13 ++ .../ngram/Lucene43NGramTokenizer.java | 155 ++++++++++++++++++ .../analysis/ngram/NGramFilterFactory.java | 2 +- .../analysis/ngram/NGramTokenFilter.java | 103 +++++++++--- .../lucene/analysis/ngram/NGramTokenizer.java | 151 +++++++++-------- .../analysis/ngram/NGramTokenizerFactory.java | 10 +- .../analysis/core/TestRandomChains.java | 15 +- .../analysis/ngram/NGramTokenFilterTest.java | 76 ++++++--- .../analysis/ngram/NGramTokenizerTest.java | 69 ++++++-- .../analysis/ngram/TestNGramFilters.java | 8 +- 10 files changed, 466 insertions(+), 136 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 39c875604e2..ee34d6edb90 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -37,6 +37,16 @@ Optimizations ======================= Lucene 4.4.0 ======================= +Changes in backwards compatibility policy + +* LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the + same position and preserves the position length and the offsets of the + original token. (Simon Willnauer, Adrien Grand) + +* LUCENE-4955: NGramTokenizer now emits n-grams in a different order + (a, ab, b, bc, c) instead of (a, b, c, ab, bc) and doesn't trim trailing + whitespaces. (Adrien Grand) + Bug Fixes * LUCENE-4935: CustomScoreQuery wrongly applied its query boost twice @@ -46,6 +56,9 @@ Bug Fixes if you had a 64-bit JVM without compressed OOPS: IBM J9, or Oracle with large heap/explicitly disabled. (Mike McCandless, Uwe Schindler, Robert Muir) +* LUCENE-4955: NGramTokenizer now supports inputs larger than 1024 chars. + (Adrien Grand) + Optimizations * LUCENE-4938: Don't use an unnecessarily large priority queue in IndexSearcher diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java new file mode 100644 index 00000000000..25693da23ba --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java @@ -0,0 +1,155 @@ +package org.apache.lucene.analysis.ngram; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * Old broken version of {@link NGramTokenizer}. + */ +@Deprecated +public final class Lucene43NGramTokenizer extends Tokenizer { + public static final int DEFAULT_MIN_NGRAM_SIZE = 1; + public static final int DEFAULT_MAX_NGRAM_SIZE = 2; + + private int minGram, maxGram; + private int gramSize; + private int pos; + private int inLen; // length of the input AFTER trim() + private int charsRead; // length of the input + private String inStr; + private boolean started; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + /** + * Creates NGramTokenizer with given min and max n-grams. + * @param input {@link Reader} holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public Lucene43NGramTokenizer(Reader input, int minGram, int maxGram) { + super(input); + init(minGram, maxGram); + } + + /** + * Creates NGramTokenizer with given min and max n-grams. + * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use + * @param input {@link Reader} holding the input to be tokenized + * @param minGram the smallest n-gram to generate + * @param maxGram the largest n-gram to generate + */ + public Lucene43NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) { + super(factory, input); + init(minGram, maxGram); + } + + /** + * Creates NGramTokenizer with default min and max n-grams. + * @param input {@link Reader} holding the input to be tokenized + */ + public Lucene43NGramTokenizer(Reader input) { + this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); + } + + private void init(int minGram, int maxGram) { + if (minGram < 1) { + throw new IllegalArgumentException("minGram must be greater than zero"); + } + if (minGram > maxGram) { + throw new IllegalArgumentException("minGram must not be greater than maxGram"); + } + this.minGram = minGram; + this.maxGram = maxGram; + } + + /** Returns the next token in the stream, or null at EOS. 
*/ + @Override + public boolean incrementToken() throws IOException { + clearAttributes(); + if (!started) { + started = true; + gramSize = minGram; + char[] chars = new char[1024]; + charsRead = 0; + // TODO: refactor to a shared readFully somewhere: + while (charsRead < chars.length) { + int inc = input.read(chars, charsRead, chars.length-charsRead); + if (inc == -1) { + break; + } + charsRead += inc; + } + inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings + + if (charsRead == chars.length) { + // Read extra throwaway chars so that on end() we + // report the correct offset: + char[] throwaway = new char[1024]; + while(true) { + final int inc = input.read(throwaway, 0, throwaway.length); + if (inc == -1) { + break; + } + charsRead += inc; + } + } + + inLen = inStr.length(); + if (inLen == 0) { + return false; + } + } + + if (pos+gramSize > inLen) { // if we hit the end of the string + pos = 0; // reset to beginning of string + gramSize++; // increase n-gram size + if (gramSize > maxGram) // we are done + return false; + if (pos+gramSize > inLen) + return false; + } + + int oldPos = pos; + pos++; + termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize); + offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize)); + return true; + } + + @Override + public void end() { + // set final offset + final int finalOffset = correctOffset(charsRead); + this.offsetAtt.setOffset(finalOffset, finalOffset); + } + + @Override + public void reset() throws IOException { + super.reset(); + started = false; + pos = 0; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java index 42b3934cb62..60398bdf4b2 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java @@ -47,6 +47,6 @@ public class NGramFilterFactory extends TokenFilterFactory { @Override public NGramTokenFilter create(TokenStream input) { - return new NGramTokenFilter(input, minGramSize, maxGramSize); + return new NGramTokenFilter(luceneMatchVersion, input, minGramSize, maxGramSize); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java index ebb6e1290a5..30c2454852f 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java @@ -21,37 +21,60 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.miscellaneous.LengthFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.Version; /** * Tokenizes the input into n-grams of the given size(s). + * + *

+ * <p>You must specify the required {@link Version} compatibility when
+ * creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filter:
+ * <ul>
+ * <li>emits all n-grams for the same token at the same position,</li>
+ * <li>preserves the position length and the offsets of the original
+ * token.</li>
+ * </ul>
+ * <p>
You can make this filter use the old behavior by providing a version < + * {@link Version#LUCENE_44} in the constructor but this is not recommended as + * it will lead to broken {@link TokenStream}s that will cause highlighting + * bugs. */ public final class NGramTokenFilter extends TokenFilter { public static final int DEFAULT_MIN_NGRAM_SIZE = 1; public static final int DEFAULT_MAX_NGRAM_SIZE = 2; - private int minGram, maxGram; - + private final int minGram, maxGram; + private char[] curTermBuffer; private int curTermLength; private int curGramSize; private int curPos; + private int curPosInc, curPosLen; private int tokStart; - private int tokEnd; // only used if the length changed before this filter + private int tokEnd; private boolean hasIllegalOffsets; // only if the length changed before this filter - + + private final Version version; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAtt; + private final PositionLengthAttribute posLenAtt; private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); /** * Creates NGramTokenFilter with given min and max n-grams. + * @param version Lucene version to enable correct position increments. + * See above for details. * @param input {@link TokenStream} holding the input to be tokenized * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ - public NGramTokenFilter(TokenStream input, int minGram, int maxGram) { - super(input); + public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) { + super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE)); + this.version = version; if (minGram < 1) { throw new IllegalArgumentException("minGram must be greater than zero"); } @@ -60,14 +83,37 @@ public final class NGramTokenFilter extends TokenFilter { } this.minGram = minGram; this.maxGram = maxGram; + if (version.onOrAfter(Version.LUCENE_44)) { + posIncAtt = addAttribute(PositionIncrementAttribute.class); + posLenAtt = addAttribute(PositionLengthAttribute.class); + } else { + posIncAtt = new PositionIncrementAttribute() { + @Override + public void setPositionIncrement(int positionIncrement) {} + @Override + public int getPositionIncrement() { + return 0; + } + }; + posLenAtt = new PositionLengthAttribute() { + @Override + public void setPositionLength(int positionLength) {} + @Override + public int getPositionLength() { + return 0; + } + }; + } } /** * Creates NGramTokenFilter with default min and max n-grams. + * @param version Lucene version to enable correct position increments. + * See above for details. * @param input {@link TokenStream} holding the input to be tokenized */ - public NGramTokenFilter(TokenStream input) { - this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); + public NGramTokenFilter(Version version, TokenStream input) { + this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); } /** Returns the next token in the stream, or null at EOS. 
*/ @@ -82,6 +128,8 @@ public final class NGramTokenFilter extends TokenFilter { curTermLength = termAtt.length(); curGramSize = minGram; curPos = 0; + curPosInc = posIncAtt.getPositionIncrement(); + curPosLen = posLenAtt.getPositionLength(); tokStart = offsetAtt.startOffset(); tokEnd = offsetAtt.endOffset(); // if length by start + end offsets doesn't match the term text then assume @@ -89,20 +137,37 @@ public final class NGramTokenFilter extends TokenFilter { hasIllegalOffsets = (tokStart + curTermLength) != tokEnd; } } - while (curGramSize <= maxGram) { - while (curPos+curGramSize <= curTermLength) { // while there is input + if (version.onOrAfter(Version.LUCENE_44)) { + if (curGramSize > maxGram || curPos + curGramSize > curTermLength) { + ++curPos; + curGramSize = minGram; + } + if (curPos + curGramSize <= curTermLength) { clearAttributes(); termAtt.copyBuffer(curTermBuffer, curPos, curGramSize); - if (hasIllegalOffsets) { - offsetAtt.setOffset(tokStart, tokEnd); - } else { - offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize); - } - curPos++; + posIncAtt.setPositionIncrement(curPosInc); + curPosInc = 0; + posLenAtt.setPositionLength(curPosLen); + offsetAtt.setOffset(tokStart, tokEnd); + curGramSize++; return true; } - curGramSize++; // increase n-gram size - curPos = 0; + } else { + while (curGramSize <= maxGram) { + while (curPos+curGramSize <= curTermLength) { // while there is input + clearAttributes(); + termAtt.copyBuffer(curTermBuffer, curPos, curGramSize); + if (hasIllegalOffsets) { + offsetAtt.setOffset(tokStart, tokEnd); + } else { + offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize); + } + curPos++; + return true; + } + curGramSize++; // increase n-gram size + curPos = 0; + } } curTermBuffer = null; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java index 81ce887ea7b..a0665bf6f5f 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java @@ -17,64 +17,90 @@ package org.apache.lucene.analysis.ngram; * limitations under the License. */ -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.util.AttributeSource; - import java.io.IOException; import java.io.Reader; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.Version; + /** * Tokenizes the input into n-grams of the given size(s). + *

+ * <p>Unlike {@link NGramTokenFilter}, this class sets offsets so that the
+ * characters between startOffset and endOffset in the original stream are the
+ * same as the term chars.
+ * <p>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
+ * <table>
+ * <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
+ * <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
+ * <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
+ * <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
+ * </table>
+ *

+ * <p>Before Lucene 4.4, this class had a different behavior:
+ * <ul>
+ * <li>it could not handle more than 1024 chars of input,</li>
+ * <li>it trimmed trailing whitespaces,</li>
+ * <li>it emitted grams in a different order ((a, b, c, ab, bc) instead of
+ * (a, ab, b, bc, c)).</li>
+ * </ul>
+ * <p>
Although highly discouraged, it is still possible to use the old behavior + * through {@link Lucene43NGramTokenizer}. */ public final class NGramTokenizer extends Tokenizer { public static final int DEFAULT_MIN_NGRAM_SIZE = 1; public static final int DEFAULT_MAX_NGRAM_SIZE = 2; - private int minGram, maxGram; + private char[] buffer; + private int bufferStart, bufferEnd; // remaining slice of the buffer + private int offset; private int gramSize; - private int pos; - private int inLen; // length of the input AFTER trim() - private int charsRead; // length of the input - private String inStr; - private boolean started; - + private int minGram, maxGram; + private boolean exhausted; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); /** * Creates NGramTokenizer with given min and max n-grams. + * @param version the lucene compatibility version * @param input {@link Reader} holding the input to be tokenized * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ - public NGramTokenizer(Reader input, int minGram, int maxGram) { + public NGramTokenizer(Version version, Reader input, int minGram, int maxGram) { super(input); - init(minGram, maxGram); + init(version, minGram, maxGram); } /** * Creates NGramTokenizer with given min and max n-grams. + * @param version the lucene compatibility version * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use * @param input {@link Reader} holding the input to be tokenized * @param minGram the smallest n-gram to generate * @param maxGram the largest n-gram to generate */ - public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) { + public NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) { super(factory, input); - init(minGram, maxGram); + init(version, minGram, maxGram); } /** * Creates NGramTokenizer with default min and max n-grams. + * @param version the lucene compatibility version * @param input {@link Reader} holding the input to be tokenized */ - public NGramTokenizer(Reader input) { - this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); + public NGramTokenizer(Version version, Reader input) { + this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE); } - - private void init(int minGram, int maxGram) { + + private void init(Version version, int minGram, int maxGram) { + if (!version.onOrAfter(Version.LUCENE_44)) { + throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer"); + } if (minGram < 1) { throw new IllegalArgumentException("minGram must be greater than zero"); } @@ -83,73 +109,66 @@ public final class NGramTokenizer extends Tokenizer { } this.minGram = minGram; this.maxGram = maxGram; + buffer = new char[maxGram + 1024]; } /** Returns the next token in the stream, or null at EOS. 
*/ @Override public boolean incrementToken() throws IOException { clearAttributes(); - if (!started) { - started = true; - gramSize = minGram; - char[] chars = new char[1024]; - charsRead = 0; - // TODO: refactor to a shared readFully somewhere: - while (charsRead < chars.length) { - int inc = input.read(chars, charsRead, chars.length-charsRead); - if (inc == -1) { - break; - } - charsRead += inc; - } - inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings - if (charsRead == chars.length) { - // Read extra throwaway chars so that on end() we - // report the correct offset: - char[] throwaway = new char[1024]; - while(true) { - final int inc = input.read(throwaway, 0, throwaway.length); - if (inc == -1) { + // compact + if (bufferStart >= buffer.length - maxGram) { + System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart); + bufferEnd -= bufferStart; + bufferStart = 0; + + // fill in remaining space + if (!exhausted) { + // TODO: refactor to a shared readFully + while (bufferEnd < buffer.length) { + final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd); + if (read == -1) { + exhausted = true; break; } - charsRead += inc; + bufferEnd += read; } } - - inLen = inStr.length(); - if (inLen == 0) { - return false; - } } - if (pos+gramSize > inLen) { // if we hit the end of the string - pos = 0; // reset to beginning of string - gramSize++; // increase n-gram size - if (gramSize > maxGram) // we are done - return false; - if (pos+gramSize > inLen) - return false; + // should we go to the next offset? + if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) { + bufferStart++; + offset++; + gramSize = minGram; } - int oldPos = pos; - pos++; - termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize); - offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize)); + // are there enough chars remaining? 
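+    // (i.e. can a minGram-sized gram still start at bufferStart? if not, the
+    // buffer was already refilled as far as possible, so the input is
+    // exhausted and we are done)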
+ if (bufferStart + gramSize > bufferEnd) { + return false; + } + + termAtt.copyBuffer(buffer, bufferStart, gramSize); + posIncAtt.setPositionIncrement(1); + posLenAtt.setPositionLength(1); + offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize)); + ++gramSize; return true; } - + @Override public void end() { - // set final offset - final int finalOffset = correctOffset(charsRead); - this.offsetAtt.setOffset(finalOffset, finalOffset); - } - + final int endOffset = correctOffset(offset + bufferEnd - bufferStart); + offsetAtt.setOffset(endOffset, endOffset); + } + @Override public void reset() throws IOException { super.reset(); - started = false; - pos = 0; + bufferStart = bufferEnd = buffer.length; + offset = 0; + gramSize = minGram; + exhausted = false; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java index 1b702909a77..c74bb42f5f0 100755 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java @@ -18,8 +18,10 @@ package org.apache.lucene.analysis.ngram; */ import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.util.TokenizerFactory; import org.apache.lucene.util.AttributeSource.AttributeFactory; +import org.apache.lucene.util.Version; import java.io.Reader; import java.util.Map; @@ -49,7 +51,11 @@ public class NGramTokenizerFactory extends TokenizerFactory { /** Creates the {@link TokenStream} of n-grams from the given {@link Reader} and {@link AttributeFactory}. */ @Override - public NGramTokenizer create(AttributeFactory factory, Reader input) { - return new NGramTokenizer(factory, input, minGramSize, maxGramSize); + public Tokenizer create(AttributeFactory factory, Reader input) { + if (luceneMatchVersion.onOrAfter(Version.LUCENE_44)) { + return new NGramTokenizer(luceneMatchVersion, factory, input, minGramSize, maxGramSize); + } else { + return new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize); + } } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index a34d43d8692..b788c069ce0 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -54,8 +54,6 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter; -import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; import org.apache.lucene.analysis.ValidatingTokenFilter; import org.apache.lucene.analysis.charfilter.NormalizeCharMap; import org.apache.lucene.analysis.cjk.CJKBigramFilter; @@ -71,14 +69,14 @@ import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter; import org.apache.lucene.analysis.miscellaneous.KeepWordFilter; import org.apache.lucene.analysis.miscellaneous.LengthFilter; import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; +import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter; import 
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; +import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap; import org.apache.lucene.analysis.miscellaneous.TrimFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; -import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; -import org.apache.lucene.analysis.ngram.NGramTokenFilter; -import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.apache.lucene.analysis.ngram.Lucene43NGramTokenizer; import org.apache.lucene.analysis.path.PathHierarchyTokenizer; import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; import org.apache.lucene.analysis.payloads.IdentityEncoder; @@ -90,8 +88,9 @@ import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.th.ThaiWordFilter; import org.apache.lucene.analysis.util.CharArrayMap; import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.util.AttributeSource.AttributeFactory; +import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.AttributeSource.AttributeFactory; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.Rethrow; import org.apache.lucene.util.Version; @@ -162,9 +161,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase { // startOffset thats > its endOffset // (see LUCENE-3738 for a list of other offenders here) // broken! - NGramTokenizer.class, - // broken! - NGramTokenFilter.class, + Lucene43NGramTokenizer.class, // broken! EdgeNGramTokenizer.class, // broken! 
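For illustration only (not part of this patch): a minimal sketch of how the new
4.4 constructors and token order behave, based on the CHANGES.txt entries above.
The demo class NGramOrderDemo is hypothetical; the Lucene calls are the
constructors introduced in this patch plus the standard TokenStream consumer
workflow (reset/incrementToken/end/close).

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class NGramOrderDemo {

  private static void dump(TokenStream ts) throws Exception {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term + " (posInc=" + posInc.getPositionIncrement() + ")");
    }
    ts.end();
    ts.close();
  }

  public static void main(String[] args) throws Exception {
    // NGramTokenizer (4.4): grams are emitted by increasing start offset
    // first, then by increasing length: a, ab, b, bc, c -- no longer
    // a, b, c, ab, bc.
    dump(new NGramTokenizer(Version.LUCENE_44, new StringReader("abc"), 1, 2));

    // NGramTokenFilter (4.4): all grams of a token share its position (the
    // first gram has posInc=1, the rest posInc=0) and keep the original
    // token's offsets.
    dump(new NGramTokenFilter(Version.LUCENE_44,
        new WhitespaceTokenizer(Version.LUCENE_44, new StringReader("abc")), 1, 2));
  }
}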
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java index b2118472206..37db05d849e 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java @@ -26,7 +26,9 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.util.Version; +import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.Random; @@ -46,7 +48,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput() throws Exception { boolean gotException = false; try { - new NGramTokenFilter(input, 2, 1); + new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 1); } catch (IllegalArgumentException e) { gotException = true; } @@ -56,50 +58,64 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { public void testInvalidInput2() throws Exception { boolean gotException = false; try { - new NGramTokenFilter(input, 0, 1); + new NGramTokenFilter(TEST_VERSION_CURRENT, input, 0, 1); } catch (IllegalArgumentException e) { gotException = true; } assertTrue(gotException); } - + public void testUnigrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1); - assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}); + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 1); + assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); } public void testBigrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2); - assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}); + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 2, 2); + assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0}); } public void testNgrams() throws Exception { - NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3); + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3); assertTokenStreamContents(filter, - new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, - new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2}, - new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}, - null, null, null, null, false + new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"}, + new int[]{0,0,0,0,0,0,0,0,0,0,0,0}, + new int[]{5,5,5,5,5,5,5,5,5,5,5,5}, + null, + new int[]{1,0,0,0,0,0,0,0,0,0,0,0}, + null, null, false ); } - + + public void testNgramsNoIncrement() throws Exception { + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 1, 3); + assertTokenStreamContents(filter, + new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"}, + new int[]{0,0,0,0,0,0,0,0,0,0,0,0}, + new int[]{5,5,5,5,5,5,5,5,5,5,5,5}, + null, + new int[]{1,0,0,0,0,0,0,0,0,0,0,0}, + null, null, false + ); + } + public void testOversizedNgrams() throws Exception { - NGramTokenFilter filter = new 
NGramTokenFilter(input, 6, 7); + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 6, 7); assertTokenStreamContents(filter, new String[0], new int[0], new int[0]); } public void testSmallTokenInStream() throws Exception { input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false); - NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3); - assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}); + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, input, 3, 3); + assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2}); } public void testReset() throws Exception { WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde")); - NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1); - assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}); + NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 1, 1); + assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); tokenizer.setReader(new StringReader("abcde")); - assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}); + assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0}); } // LUCENE-3642 @@ -112,14 +128,15 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); TokenFilter filters = new ASCIIFoldingFilter(tokenizer); - filters = new NGramTokenFilter(filters, 2, 2); + filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2); return new TokenStreamComponents(tokenizer, filters); } }; assertAnalyzesTo(analyzer, "mosfellsbær", new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, - new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 }); + new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 }, + new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }); } /** blast some random strings through the analyzer */ @@ -129,7 +146,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(tokenizer, - new NGramTokenFilter(tokenizer, 2, 4)); + new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4)); } }; checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false); @@ -142,9 +159,22 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase { protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new KeywordTokenizer(reader); return new TokenStreamComponents(tokenizer, - new NGramTokenFilter(tokenizer, 2, 15)); + new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 15)); } }; checkAnalysisConsistency(random, a, random.nextBoolean(), ""); } + + public void testLucene43() throws IOException { + NGramTokenFilter filter = new NGramTokenFilter(Version.LUCENE_43, input, 2, 3); + 
assertTokenStreamContents(filter, + new String[]{"ab","bc","cd","de","abc","bcd","cde"}, + new int[]{0,1,2,3,0,1,2}, + new int[]{2,3,4,5,3,4,5}, + null, + new int[]{1,1,1,1,1,1,1}, + null, null, false + ); + } + } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java index d4ac43f2814..f56f41309a2 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java @@ -18,13 +18,21 @@ package org.apache.lucene.analysis.ngram; */ +import java.io.IOException; import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.util.LuceneTestCase.Slow; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util._TestUtil; + +import com.carrotsearch.randomizedtesting.generators.RandomStrings; /** * Tests {@link NGramTokenizer} for correctness. @@ -41,7 +49,7 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { public void testInvalidInput() throws Exception { boolean gotException = false; try { - new NGramTokenizer(input, 2, 1); + new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 1); } catch (IllegalArgumentException e) { gotException = true; } @@ -51,7 +59,7 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { public void testInvalidInput2() throws Exception { boolean gotException = false; try { - new NGramTokenizer(input, 0, 1); + new NGramTokenizer(TEST_VERSION_CURRENT, input, 0, 1); } catch (IllegalArgumentException e) { gotException = true; } @@ -59,21 +67,21 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { } public void testUnigrams() throws Exception { - NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1); + NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1); assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */); } public void testBigrams() throws Exception { - NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2); + NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 2); assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */); } public void testNgrams() throws Exception { - NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3); + NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3); assertTokenStreamContents(tokenizer, - new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, - new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2}, - new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}, + new String[]{"a","ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e"}, + new int[]{0,0,0,1,1,1,2,2,2,3,3,4}, + new int[]{1,2,3,2,3,4,3,4,5,4,5,5}, null, null, null, @@ -83,12 +91,12 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { } public void 
testOversizedNgrams() throws Exception { - NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7); + NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 6, 7); assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */); } public void testReset() throws Exception { - NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1); + NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1); assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */); tokenizer.setReader(new StringReader("abcde")); assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */); @@ -99,11 +107,48 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase { Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new NGramTokenizer(reader, 2, 4); + Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4); return new TokenStreamComponents(tokenizer, tokenizer); } }; checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false); checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false); } + + private void testNGrams(int minGram, int maxGram, int length) throws IOException { + final String s = RandomStrings.randomAsciiOfLength(random(), length); + final TokenStream grams = new NGramTokenizer(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram); + final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class); + final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class); + final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class); + final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class); + grams.reset(); + for (int start = 0; start < s.length(); ++start) { + for (int end = start + minGram; end <= start + maxGram && end <= s.length(); ++end) { + assertTrue(grams.incrementToken()); + assertEquals(s.substring(start, end), termAtt.toString()); + assertEquals(1, posIncAtt.getPositionIncrement()); + assertEquals(start, offsetAtt.startOffset()); + assertEquals(end, offsetAtt.endOffset()); + } + } + grams.end(); + assertEquals(s.length(), offsetAtt.startOffset()); + assertEquals(s.length(), offsetAtt.endOffset()); + } + + public void testLargeInput() throws IOException { + // test sliding + final int minGram = _TestUtil.nextInt(random(), 1, 100); + final int maxGram = _TestUtil.nextInt(random(), minGram, 100); + testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024)); + } + + public void testLargeMaxGram() throws IOException { + // test sliding with maxGram > 1024 + final int minGram = _TestUtil.nextInt(random(), 1200, 1300); + final int maxGram = _TestUtil.nextInt(random(), minGram, 1300); + testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024)); + } + } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java index a4ea1a2d09c..2256af6ba92 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java @@ -35,7 +35,7 @@ public class TestNGramFilters 
extends BaseTokenStreamFactoryTestCase { Reader reader = new StringReader("test"); TokenStream stream = tokenizerFactory("NGram").create(reader); assertTokenStreamContents(stream, - new String[] { "t", "e", "s", "t", "te", "es", "st" }); + new String[] { "t", "te", "e", "es", "s", "st", "t" }); } /** @@ -47,7 +47,7 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { "minGramSize", "2", "maxGramSize", "3").create(reader); assertTokenStreamContents(stream, - new String[] { "te", "es", "st", "tes", "est" }); + new String[] { "te", "tes", "es", "est", "st" }); } /** @@ -58,7 +58,7 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); stream = tokenFilterFactory("NGram").create(stream); assertTokenStreamContents(stream, - new String[] { "t", "e", "s", "t", "te", "es", "st" }); + new String[] { "t", "te", "e", "es", "s", "st", "t" }); } /** @@ -71,7 +71,7 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase { "minGramSize", "2", "maxGramSize", "3").create(stream); assertTokenStreamContents(stream, - new String[] { "te", "es", "st", "tes", "est" }); + new String[] { "te", "tes", "es", "est", "st" }); } /**