LUCENE-5042: Fix the n-gram tokenizers and filters.

This commit fixes n-gram tokenizers and filters so that they handle
supplementary characters correctly and adds the ability to pre-tokenize the
stream in tokenizers.


git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1492185 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2013-06-12 13:17:49 +00:00
parent a4d58b6f22
commit e021451d6c
21 changed files with 576 additions and 331 deletions

View File

@ -47,6 +47,10 @@ Changes in backwards compatibility policy
(a, ab, b, bc, c) instead of (a, b, c, ab, bc) and doesn't trim trailing
whitespaces. (Adrien Grand)
* LUCENE-5042: The n-gram and edge n-gram tokenizers and filters now correctly
handle supplementary characters, and the tokenizers have the ability to
pre-tokenize the input stream similarly to CharTokenizer. (Adrien Grand)
* LUCENE-4967: NRTManager is replaced by
ControlledRealTimeReopenThread, for controlling which requests must
see which indexing changes, so that it can work with any

View File

@ -57,7 +57,7 @@ public final class GreekLowerCaseFilter extends TokenFilter {
int chLen = termAtt.length();
for (int i = 0; i < chLen;) {
i += Character.toChars(
lowerCase(charUtils.codePointAt(chArray, i)), chArray, i);
lowerCase(charUtils.codePointAt(chArray, i, chLen)), chArray, i);
}
return true;
} else {

View File

@ -25,21 +25,26 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
* Tokenizes the given token into n-grams of given size(s).
* <p>
* This {@link TokenFilter} creates n-grams from the beginning edge of an input token.
* <p><a name="match_version" />As of Lucene 4.4, this filter correctly handles
* supplementary characters.
*/
public final class EdgeNGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
private final CharacterUtils charUtils;
private final int minGram;
private final int maxGram;
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curGramSize;
private int tokStart;
private int tokEnd; // only used if the length changed before this filter
@ -74,6 +79,9 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
this.charUtils = version.onOrAfter(Version.LUCENE_44)
? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance();
this.minGram = minGram;
this.maxGram = maxGram;
}
@ -87,6 +95,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
@ -95,7 +104,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
}
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
if (curGramSize <= curTermLength) { // if the remaining input is too short, we can't generate any n-grams
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
clearAttributes();
offsetAtt.setOffset(tokStart, tokEnd);
@ -107,7 +116,8 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
posIncrAtt.setPositionIncrement(0);
}
posLenAtt.setPositionLength(savePosLen);
termAtt.copyBuffer(curTermBuffer, 0, curGramSize);
final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
return true;
}
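
The following is a hedged usage sketch of the fixed filter, not part of this diff; the demo class name, input string, and gram sizes are illustrative only. It shows that edge grams are now counted in code points, so a surrogate pair is never split:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class EdgeNGramSupplementaryDemo {
  public static void main(String[] args) throws Exception {
    // "a" + U+1041C (one supplementary code point, two chars) + "b": 3 code points, 4 chars
    TokenStream ts = new KeywordTokenizer(new StringReader("a\uD801\uDC1Cb"));
    ts = new EdgeNGramTokenFilter(Version.LUCENE_44, ts, 1, 3);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // prints "a", then "a" + U+1041C, then "a" + U+1041C + "b"
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}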

View File

@ -17,37 +17,23 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Version;
/**
* Tokenizes the input from an edge into n-grams of given size(s).
* <p>
* This {@link Tokenizer} creates n-grams from the beginning edge of an input token.
* <p><a name="match_version" />As of Lucene 4.4, this class supports
* {@link #isTokenChar(int) pre-tokenization} and correctly handles
* supplementary characters.
*/
public final class EdgeNGramTokenizer extends Tokenizer {
public class EdgeNGramTokenizer extends NGramTokenizer {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private int minGram;
private int maxGram;
private int gramSize;
private boolean started;
private int inLen; // length of the input AFTER trim()
private int charsRead; // length of the input
private String inStr;
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
@ -57,8 +43,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
super(input);
init(version, minGram, maxGram);
super(version, input, minGram, maxGram, true);
}
/**
@ -71,102 +56,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
super(factory, input);
init(version, minGram, maxGram);
super(version, factory, input, minGram, maxGram, true);
}
private void init(Version version, int minGram, int maxGram) {
if (version == null) {
throw new IllegalArgumentException("version must not be null");
}
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
if (minGram > maxGram) {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
this.minGram = minGram;
this.maxGram = maxGram;
}
/** Returns the next token in the stream, or null at EOS. */
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
// if we are just starting, read the whole input
if (!started) {
started = true;
gramSize = minGram;
char[] chars = new char[Math.min(1024, maxGram)];
charsRead = 0;
// TODO: refactor to a shared readFully somewhere:
boolean exhausted = false;
while (charsRead < maxGram) {
final int inc = input.read(chars, charsRead, chars.length-charsRead);
if (inc == -1) {
exhausted = true;
break;
}
charsRead += inc;
if (charsRead == chars.length && charsRead < maxGram) {
chars = ArrayUtil.grow(chars);
}
}
inStr = new String(chars, 0, charsRead);
if (!exhausted) {
// Read extra throwaway chars so that on end() we
// report the correct offset:
char[] throwaway = new char[1024];
while(true) {
final int inc = input.read(throwaway, 0, throwaway.length);
if (inc == -1) {
break;
}
charsRead += inc;
}
}
inLen = inStr.length();
if (inLen == 0) {
return false;
}
posIncrAtt.setPositionIncrement(1);
} else {
posIncrAtt.setPositionIncrement(1);
}
// if the remaining input is too short, we can't generate any n-grams
if (gramSize > inLen) {
return false;
}
// if we have hit the end of our n-gram size range, quit
if (gramSize > maxGram || gramSize > inLen) {
return false;
}
// grab gramSize chars from front or back
termAtt.setEmpty().append(inStr, 0, gramSize);
offsetAtt.setOffset(correctOffset(0), correctOffset(gramSize));
gramSize++;
return true;
}
@Override
public void end() {
// set final offset
final int finalOffset = correctOffset(charsRead);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset() throws IOException {
super.reset();
started = false;
}
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
@ -33,6 +34,7 @@ import org.apache.lucene.util.Version;
* <a name="version"/>
* <p>You must specify the required {@link Version} compatibility when
* creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filter:<ul>
* <li>handles supplementary characters correctly,</li>
* <li>emits all n-grams for the same token at the same position,</li>
* <li>does not modify offsets,</li>
* <li>sorts n-grams by their offset in the original token first, then
@ -42,6 +44,10 @@ import org.apache.lucene.util.Version;
* {@link Version#LUCENE_44} in the constructor but this is not recommended as
* it will lead to broken {@link TokenStream}s that will cause highlighting
* bugs.
* <p>If you were using this {@link TokenFilter} to perform partial highlighting,
* this won't work anymore since this filter doesn't update offsets. You should
* modify your analysis chain to use {@link NGramTokenizer}, and potentially
* override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
*/
public final class NGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
@ -51,6 +57,7 @@ public final class NGramTokenFilter extends TokenFilter {
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curGramSize;
private int curPos;
private int curPosInc, curPosLen;
@ -59,6 +66,7 @@ public final class NGramTokenFilter extends TokenFilter {
private boolean hasIllegalOffsets; // only if the length changed before this filter
private final Version version;
private final CharacterUtils charUtils;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLenAtt;
@ -75,6 +83,9 @@ public final class NGramTokenFilter extends TokenFilter {
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
this.version = version;
this.charUtils = version.onOrAfter(Version.LUCENE_44)
? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@ -126,6 +137,7 @@ public final class NGramTokenFilter extends TokenFilter {
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
@ -138,13 +150,15 @@ public final class NGramTokenFilter extends TokenFilter {
}
}
if (version.onOrAfter(Version.LUCENE_44)) {
if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
++curPos;
curGramSize = minGram;
}
if (curPos + curGramSize <= curTermLength) {
if ((curPos + curGramSize) <= curCodePointCount) {
clearAttributes();
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;
posLenAtt.setPositionLength(curPosLen);
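
As a hedged illustration of the 4.4 behavior described in the Javadoc above (all grams of a token at the same position, sorted by start offset), here is a small sketch that is not part of the commit; the demo class name and input are made up:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class NGramOrderDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new KeywordTokenizer(new StringReader("abc"));
    ts = new NGramTokenFilter(Version.LUCENE_44, ts, 1, 2);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // emits "a", "ab", "b", "bc", "c": sorted by start offset first, then by length,
      // with a position increment of 1 for the first gram and 0 for the rest
      System.out.println(term + " posInc=" + posInc.getPositionIncrement());
    }
    ts.end();
    ts.close();
  }
}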

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
@ -40,29 +41,47 @@ import org.apache.lucene.util.Version;
* <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
* </table>
* <a name="version"/>
* <p>Before Lucene 4.4, this class had a different behavior:<ul>
* <li>It didn't support more than 1024 chars of input, the rest was trashed.</li>
* <li>The last whitespaces of the 1024 chars block were trimmed.</li>
* <li>Tokens were emitted in a different order (by increasing lengths).</li></ul>
* <p>Although highly discouraged, it is still possible to use the old behavior
* through {@link Lucene43NGramTokenizer}.
* <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
* <li>tokenize in a streaming fashion to support streams which are larger
* than 1024 chars (limit of the previous version),
* <li>count grams based on unicode code points instead of java chars (and
* never split in the middle of surrogate pairs),
* <li>give the ability to {@link #isTokenChar(int) pre-tokenize} the stream
* before computing n-grams.</ul>
* <p>Additionally, this class doesn't trim trailing whitespace and emits
* tokens in a different order: tokens are now emitted by increasing start
* offsets, while they used to be emitted by increasing lengths (which prevented
* supporting large input streams).
* <p>Although <b style="color:red">highly</b> discouraged, it is still possible
* to use the old behavior through {@link Lucene43NGramTokenizer}.
*/
public final class NGramTokenizer extends Tokenizer {
// non-final to allow for overriding isTokenChar, but all other methods should be final
public class NGramTokenizer extends Tokenizer {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
private char[] buffer;
private int bufferStart, bufferEnd; // remaining slice of the buffer
private CharacterUtils charUtils;
private CharacterUtils.CharacterBuffer charBuffer;
private int[] buffer; // like charBuffer, but converted to code points
private int bufferStart, bufferEnd; // remaining slice in buffer
private int offset;
private int gramSize;
private int minGram, maxGram;
private boolean exhausted;
private int lastCheckedChar; // last offset in the buffer that we checked
private int lastNonTokenChar; // last offset that we found to not be a token char
private boolean edgesOnly; // leading edges n-grams only
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
NGramTokenizer(Version version, Reader input, int minGram, int maxGram, boolean edgesOnly) {
super(input);
init(version, minGram, maxGram, edgesOnly);
}
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param version the lucene compatibility <a href="#version">version</a>
@ -71,8 +90,12 @@ public final class NGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
super(input);
init(version, minGram, maxGram);
this(version, input, minGram, maxGram, false);
}
NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram, boolean edgesOnly) {
super(factory, input);
init(version, minGram, maxGram, edgesOnly);
}
/**
@ -84,8 +107,7 @@ public final class NGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
super(factory, input);
init(version, minGram, maxGram);
this(version, factory, input, minGram, maxGram, false);
}
/**
@ -97,10 +119,13 @@ public final class NGramTokenizer extends Tokenizer {
this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}
private void init(Version version, int minGram, int maxGram) {
if (!version.onOrAfter(Version.LUCENE_44)) {
private void init(Version version, int minGram, int maxGram, boolean edgesOnly) {
if (!edgesOnly && !version.onOrAfter(Version.LUCENE_44)) {
throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
}
charUtils = version.onOrAfter(Version.LUCENE_44)
? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@ -109,64 +134,104 @@ public final class NGramTokenizer extends Tokenizer {
}
this.minGram = minGram;
this.maxGram = maxGram;
buffer = new char[maxGram + 1024];
this.edgesOnly = edgesOnly;
charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
buffer = new int[charBuffer.getBuffer().length];
// Make the term att large enough
termAtt.resizeBuffer(2 * maxGram);
}
/** Returns the next token in the stream, or null at EOS. */
@Override
public boolean incrementToken() throws IOException {
public final boolean incrementToken() throws IOException {
clearAttributes();
// compact
if (bufferStart >= buffer.length - maxGram) {
System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
bufferEnd -= bufferStart;
bufferStart = 0;
// termination of this loop is guaranteed by the fact that every iteration
// either advances the buffer (calls consume()) or increases gramSize
while (true) {
// compact
if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) {
System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
bufferEnd -= bufferStart;
lastCheckedChar -= bufferStart;
lastNonTokenChar -= bufferStart;
bufferStart = 0;
// fill in remaining space
if (!exhausted) {
// TODO: refactor to a shared readFully
while (bufferEnd < buffer.length) {
final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd);
if (read == -1) {
exhausted = true;
break;
}
bufferEnd += read;
// fill in remaining space
exhausted = !charUtils.fill(charBuffer, input, buffer.length - bufferEnd);
// convert to code points
bufferEnd += charUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
}
// should we go to the next offset?
if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) {
if (bufferStart + 1 + minGram > bufferEnd) {
assert exhausted;
return false;
}
consume();
gramSize = minGram;
}
updateLastNonTokenChar();
// retry if the token to be emitted was going to not only contain token chars
final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) {
consume();
gramSize = minGram;
continue;
}
final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
termAtt.setLength(length);
posIncAtt.setPositionIncrement(1);
posLenAtt.setPositionLength(1);
offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
++gramSize;
return true;
}
}
private void updateLastNonTokenChar() {
final int termEnd = bufferStart + gramSize - 1;
if (termEnd > lastCheckedChar) {
for (int i = termEnd; i > lastCheckedChar; --i) {
if (!isTokenChar(buffer[i])) {
lastNonTokenChar = i;
break;
}
}
lastCheckedChar = termEnd;
}
}
// should we go to the next offset?
if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
bufferStart++;
offset++;
gramSize = minGram;
}
/** Consume one code point. */
private void consume() {
offset += Character.charCount(buffer[bufferStart++]);
}
// are there enough chars remaining?
if (bufferStart + gramSize > bufferEnd) {
return false;
}
termAtt.copyBuffer(buffer, bufferStart, gramSize);
posIncAtt.setPositionIncrement(1);
posLenAtt.setPositionLength(1);
offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize));
++gramSize;
/** Only collect characters which satisfy this condition. */
protected boolean isTokenChar(int chr) {
return true;
}
@Override
public void end() {
final int endOffset = correctOffset(offset + bufferEnd - bufferStart);
public final void end() {
assert bufferStart <= bufferEnd;
int endOffset = offset;
for (int i = bufferStart; i < bufferEnd; ++i) {
endOffset += Character.charCount(buffer[i]);
}
endOffset = correctOffset(endOffset);
offsetAtt.setOffset(endOffset, endOffset);
}
@Override
public void reset() throws IOException {
public final void reset() throws IOException {
super.reset();
bufferStart = bufferEnd = buffer.length;
lastNonTokenChar = lastCheckedChar = bufferStart - 1;
offset = 0;
gramSize = minGram;
exhausted = false;
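
Because isTokenChar is now a protected hook, pre-tokenization works by subclassing, much like CharTokenizer. A minimal sketch under that assumption (the subclass name is hypothetical and not part of the commit):

import java.io.Reader;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical subclass: grams are only built from runs of letters or digits,
// so they never span whitespace or punctuation.
public class AlnumNGramTokenizer extends NGramTokenizer {
  public AlnumNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
    super(version, input, minGram, maxGram);
  }

  @Override
  protected boolean isTokenChar(int chr) {
    return Character.isLetterOrDigit(chr);
  }
}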

View File

@ -57,7 +57,7 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
final char[] buffer = termAtt.buffer();
int length = termAtt.length();
for (int i = 0; i < length;) {
final int ch = Character.codePointAt(buffer, i);
final int ch = Character.codePointAt(buffer, i, length);
iOrAfter = (ch == LATIN_CAPITAL_LETTER_I ||
(iOrAfter && Character.getType(ch) == Character.NON_SPACING_MARK));
@ -100,7 +100,7 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
*/
private boolean isBeforeDot(char s[], int pos, int len) {
for (int i = pos; i < len;) {
final int ch = Character.codePointAt(s, i);
final int ch = Character.codePointAt(s, i, len);
if (Character.getType(ch) != Character.NON_SPACING_MARK)
return false;
if (ch == COMBINING_DOT_ABOVE)

View File

@ -262,7 +262,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
if (ignoreCase) {
for(int i=0;i<len;) {
final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
return false;
i += Character.charCount(codePointAt);
}
@ -282,7 +282,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
if (ignoreCase) {
for(int i=0;i<len;) {
final int codePointAt = charUtils.codePointAt(text1, i);
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
return false;
i += Character.charCount(codePointAt);
}

View File

@ -100,7 +100,8 @@ public abstract class CharTokenizer extends Tokenizer {
while (true) {
if (bufferIndex >= dataLen) {
offset += dataLen;
if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
if (ioBuffer.getLength() == 0) {
dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0) {
break;
@ -113,7 +114,7 @@ public abstract class CharTokenizer extends Tokenizer {
bufferIndex = 0;
}
// use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
final int charCount = Character.charCount(c);
bufferIndex += charCount;

View File

@ -51,27 +51,6 @@ public abstract class CharacterUtils {
return JAVA_4;
}
/**
* Returns the code point at the given index of the char array.
* Depending on the {@link Version} passed to
* {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
* of {@link Character#codePointAt(char[], int)} as it would have been
* available on a Java 1.4 JVM or on a later virtual machine version.
*
* @param chars
* a character array
* @param offset
* the offset to the char values in the chars array to be converted
*
* @return the Unicode code point at the given index
* @throws NullPointerException
* - if the array is null.
* @throws IndexOutOfBoundsException
* - if the value offset is negative or not less than the length of
* the char array.
*/
public abstract int codePointAt(final char[] chars, final int offset);
/**
* Returns the code point at the given index of the {@link CharSequence}.
* Depending on the {@link Version} passed to
@ -116,7 +95,10 @@ public abstract class CharacterUtils {
* the char array.
*/
public abstract int codePointAt(final char[] chars, final int offset, final int limit);
/** Return the number of code points in <code>seq</code>. */
public abstract int codePointCount(CharSequence seq);
/**
* Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
* of the given bufferSize.
@ -140,53 +122,101 @@ public abstract class CharacterUtils {
* @param offset the offset to start at
* @param limit the max char in the buffer to lower case
*/
public void toLowerCase(final char[] buffer, final int offset, final int limit) {
public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
assert buffer.length >= limit;
assert offset >= 0 && offset <= buffer.length;
for (int i = offset; i < limit;) {
i += Character.toChars(
Character.toLowerCase(
codePointAt(buffer, i)), buffer, i);
codePointAt(buffer, i, limit)), buffer, i);
}
}
/** Converts a sequence of Java characters to a sequence of unicode code points.
* @return the number of code points written to the destination buffer */
public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
int codePointCount = 0;
for (int i = 0; i < srcLen; ) {
final int cp = codePointAt(src, srcOff + i, srcOff + srcLen);
final int charCount = Character.charCount(cp);
dest[destOff + codePointCount++] = cp;
i += charCount;
}
return codePointCount;
}
/** Converts a sequence of unicode code points to a sequence of Java characters.
* @return the number of chars written to the destination buffer */
public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
int written = 0;
for (int i = 0; i < srcLen; ++i) {
written += Character.toChars(src[srcOff + i], dest, destOff + written);
}
return written;
}
/**
* Fills the {@link CharacterBuffer} with characters read from the given
* reader {@link Reader}. This method tries to read as many characters into
* the {@link CharacterBuffer} as possible, each call to fill will start
* filling the buffer from offset <code>0</code> up to the length of the size
* of the internal character array.
* reader {@link Reader}. This method tries to read <code>numChars</code>
* characters into the {@link CharacterBuffer}, each call to fill will start
* filling the buffer from offset <code>0</code> up to <code>numChars</code>.
* In case code points can span across 2 java characters, this method may
* only fill <code>numChars - 1</code> characters in order not to split in
* the middle of a surrogate pair, even if there are remaining characters in
* the {@link Reader}.
* <p>
* Depending on the {@link Version} passed to
* {@link CharacterUtils#getInstance(Version)} this method implements
* supplementary character awareness when filling the given buffer. For all
* {@link Version} &gt; 3.0 {@link #fill(CharacterBuffer, Reader)} guarantees
* {@link Version} &gt; 3.0 {@link #fill(CharacterBuffer, Reader, int)} guarantees
* that the given {@link CharacterBuffer} will never contain a high surrogate
* character as the last element in the buffer unless it is the last available
* character in the reader. In other words, high and low surrogate pairs will
* always be preserved across buffer borders.
* </p>
* <p>
* A return value of <code>false</code> means that this method call exhausted
* the reader, but there may be some chars which have been read, which can be
* verified by checking whether <code>buffer.getLength() &gt; 0</code>.
* </p>
*
* @param buffer
* the buffer to fill.
* @param reader
* the reader to read characters from.
* @return <code>true</code> if and only if no more characters are available
* in the reader, otherwise <code>false</code>.
* @param numChars
* the number of chars to read
* @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
* @throws IOException
* if the reader throws an {@link IOException}.
*/
public abstract boolean fill(CharacterBuffer buffer, Reader reader) throws IOException;
public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;
/** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
return fill(buffer, reader, buffer.buffer.length);
}
/** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
* code points from <code>index</code>. */
public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);
static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
int read = 0;
while (read < len) {
final int r = reader.read(dest, offset + read, len - read);
if (r == -1) {
break;
}
read += r;
}
return read;
}
private static final class Java5CharacterUtils extends CharacterUtils {
Java5CharacterUtils() {
}
@Override
public int codePointAt(final char[] chars, final int offset) {
return Character.codePointAt(chars, offset);
}
@Override
public int codePointAt(final CharSequence seq, final int offset) {
return Character.codePointAt(seq, offset);
@ -198,7 +228,11 @@ public abstract class CharacterUtils {
}
@Override
public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
assert buffer.buffer.length >= 2;
if (numChars < 2 || numChars > buffer.buffer.length) {
throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
}
final char[] charBuffer = buffer.buffer;
buffer.offset = 0;
final int offset;
@ -206,47 +240,36 @@ public abstract class CharacterUtils {
// Install the previously saved ending high surrogate:
if (buffer.lastTrailingHighSurrogate != 0) {
charBuffer[0] = buffer.lastTrailingHighSurrogate;
buffer.lastTrailingHighSurrogate = 0;
offset = 1;
} else {
offset = 0;
}
final int read = reader.read(charBuffer,
offset,
charBuffer.length - offset);
if (read == -1) {
buffer.length = offset;
buffer.lastTrailingHighSurrogate = 0;
return offset != 0;
}
assert read > 0;
buffer.length = read + offset;
final int read = readFully(reader, charBuffer, offset, numChars - offset);
// If we read only a single char, and that char was a
// high surrogate, read again:
if (buffer.length == 1
&& Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
final int read2 = reader.read(charBuffer,
1,
charBuffer.length - 1);
if (read2 == -1) {
// NOTE: mal-formed input (ended on a high
// surrogate)! Consumer must deal with it...
return true;
}
assert read2 > 0;
buffer.length += read2;
buffer.length = offset + read;
final boolean result = buffer.length == numChars;
if (buffer.length < numChars) {
// We failed to fill the buffer. Even if the last char is a high
// surrogate, there is nothing we can do
return result;
}
if (buffer.length > 1
&& Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
} else {
buffer.lastTrailingHighSurrogate = 0;
}
return result;
}
return true;
@Override
public int codePointCount(CharSequence seq) {
return Character.codePointCount(seq, 0, seq.length());
}
@Override
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
return Character.offsetByCodePoints(buf, start, count, index, offset);
}
}
@ -254,11 +277,6 @@ public abstract class CharacterUtils {
Java4CharacterUtils() {
}
@Override
public int codePointAt(final char[] chars, final int offset) {
return chars[offset];
}
@Override
public int codePointAt(final CharSequence seq, final int offset) {
return seq.charAt(offset);
@ -272,13 +290,31 @@ public abstract class CharacterUtils {
}
@Override
public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
throws IOException {
assert buffer.buffer.length >= 1;
if (numChars < 1 || numChars > buffer.buffer.length) {
throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
}
buffer.offset = 0;
final int read = reader.read(buffer.buffer);
if(read == -1)
return false;
final int read = readFully(reader, buffer.buffer, 0, numChars);
buffer.length = read;
return true;
buffer.lastTrailingHighSurrogate = 0;
return read == numChars;
}
@Override
public int codePointCount(CharSequence seq) {
return seq.length();
}
@Override
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
final int result = index + offset;
if (result < 0 || result > count) {
throw new IndexOutOfBoundsException();
}
return result;
}
}
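
A hedged sketch of the new CharacterUtils helpers shown above (the demo class and literal values are illustrative, not part of the commit):

import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

public class CharacterUtilsDemo {
  public static void main(String[] args) {
    CharacterUtils utils = CharacterUtils.getInstance(Version.LUCENE_44);
    String s = "a\uD801\uDC1Cb"; // 4 chars, 3 code points
    System.out.println(utils.codePointCount(s)); // 3

    // round-trip through the new int[] code point representation
    char[] chars = s.toCharArray();
    int[] codePoints = new int[chars.length];
    int cpCount = utils.toCodePoints(chars, 0, chars.length, codePoints, 0); // 3
    char[] restored = new char[chars.length];
    int charCount = utils.toChars(codePoints, 0, cpCount, restored, 0); // 4
    System.out.println(cpCount + " code points, " + charCount + " chars");
  }
}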

View File

@ -170,8 +170,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
int length = highSurEndingLower.length();
assertEquals('\ud801', termBuffer[length - 1]);
assertEquals('\udc3e', termBuffer[length]);
}
public void testLowerCaseTokenizer() throws IOException {

View File

@ -78,7 +78,7 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
char[] charArray = randomRealisticUnicodeString.toCharArray();
StringBuilder builder = new StringBuilder();
for (int j = 0; j < charArray.length;) {
int cp = Character.codePointAt(charArray, j);
int cp = Character.codePointAt(charArray, j, charArray.length);
if (!Character.isWhitespace(cp)) {
builder.appendCodePoint(cp);
}

View File

@ -32,8 +32,10 @@ import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;
/**
* Tests {@link EdgeNGramTokenFilter} for correctness.
@ -192,9 +194,9 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testGraphs() throws IOException {
TokenStream tk = new LetterTokenizer(Version.LUCENE_44, new StringReader("abc d efgh ij klmno p q"));
TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
tk = new ShingleFilter(tk);
tk = new EdgeNGramTokenFilter(Version.LUCENE_44, tk, 7, 10);
tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
tk.reset();
assertTokenStreamContents(tk,
new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
@ -205,4 +207,25 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
23
);
}
public void testSupplementaryCharacters() throws IOException {
final String s = _TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = _TestUtil.nextInt(random(), 1, 3);
final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
TokenStream tk = new KeywordTokenizer(new StringReader(s));
tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int end = Character.offsetByCodePoints(s, 0, i);
assertEquals(s.substring(0, end), termAtt.toString());
}
assertFalse(tk.incrementToken());
}
}

View File

@ -21,15 +21,15 @@ package org.apache.lucene.analysis.ngram;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util._TestUtil;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
/**
* Tests {@link EdgeNGramTokenizer} for correctness.
*/
@ -120,25 +120,60 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
false);
}
private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {
final String s = RandomStrings.randomAsciiOfLength(random(), length);
testNGrams(minGram, maxGram, s, nonTokenChars);
}
private static void testNGrams(int minGram, int maxGram, String s, String nonTokenChars) throws IOException {
NGramTokenizerTest.testNGrams(minGram, maxGram, s, nonTokenChars, true);
}
public void testLargeInput() throws IOException {
final String input = _TestUtil.randomSimpleString(random(), 1024 * 5);
final int minGram = _TestUtil.nextInt(random(), 1, 1024);
final int maxGram = _TestUtil.nextInt(random(), minGram, 5 * 1024);
EdgeNGramTokenizer tk = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, new StringReader(input), minGram, maxGram);
final CharTermAttribute charTermAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
final PositionIncrementAttribute posIncAtt = tk.addAttribute(PositionIncrementAttribute.class);
tk.reset();
for (int i = minGram; i <= maxGram && i <= input.length(); ++i) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(i, offsetAtt.endOffset());
assertEquals(1, posIncAtt.getPositionIncrement());
assertEquals(input.substring(0, i), charTermAtt.toString());
// test sliding
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
}
public void testLargeMaxGram() throws IOException {
// test sliding with maxGram > 1024
final int minGram = _TestUtil.nextInt(random(), 1290, 1300);
final int maxGram = _TestUtil.nextInt(random(), minGram, 1300);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
}
public void testPreTokenization() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "a");
}
public void testHeavyPreTokenization() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "abcdef");
}
public void testFewTokenChars() throws IOException {
final char[] chrs = new char[_TestUtil.nextInt(random(), 4000, 5000)];
Arrays.fill(chrs, ' ');
for (int i = 0; i < chrs.length; ++i) {
if (random().nextFloat() < 0.1) {
chrs[i] = 'a';
}
}
assertFalse(tk.incrementToken());
tk.end();
assertEquals(input.length(), offsetAtt.startOffset());
final int minGram = _TestUtil.nextInt(random(), 1, 2);
final int maxGram = _TestUtil.nextInt(random(), minGram, 2);
testNGrams(minGram, maxGram, new String(chrs), " ");
}
public void testFullUTF8Range() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
final String s = _TestUtil.randomUnicodeString(random(), 4 * 1024);
testNGrams(minGram, maxGram, s, "");
testNGrams(minGram, maxGram, s, "abcdef");
}
}

View File

@ -26,7 +26,10 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;
import java.io.IOException;
import java.io.Reader;
@ -177,4 +180,27 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
);
}
public void testSupplementaryCharacters() throws IOException {
final String s = _TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = _TestUtil.nextInt(random(), 1, 3);
final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
TokenStream tk = new KeywordTokenizer(new StringReader(s));
tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
for (int start = 0; start < codePointCount; ++start) {
for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int startIndex = Character.offsetByCodePoints(s, 0, start);
final int endIndex = Character.offsetByCodePoints(s, 0, end);
assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
}
}
assertFalse(tk.incrementToken());
}
}

View File

@ -18,9 +18,12 @@ package org.apache.lucene.analysis.ngram;
*/
import static org.apache.lucene.analysis.ngram.NGramTokenizerTest.isTokenChar;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -115,23 +118,74 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false);
}
private void testNGrams(int minGram, int maxGram, int length) throws IOException {
private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {
final String s = RandomStrings.randomAsciiOfLength(random(), length);
final TokenStream grams = new NGramTokenizer(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram);
testNGrams(minGram, maxGram, s, nonTokenChars);
}
private static void testNGrams(int minGram, int maxGram, String s, String nonTokenChars) throws IOException {
testNGrams(minGram, maxGram, s, nonTokenChars, false);
}
static int[] toCodePoints(CharSequence s) {
final int[] codePoints = new int[Character.codePointCount(s, 0, s.length())];
for (int i = 0, j = 0; i < s.length(); ++j) {
codePoints[j] = Character.codePointAt(s, i);
i += Character.charCount(codePoints[j]);
}
return codePoints;
}
static boolean isTokenChar(String nonTokenChars, int codePoint) {
for (int i = 0; i < nonTokenChars.length(); ) {
final int cp = nonTokenChars.codePointAt(i);
if (cp == codePoint) {
return false;
}
i += Character.charCount(cp);
}
return true;
}
static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
// convert the string to code points
final int[] codePoints = toCodePoints(s);
final int[] offsets = new int[codePoints.length + 1];
for (int i = 0; i < codePoints.length; ++i) {
offsets[i+1] = offsets[i] + Character.charCount(codePoints[i]);
}
final TokenStream grams = new NGramTokenizer(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram, edgesOnly) {
@Override
protected boolean isTokenChar(int chr) {
return nonTokenChars.indexOf(chr) < 0;
}
};
final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
grams.reset();
for (int start = 0; start < s.length(); ++start) {
for (int end = start + minGram; end <= start + maxGram && end <= s.length(); ++end) {
for (int start = 0; start < codePoints.length; ++start) {
nextGram:
for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
// not on an edge
continue nextGram;
}
for (int j = start; j < end; ++j) {
if (!isTokenChar(nonTokenChars, codePoints[j])) {
continue nextGram;
}
}
assertTrue(grams.incrementToken());
assertEquals(s.substring(start, end), termAtt.toString());
assertArrayEquals(Arrays.copyOfRange(codePoints, start, end), toCodePoints(termAtt));
assertEquals(1, posIncAtt.getPositionIncrement());
assertEquals(start, offsetAtt.startOffset());
assertEquals(end, offsetAtt.endOffset());
assertEquals(1, posLenAtt.getPositionLength());
assertEquals(offsets[start], offsetAtt.startOffset());
assertEquals(offsets[end], offsetAtt.endOffset());
}
}
assertFalse(grams.incrementToken());
grams.end();
assertEquals(s.length(), offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
@ -141,14 +195,47 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
// test sliding
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024));
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
}
public void testLargeMaxGram() throws IOException {
// test sliding with maxGram > 1024
final int minGram = _TestUtil.nextInt(random(), 1200, 1300);
final int minGram = _TestUtil.nextInt(random(), 1290, 1300);
final int maxGram = _TestUtil.nextInt(random(), minGram, 1300);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024));
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
}
public void testPreTokenization() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "a");
}
public void testHeavyPreTokenization() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "abcdef");
}
public void testFewTokenChars() throws IOException {
final char[] chrs = new char[_TestUtil.nextInt(random(), 4000, 5000)];
Arrays.fill(chrs, ' ');
for (int i = 0; i < chrs.length; ++i) {
if (random().nextFloat() < 0.1) {
chrs[i] = 'a';
}
}
final int minGram = _TestUtil.nextInt(random(), 1, 2);
final int maxGram = _TestUtil.nextInt(random(), minGram, 2);
testNGrams(minGram, maxGram, new String(chrs), " ");
}
public void testFullUTF8Range() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
final String s = _TestUtil.randomUnicodeString(random(), 4 * 1024);
testNGrams(minGram, maxGram, s, "");
testNGrams(minGram, maxGram, s, "abcdef");
}
}

View File

@ -20,10 +20,13 @@ package org.apache.lucene.analysis.util;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;
import org.junit.Test;
/**
@ -31,32 +34,6 @@ import org.junit.Test;
*/
public class TestCharacterUtils extends LuceneTestCase {
@Test
public void testCodePointAtCharArrayInt() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));
assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3));
assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3));
try {
java4.codePointAt(highSurrogateAt3, 4);
fail("array index out of bounds");
} catch (IndexOutOfBoundsException e) {
}
CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT);
assertEquals((int) 'A', java5.codePointAt(cpAt3, 0));
assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
cpAt3, 3));
assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3));
try {
java5.codePointAt(highSurrogateAt3, 4);
fail("array index out of bounds");
} catch (IndexOutOfBoundsException e) {
}
}
@Test
public void testCodePointAtCharSequenceInt() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
@ -98,7 +75,68 @@ public class TestCharacterUtils extends LuceneTestCase {
assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
cpAt3, 3, 5));
assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3, 4));
}
@Test
public void testCodePointCount() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT);
final String s = _TestUtil.randomUnicodeString(random());
assertEquals(s.length(), java4.codePointCount(s));
assertEquals(Character.codePointCount(s, 0, s.length()), java5.codePointCount(s));
}
@Test
public void testOffsetByCodePoint() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT);
for (int i = 0; i < 10; ++i) {
final char[] s = _TestUtil.randomUnicodeString(random()).toCharArray();
final int index = _TestUtil.nextInt(random(), 0, s.length);
final int offset = random().nextInt(7) - 3;
try {
final int o = java4.offsetByCodePoints(s, 0, s.length, index, offset);
assertEquals(o, index + offset);
} catch (IndexOutOfBoundsException e) {
assertTrue((index + offset) < 0 || (index + offset) > s.length);
}
int o;
try {
o = java5.offsetByCodePoints(s, 0, s.length, index, offset);
} catch (IndexOutOfBoundsException e) {
try {
Character.offsetByCodePoints(s, 0, s.length, index, offset);
fail();
} catch (IndexOutOfBoundsException e2) {
// OK
}
o = -1;
}
if (o >= 0) {
assertEquals(Character.offsetByCodePoints(s, 0, s.length, index, offset), o);
}
}
}
public void testConversions() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT);
testConversions(java4);
testConversions(java5);
}
private void testConversions(CharacterUtils charUtils) {
final char[] orig = _TestUtil.randomUnicodeString(random(), 100).toCharArray();
final int[] buf = new int[orig.length];
final char[] restored = new char[buf.length];
final int o1 = random().nextInt(5);
final int o2 = _TestUtil.nextInt(random(), 0, o1);
final int o3 = _TestUtil.nextInt(random(), 0, o1);
final int codePointCount = charUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
final int charCount = charUtils.toChars(buf, o2, codePointCount, restored, o3);
assertEquals(orig.length - o1, charCount);
assertArrayEquals(Arrays.copyOfRange(orig, o1, o1 + charCount), Arrays.copyOfRange(restored, o3, o3 + charCount));
}
@Test
@ -132,7 +170,7 @@ public class TestCharacterUtils extends LuceneTestCase {
assertEquals(0, buffer.getOffset());
assertEquals(6, buffer.getLength());
assertEquals("hellow", new String(buffer.getBuffer()));
assertTrue(instance.fill(buffer,reader));
assertFalse(instance.fill(buffer,reader));
assertEquals(4, buffer.getLength());
assertEquals(0, buffer.getOffset());
@ -159,15 +197,12 @@ public class TestCharacterUtils extends LuceneTestCase {
assertEquals(4, buffer.getLength());
assertEquals("123\ud801", new String(buffer.getBuffer(),
buffer.getOffset(), buffer.getLength()));
assertTrue(instance.fill(buffer, reader));
assertEquals(2, buffer.getLength());
assertEquals("\ud801\udc1c", new String(buffer.getBuffer(), buffer
.getOffset(), buffer.getLength()));
assertTrue(instance.fill(buffer, reader));
assertEquals(1, buffer.getLength());
assertEquals("\ud801", new String(buffer.getBuffer(), buffer
assertFalse(instance.fill(buffer, reader));
assertEquals(3, buffer.getLength());
assertEquals("\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
.getOffset(), buffer.getLength()));
assertFalse(instance.fill(buffer, reader));
assertEquals(0, buffer.getLength());
}
@Test

View File

@ -183,7 +183,10 @@
<forbidden-apis internalRuntimeForbidden="true" classpathref="forbidden-apis.classpath">
<bundledSignatures name="jdk-unsafe-${javac.target}"/>
<bundledSignatures name="jdk-deprecated-${javac.target}"/>
<signaturesFileSet file="${common.dir}/tools/forbiddenApis/executors.txt"/>
<signaturesFileSet dir="${common.dir}/tools/forbiddenApis">
<include name="executors.txt" />
<include name="chars.txt" />
</signaturesFileSet>
<fileset dir="${basedir}/build" includes="**/*.class" />
</forbidden-apis>
</target>

View File

@ -808,7 +808,7 @@ public final class Util {
final int charLimit = offset + length;
while(charIdx < charLimit) {
scratch.grow(intIdx+1);
final int utf32 = Character.codePointAt(s, charIdx);
final int utf32 = Character.codePointAt(s, charIdx, charLimit);
scratch.ints[intIdx] = utf32;
charIdx += Character.charCount(utf32);
intIdx++;

View File

@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
java.lang.Character#codePointBefore(char[],int) @ Implicit start offset is error-prone when the char[] is a buffer and the first chars are random chars
java.lang.Character#codePointAt(char[],int) @ Implicit end offset is error-prone when the char[] is a buffer and the last chars are random chars
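
To illustrate why the implicit-end-offset overload is banned, here is a hedged sketch (not part of the commit) of how it can combine a valid high surrogate with stale buffer contents, while the explicit-limit overload stops at the valid length:

char[] buffer = {'a', 'b', '\uD801', '\uDC1C'}; // index 3 holds a stale low surrogate
int validLen = 3;                               // only the first 3 chars are current
// banned: reads past the valid region and returns U+1041C by combining with the stale char
int wrong = Character.codePointAt(buffer, 2);
// safe: stops at validLen and returns the lone high surrogate U+D801
int right = Character.codePointAt(buffer, 2, validLen);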

View File

@ -266,6 +266,7 @@
<bundledSignatures name="commons-io-unsafe-${commons-io.version}"/>
<signaturesFileSet dir="${common.dir}/tools/forbiddenApis">
<include name="executors.txt" />
<include name="chars.txt" />
<include name="servlet-api.txt" />
</signaturesFileSet>
<fileset dir="${basedir}/build">