Import the new n-gram tokenizers and filters from Lucene.

Lucene 4.4 will feature new n-gram tokenizers and filters that no longer
generate broken offsets (which caused highlighting bugs). They also handle
supplementary characters correctly, and the tokenizers can work in a
streaming fashion (they are no longer limited to the first 1024 chars of the
stream).
Adrien Grand 2013-06-13 23:28:06 +02:00
parent a388588b1f
commit fccbe9c185
14 changed files with 1619 additions and 124 deletions
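
A minimal usage sketch (not part of this commit) of how the backported streaming tokenizer is consumed; Version.LUCENE_43 and the expected token order are inferred from the code in the files below.

import java.io.StringReader;

import org.apache.lucene.analysis.ngram.XNGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class XNGramTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // 1-2 grams over "abcd"; grams are emitted by increasing start offset: a, ab, b, bc, c, cd, d
        XNGramTokenizer tokenizer = new XNGramTokenizer(Version.LUCENE_43, new StringReader("abcd"), 1, 2);
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
        }
        tokenizer.end();
        tokenizer.close();
    }
}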

View File

@@ -0,0 +1,214 @@
package org.apache.lucene.analysis.ngram;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.elasticsearch.common.lucene.Lucene;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.XCharacterUtils;
import org.apache.lucene.util.Version;
import java.io.IOException;
/**
* Tokenizes the given token into n-grams of given size(s).
* <p>
* This {@link TokenFilter} creates n-grams from the beginning edge or ending edge of an input token.
* <p><a name="version"/>As of Lucene 4.4, this filter does not support
* {@link Side#BACK} (you can use {@link ReverseStringFilter} up-front and
* afterward to get the same behavior), handles supplementary characters
* correctly and does not update offsets anymore.
*/
public final class XEdgeNGramTokenFilter extends TokenFilter {
static {
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
}
public static final Side DEFAULT_SIDE = Side.FRONT;
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
/** Specifies which side of the input the n-gram should be generated from */
public static enum Side {
/** Get the n-gram from the front of the input */
FRONT {
@Override
public String getLabel() { return "front"; }
},
/** Get the n-gram from the end of the input */
@Deprecated
BACK {
@Override
public String getLabel() { return "back"; }
};
public abstract String getLabel();
// Get the appropriate Side from a string
public static Side getSide(String sideName) {
if (FRONT.getLabel().equals(sideName)) {
return FRONT;
}
if (BACK.getLabel().equals(sideName)) {
return BACK;
}
return null;
}
}
private final XCharacterUtils charUtils;
private final int minGram;
private final int maxGram;
private Side side;
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curGramSize;
private int tokStart;
private int tokEnd;
private int savePosIncr;
private int savePosLen;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
/**
* Creates XEdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
*
* @param version the <a href="#version">Lucene match version</a>
* @param input {@link TokenStream} holding the input to be tokenized
* @param side the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
@Deprecated
public XEdgeNGramTokenFilter(Version version, TokenStream input, Side side, int minGram, int maxGram) {
super(input);
if (version == null) {
throw new IllegalArgumentException("version must not be null");
}
if (side == Side.BACK) {
throw new IllegalArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
}
if (side == null) {
throw new IllegalArgumentException("sideLabel must be either front or back");
}
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
if (minGram > maxGram) {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
this.charUtils = XCharacterUtils.getInstance(version);
this.minGram = minGram;
this.maxGram = maxGram;
this.side = side;
}
/**
* Creates XEdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
*
* @param version the <a href="#version">Lucene match version</a>
* @param input {@link TokenStream} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
@Deprecated
public XEdgeNGramTokenFilter(Version version, TokenStream input, String sideLabel, int minGram, int maxGram) {
this(version, input, Side.getSide(sideLabel), minGram, maxGram);
}
/**
* Creates XEdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
*
* @param version the <a href="#version">Lucene match version</a>
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public XEdgeNGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
this(version, input, Side.FRONT, minGram, maxGram);
}
@Override
public final boolean incrementToken() throws IOException {
while (true) {
if (curTermBuffer == null) {
if (!input.incrementToken()) {
return false;
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
savePosIncr += posIncrAtt.getPositionIncrement();
savePosLen = posLenAtt.getPositionLength();
}
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
final int start = side == Side.FRONT ? 0 : charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
clearAttributes();
offsetAtt.setOffset(tokStart, tokEnd);
// first ngram gets increment, others don't
if (curGramSize == minGram) {
posIncrAtt.setPositionIncrement(savePosIncr);
savePosIncr = 0;
} else {
posIncrAtt.setPositionIncrement(0);
}
posLenAtt.setPositionLength(savePosLen);
termAtt.copyBuffer(curTermBuffer, start, end - start);
curGramSize++;
return true;
}
}
curTermBuffer = null;
}
}
@Override
public void reset() throws IOException {
super.reset();
curTermBuffer = null;
savePosIncr = 0;
}
}
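
As the javadoc above suggests, the removed Side.BACK can be emulated by reversing before and after the filter. A sketch (the WhitespaceTokenizer is an arbitrary choice; this mirrors what EdgeNGramTokenFilterFactory does further down in this commit):

import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.XEdgeNGramTokenFilter;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.util.Version;

class BackEdgeNGramChain {
    static TokenStream backEdgeNGrams(Reader reader, int minGram, int maxGram) {
        TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_43, reader);
        stream = new ReverseStringFilter(Version.LUCENE_43, stream);                     // reverse each token
        stream = new XEdgeNGramTokenFilter(Version.LUCENE_43, stream, minGram, maxGram); // front grams of the reversed token
        return new ReverseStringFilter(Version.LUCENE_43, stream);                       // reverse back => back-edge grams
    }
}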

View File

@@ -0,0 +1,77 @@
package org.apache.lucene.analysis.ngram;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.elasticsearch.common.lucene.Lucene;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.Version;
/**
* Tokenizes the input from an edge into n-grams of given size(s).
* <p>
* This {@link Tokenizer} creates n-grams from the beginning edge or ending edge of an input token.
* <p><a name="version" /> As of Lucene 4.4, this tokenizer<ul>
* <li>can handle <code>maxGram</code> larger than 1024 chars, but beware that this will result in increased memory usage
* <li>doesn't trim the input,
* <li>sets position increments equal to 1 instead of 1 for the first token and 0 for all other ones
* <li>doesn't support backward n-grams anymore.
* <li>supports {@link #isTokenChar(int) pre-tokenization},
* <li>correctly handles supplementary characters.
* </ul>
* <p>Although <b style="color:red">highly</b> discouraged, it is still possible
* to use the old behavior through {@link XLucene43EdgeNGramTokenizer}.
*/
public class XEdgeNGramTokenizer extends XNGramTokenizer {
static {
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
}
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
/**
* Creates XEdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param version the <a href="#version">Lucene match version</a>
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public XEdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
super(version, input, minGram, maxGram, true);
}
/**
* Creates XEdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param version the <a href="#version">Lucene match version</a>
* @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public XEdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
super(version, factory, input, minGram, maxGram, true);
}
}
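
A short sketch (not part of the commit) of what this tokenizer emits; the expected prefixes follow from the edges-only mode of XNGramTokenizer shown further below.

import java.io.StringReader;

import org.apache.lucene.analysis.ngram.XEdgeNGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class XEdgeNGramTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // Front-edge grams of "elastic" with sizes 2..4: "el", "ela", "elas"
        XEdgeNGramTokenizer tokenizer = new XEdgeNGramTokenizer(Version.LUCENE_43, new StringReader("elastic"), 2, 4);
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term);
        }
        tokenizer.end();
        tokenizer.close();
    }
}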

View File

@@ -0,0 +1,281 @@
package org.apache.lucene.analysis.ngram;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.elasticsearch.common.lucene.Lucene;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.Version;
/**
* Old version of {@link EdgeNGramTokenizer} which doesn't correctly handle
* supplementary characters.
*/
@Deprecated
public final class XLucene43EdgeNGramTokenizer extends Tokenizer {
static {
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
}
public static final Side DEFAULT_SIDE = Side.FRONT;
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/** Specifies which side of the input the n-gram should be generated from */
public static enum Side {
/** Get the n-gram from the front of the input */
FRONT {
@Override
public String getLabel() { return "front"; }
},
/** Get the n-gram from the end of the input */
BACK {
@Override
public String getLabel() { return "back"; }
};
public abstract String getLabel();
// Get the appropriate Side from a string
public static Side getSide(String sideName) {
if (FRONT.getLabel().equals(sideName)) {
return FRONT;
}
if (BACK.getLabel().equals(sideName)) {
return BACK;
}
return null;
}
}
private int minGram;
private int maxGram;
private int gramSize;
private Side side;
private boolean started;
private int inLen; // length of the input AFTER trim()
private int charsRead; // length of the input
private String inStr;
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param version the <a href="#version">Lucene match version</a>
* @param input {@link Reader} holding the input to be tokenized
* @param side the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
@Deprecated
public XLucene43EdgeNGramTokenizer(Version version, Reader input, Side side, int minGram, int maxGram) {
super(input);
init(version, side, minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param version the <a href="#version">Lucene match version</a>
* @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
* @param side the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
@Deprecated
public XLucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) {
super(factory, input);
init(version, side, minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param version the <a href="#version">Lucene match version</a>
* @param input {@link Reader} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
@Deprecated
public XLucene43EdgeNGramTokenizer(Version version, Reader input, String sideLabel, int minGram, int maxGram) {
this(version, input, Side.getSide(sideLabel), minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param version the <a href="#version">Lucene match version</a>
* @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
* @param sideLabel the name of the {@link Side} from which to chop off an n-gram
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
@Deprecated
public XLucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, String sideLabel, int minGram, int maxGram) {
this(version, factory, input, Side.getSide(sideLabel), minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param version the <a href="#version">Lucene match version</a>
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public XLucene43EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
this(version, input, Side.FRONT, minGram, maxGram);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
* @param version the <a href="#version">Lucene match version</a>
* @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public XLucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
this(version, factory, input, Side.FRONT, minGram, maxGram);
}
private void init(Version version, Side side, int minGram, int maxGram) {
if (version == null) {
throw new IllegalArgumentException("version must not be null");
}
if (side == null) {
throw new IllegalArgumentException("sideLabel must be either front or back");
}
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
if (minGram > maxGram) {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
maxGram = Math.min(maxGram, 1024);
this.minGram = minGram;
this.maxGram = maxGram;
this.side = side;
}
/** Returns the next token in the stream, or null at EOS. */
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
// if we are just starting, read the whole input
if (!started) {
started = true;
gramSize = minGram;
final int limit = side == Side.FRONT ? maxGram : 1024;
char[] chars = new char[Math.min(1024, limit)];
charsRead = 0;
// TODO: refactor to a shared readFully somewhere:
boolean exhausted = false;
while (charsRead < limit) {
final int inc = input.read(chars, charsRead, chars.length-charsRead);
if (inc == -1) {
exhausted = true;
break;
}
charsRead += inc;
if (charsRead == chars.length && charsRead < limit) {
chars = ArrayUtil.grow(chars);
}
}
inStr = new String(chars, 0, charsRead);
inStr = inStr.trim();
if (!exhausted) {
// Read extra throwaway chars so that on end() we
// report the correct offset:
char[] throwaway = new char[1024];
while(true) {
final int inc = input.read(throwaway, 0, throwaway.length);
if (inc == -1) {
break;
}
charsRead += inc;
}
}
inLen = inStr.length();
if (inLen == 0) {
return false;
}
posIncrAtt.setPositionIncrement(1);
} else {
posIncrAtt.setPositionIncrement(0);
}
// if the remaining input is too short, we can't generate any n-grams
if (gramSize > inLen) {
return false;
}
// if we have hit the end of our n-gram size range, quit
if (gramSize > maxGram || gramSize > inLen) {
return false;
}
// grab gramSize chars from front or back
int start = side == Side.FRONT ? 0 : inLen - gramSize;
int end = start + gramSize;
termAtt.setEmpty().append(inStr, start, end);
offsetAtt.setOffset(correctOffset(start), correctOffset(end));
gramSize++;
return true;
}
@Override
public void end() {
// set final offset
final int finalOffset = correctOffset(charsRead);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset() throws IOException {
super.reset();
started = false;
}
}

View File

@@ -0,0 +1,164 @@
package org.apache.lucene.analysis.ngram;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.lucene.Lucene;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
* Old broken version of {@link NGramTokenizer}.
*/
@Deprecated
public final class XLucene43NGramTokenizer extends Tokenizer {
static {
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
}
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
private int minGram, maxGram;
private int gramSize;
private int pos;
private int inLen; // length of the input AFTER trim()
private int charsRead; // length of the input
private String inStr;
private boolean started;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public XLucene43NGramTokenizer(Reader input, int minGram, int maxGram) {
super(input);
init(minGram, maxGram);
}
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
* @param input {@link Reader} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public XLucene43NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
super(factory, input);
init(minGram, maxGram);
}
/**
* Creates NGramTokenizer with default min and max n-grams.
* @param input {@link Reader} holding the input to be tokenized
*/
public XLucene43NGramTokenizer(Reader input) {
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}
private void init(int minGram, int maxGram) {
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
if (minGram > maxGram) {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
this.minGram = minGram;
this.maxGram = maxGram;
}
/** Returns the next token in the stream, or null at EOS. */
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
if (!started) {
started = true;
gramSize = minGram;
char[] chars = new char[1024];
charsRead = 0;
// TODO: refactor to a shared readFully somewhere:
while (charsRead < chars.length) {
int inc = input.read(chars, charsRead, chars.length-charsRead);
if (inc == -1) {
break;
}
charsRead += inc;
}
inStr = new String(chars, 0, charsRead).trim(); // trim leading and trailing whitespace
if (charsRead == chars.length) {
// Read extra throwaway chars so that on end() we
// report the correct offset:
char[] throwaway = new char[1024];
while(true) {
final int inc = input.read(throwaway, 0, throwaway.length);
if (inc == -1) {
break;
}
charsRead += inc;
}
}
inLen = inStr.length();
if (inLen == 0) {
return false;
}
}
if (pos+gramSize > inLen) { // if we hit the end of the string
pos = 0; // reset to beginning of string
gramSize++; // increase n-gram size
if (gramSize > maxGram) // we are done
return false;
if (pos+gramSize > inLen)
return false;
}
int oldPos = pos;
pos++;
termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize);
offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
return true;
}
@Override
public void end() {
// set final offset
final int finalOffset = correctOffset(charsRead);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset() throws IOException {
super.reset();
started = false;
pos = 0;
}
}

View File

@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/
import org.elasticsearch.common.lucene.Lucene;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
@@ -26,14 +28,15 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.XCharacterUtils;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.lucene.Lucene;
/**
* Tokenizes the input into n-grams of the given size(s).
* <a name="version"/>
* <p>You must specify the required {@link Version} compatibility when
* creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filters:<ul>
* creating a {@link XNGramTokenFilter}. As of Lucene 4.4, this token filters:<ul>
* <li>handles supplementary characters correctly,</li>
* <li>emits all n-grams for the same token at the same position,</li>
* <li>does not modify offsets,</li>
* <li>sorts n-grams by their offset in the original token first, then
@@ -43,13 +46,18 @@ import org.elasticsearch.common.lucene.Lucene;
* {@link Version#LUCENE_44} in the constructor but this is not recommended as
* it will lead to broken {@link TokenStream}s that will cause highlighting
* bugs.
* <p>If you were using this {@link TokenFilter} to perform partial highlighting,
* this won't work anymore since this filter doesn't update offsets. You should
* modify your analysis chain to use {@link NGramTokenizer}, and potentially
* override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
*/
public final class XNGramTokenFilter extends TokenFilter {
static {
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1476563
assert Lucene.VERSION.ordinal() < Version.LUCENE_42.ordinal()+2 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this should can be removed";
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
}
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
@@ -57,21 +65,21 @@ public final class XNGramTokenFilter extends TokenFilter {
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curGramSize;
private int curPos;
private int curPosInc, curPosLen;
private int tokStart;
private int tokEnd;
private boolean hasIllegalOffsets; // only if the length changed before this filter
private final Version version;
private final XCharacterUtils charUtils;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLenAtt;
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/**
* Creates NGramTokenFilter with given min and max n-grams.
* Creates XNGramTokenFilter with given min and max n-grams.
* @param version Lucene version to enable correct position increments.
* See <a href="#version">above</a> for details.
* @param input {@link TokenStream} holding the input to be tokenized
@@ -80,7 +88,7 @@ public final class XNGramTokenFilter extends TokenFilter {
*/
public XNGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE));
this.version = version;
this.charUtils = XCharacterUtils.getInstance(version);
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -89,31 +97,12 @@ }
}
this.minGram = minGram;
this.maxGram = maxGram;
if (version.onOrAfter(Version.LUCENE_42)) {
posIncAtt = addAttribute(PositionIncrementAttribute.class);
posLenAtt = addAttribute(PositionLengthAttribute.class);
} else {
posIncAtt = new PositionIncrementAttribute() {
@Override
public void setPositionIncrement(int positionIncrement) {}
@Override
public int getPositionIncrement() {
return 0;
}
};
posLenAtt = new PositionLengthAttribute() {
@Override
public void setPositionLength(int positionLength) {}
@Override
public int getPositionLength() {
return 0;
}
};
}
posIncAtt = addAttribute(PositionIncrementAttribute.class);
posLenAtt = addAttribute(PositionLengthAttribute.class);
}
/**
* Creates NGramTokenFilter with default min and max n-grams.
* Creates XNGramTokenFilter with default min and max n-grams.
* @param version Lucene version to enable correct position increments.
* See <a href="#version">above</a> for details.
* @param input {@link TokenStream} holding the input to be tokenized
@@ -132,25 +121,24 @@ public final class XNGramTokenFilter extends TokenFilter {
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
curPosLen = posLenAtt.getPositionLength();
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
// if length by start + end offsets doesn't match the term text then assume
// this is a synonym and don't adjust the offsets.
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
}
}
if (version.onOrAfter(Version.LUCENE_42)) {
if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
++curPos;
curGramSize = minGram;
}
if (curPos + curGramSize <= curTermLength) {
if ((curPos + curGramSize) <= curCodePointCount) {
clearAttributes();
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;
posLenAtt.setPositionLength(curPosLen);
@@ -158,23 +146,6 @@ public final class XNGramTokenFilter extends TokenFilter {
curGramSize++;
return true;
}
} else {
while (curGramSize <= maxGram) {
while (curPos+curGramSize <= curTermLength) { // while there is input
clearAttributes();
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
if (hasIllegalOffsets) {
offsetAtt.setOffset(tokStart, tokEnd);
} else {
offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
}
curPos++;
return true;
}
curGramSize++; // increase n-gram size
curPos = 0;
}
}
curTermBuffer = null;
}
}
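
A sketch of a typical chain around the backported filter (the StandardTokenizer is an arbitrary choice); per the updated javadoc, each gram is emitted at its source token's position and keeps that token's offsets.

import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.XNGramTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

class NGramFilterChain {
    static TokenStream nGrams(Reader reader) {
        TokenStream stream = new StandardTokenizer(Version.LUCENE_43, reader);
        // every 1-2 gram of a token shares the token's position and offsets
        return new XNGramTokenFilter(Version.LUCENE_43, stream, 1, 2);
    }
}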

View File

@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/
import org.elasticsearch.common.lucene.Lucene;
import java.io.IOException;
import java.io.Reader;
@@ -25,8 +27,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.XCharacterUtils;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.lucene.Lucene;
/**
* Tokenizes the input into n-grams of the given size(s).
@@ -41,34 +43,53 @@ import org.elasticsearch.common.lucene.Lucene;
* <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
* </table>
* <a name="version"/>
* <p>Before Lucene 4.4, this class had a different behavior:<ul>
* <li>It didn't support more than 1024 chars of input, the rest was trashed.</li>
* <li>The last whitespaces of the 1024 chars block were trimmed.</li>
* <li>Tokens were emitted in a different order (by increasing lengths).</li></ul>
* <p>Although highly discouraged, it is still possible to use the old behavior
* through {@link Lucene43NGramTokenizer}.
* <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
* <li>tokenize in a streaming fashion to support streams which are larger
* than 1024 chars (limit of the previous version),
* <li>count grams based on unicode code points instead of java chars (and
* never split in the middle of surrogate pairs),
* <li>give the ability to {@link #isTokenChar(int) pre-tokenize} the stream
* before computing n-grams.</ul>
* <p>Additionally, this class doesn't trim trailing whitespaces and emits
* tokens in a different order: tokens are now emitted by increasing start
* offsets, while they used to be emitted by increasing lengths (which prevented
* supporting large input streams).
* <p>Although <b style="color:red">highly</b> discouraged, it is still possible
* to use the old behavior through {@link Lucene43NGramTokenizer}.
*/
public final class XNGramTokenizer extends Tokenizer {
// non-final to allow for overriding isTokenChar, but all other methods should be final
public class XNGramTokenizer extends Tokenizer {
static {
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
}
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
static {
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1476563
assert Lucene.VERSION.ordinal() < Version.LUCENE_42.ordinal()+2 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this should can be removed";
}
private char[] buffer;
private int bufferStart, bufferEnd; // remaining slice of the buffer
private XCharacterUtils charUtils;
private XCharacterUtils.CharacterBuffer charBuffer;
private int[] buffer; // like charBuffer, but converted to code points
private int bufferStart, bufferEnd; // remaining slice in buffer
private int offset;
private int gramSize;
private int minGram, maxGram;
private boolean exhausted;
private int lastCheckedChar; // last offset in the buffer that we checked
private int lastNonTokenChar; // last offset that we found to not be a token char
private boolean edgesOnly; // leading edges n-grams only
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
XNGramTokenizer(Version version, Reader input, int minGram, int maxGram, boolean edgesOnly) {
super(input);
init(version, minGram, maxGram, edgesOnly);
}
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param version the lucene compatibility <a href="#version">version</a>
@@ -77,8 +98,12 @@ public final class XNGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public XNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
super(input);
init(version, minGram, maxGram);
this(version, input, minGram, maxGram, false);
}
XNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram, boolean edgesOnly) {
super(factory, input);
init(version, minGram, maxGram, edgesOnly);
}
/**
@@ -90,8 +115,7 @@ public final class XNGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public XNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
super(factory, input);
init(version, minGram, maxGram);
this(version, factory, input, minGram, maxGram, false);
}
/**
@@ -103,10 +127,13 @@ public final class XNGramTokenizer extends Tokenizer {
this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}
private void init(Version version, int minGram, int maxGram) {
if (!version.onOrAfter(Version.LUCENE_42)) {
throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
private void init(Version version, int minGram, int maxGram, boolean edgesOnly) {
if (!version.onOrAfter(Version.LUCENE_43)) {
throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
}
charUtils = version.onOrAfter(Version.LUCENE_43)
? XCharacterUtils.getInstance(version)
: XCharacterUtils.getJava4Instance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -115,66 +142,107 @@ public final class XNGramTokenizer extends Tokenizer {
}
this.minGram = minGram;
this.maxGram = maxGram;
buffer = new char[maxGram + 1024];
this.edgesOnly = edgesOnly;
charBuffer = XCharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
buffer = new int[charBuffer.getBuffer().length];
// Make the term att large enough
termAtt.resizeBuffer(2 * maxGram);
}
/** Returns the next token in the stream, or null at EOS. */
@Override
public boolean incrementToken() throws IOException {
public final boolean incrementToken() throws IOException {
clearAttributes();
// compact
if (bufferStart >= buffer.length - maxGram) {
System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
bufferEnd -= bufferStart;
bufferStart = 0;
// termination of this loop is guaranteed by the fact that every iteration
// either advances the buffer (calls consume()) or increases gramSize
while (true) {
// compact
if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) {
System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
bufferEnd -= bufferStart;
lastCheckedChar -= bufferStart;
lastNonTokenChar -= bufferStart;
bufferStart = 0;
// fill in remaining space
if (!exhausted) {
// TODO: refactor to a shared readFully
while (bufferEnd < buffer.length) {
final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd);
if (read == -1) {
exhausted = true;
break;
}
bufferEnd += read;
// fill in remaining space
exhausted = !charUtils.fill(charBuffer, input, buffer.length - bufferEnd);
// convert to code points
bufferEnd += charUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
}
// should we go to the next offset?
if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) {
if (bufferStart + 1 + minGram > bufferEnd) {
assert exhausted;
return false;
}
consume();
gramSize = minGram;
}
updateLastNonTokenChar();
// retry if the token to be emitted was going to not only contain token chars
final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) {
consume();
gramSize = minGram;
continue;
}
final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
termAtt.setLength(length);
posIncAtt.setPositionIncrement(1);
posLenAtt.setPositionLength(1);
offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
++gramSize;
return true;
}
}
private void updateLastNonTokenChar() {
final int termEnd = bufferStart + gramSize - 1;
if (termEnd > lastCheckedChar) {
for (int i = termEnd; i > lastCheckedChar; --i) {
if (!isTokenChar(buffer[i])) {
lastNonTokenChar = i;
break;
}
}
lastCheckedChar = termEnd;
}
}
// should we go to the next offset?
if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
bufferStart++;
offset++;
gramSize = minGram;
}
/** Consume one code point. */
private void consume() {
offset += Character.charCount(buffer[bufferStart++]);
}
// are there enough chars remaining?
if (bufferStart + gramSize > bufferEnd) {
return false;
}
termAtt.copyBuffer(buffer, bufferStart, gramSize);
posIncAtt.setPositionIncrement(1);
posLenAtt.setPositionLength(1);
offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize));
++gramSize;
/** Only collect characters which satisfy this condition. */
protected boolean isTokenChar(int chr) {
return true;
}
@Override
public void end() {
final int endOffset = correctOffset(offset + bufferEnd - bufferStart);
public final void end() {
assert bufferStart <= bufferEnd;
int endOffset = offset;
for (int i = bufferStart; i < bufferEnd; ++i) {
endOffset += Character.charCount(buffer[i]);
}
endOffset = correctOffset(endOffset);
offsetAtt.setOffset(endOffset, endOffset);
}
@Override
public void reset() throws IOException {
public final void reset() throws IOException {
super.reset();
bufferStart = bufferEnd = buffer.length;
lastNonTokenChar = lastCheckedChar = bufferStart - 1;
offset = 0;
gramSize = minGram;
exhausted = false;
charBuffer.reset();
}
}
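
Since isTokenChar(int) is the pre-tokenization hook mentioned above, a hypothetical subclass (name and behavior are illustrative, not from the commit) restricting grams to letters could look like:

import java.io.Reader;

import org.apache.lucene.analysis.ngram.XNGramTokenizer;
import org.apache.lucene.util.Version;

class LetterOnlyNGramTokenizer extends XNGramTokenizer {
    LetterOnlyNGramTokenizer(Reader input, int minGram, int maxGram) {
        super(Version.LUCENE_43, input, minGram, maxGram);
    }

    @Override
    protected boolean isTokenChar(int chr) {
        // grams never contain or cross a non-letter code point
        return Character.isLetter(chr);
    }
}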

View File

@@ -0,0 +1,394 @@
package org.apache.lucene.analysis.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.elasticsearch.common.lucene.Lucene;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.util.Version;
/**
* {@link XCharacterUtils} provides a unified interface to Character-related
* operations to implement backwards compatible character operations based on a
* {@link Version} instance.
*
* @lucene.internal
*/
public abstract class XCharacterUtils {
static {
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1492640.
assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
}
private static final Java4XCharacterUtils JAVA_4 = new Java4XCharacterUtils();
private static final Java5XCharacterUtils JAVA_5 = new Java5XCharacterUtils();
/**
* Returns a {@link XCharacterUtils} implementation according to the given
* {@link Version} instance.
*
* @param matchVersion
* a version instance
* @return a {@link XCharacterUtils} implementation according to the given
* {@link Version} instance.
*/
public static XCharacterUtils getInstance(final Version matchVersion) {
return matchVersion.onOrAfter(Version.LUCENE_31) ? JAVA_5 : JAVA_4;
}
/** Return a {@link XCharacterUtils} instance compatible with Java 1.4. */
public static XCharacterUtils getJava4Instance() {
return JAVA_4;
}
/**
* Returns the code point at the given index of the {@link CharSequence}.
* Depending on the {@link Version} passed to
* {@link XCharacterUtils#getInstance(Version)} this method mimics the behavior
* of {@link Character#codePointAt(char[], int)} as it would have been
* available on a Java 1.4 JVM or on a later virtual machine version.
*
* @param seq
* a character sequence
* @param offset
* the offset to the char values in the chars array to be converted
*
* @return the Unicode code point at the given index
* @throws NullPointerException
* - if the sequence is null.
* @throws IndexOutOfBoundsException
* - if the value offset is negative or not less than the length of
* the character sequence.
*/
public abstract int codePointAt(final CharSequence seq, final int offset);
/**
* Returns the code point at the given index of the char array where only elements
* with index less than the limit are used.
* Depending on the {@link Version} passed to
* {@link XCharacterUtils#getInstance(Version)} this method mimics the behavior
* of {@link Character#codePointAt(char[], int)} as it would have been
* available on a Java 1.4 JVM or on a later virtual machine version.
*
* @param chars
* a character array
* @param offset
* the offset to the char values in the chars array to be converted
* @param limit the index after the last element that should be used to calculate
* codepoint.
*
* @return the Unicode code point at the given index
* @throws NullPointerException
* - if the array is null.
* @throws IndexOutOfBoundsException
* - if the value offset is negative or not less than the length of
* the char array.
*/
public abstract int codePointAt(final char[] chars, final int offset, final int limit);
/** Return the number of code points in <code>seq</code>. */
public abstract int codePointCount(CharSequence seq);
/**
* Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
* of the given bufferSize.
*
* @param bufferSize
* the internal char buffer size, must be <code>&gt;= 2</code>
* @return a new {@link CharacterBuffer} instance.
*/
public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
if (bufferSize < 2) {
throw new IllegalArgumentException("buffersize must be >= 2");
}
return new CharacterBuffer(new char[bufferSize], 0, 0);
}
/**
* Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting
* at the given offset.
* @param buffer the char buffer to lowercase
* @param offset the offset to start at
* @param limit the max char in the buffer to lower case
*/
public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
assert buffer.length >= limit;
assert offset >= 0 && offset <= buffer.length;
for (int i = offset; i < limit;) {
i += Character.toChars(
Character.toLowerCase(
codePointAt(buffer, i, limit)), buffer, i);
}
}
/** Converts a sequence of Java characters to a sequence of unicode code points.
* @return the number of code points written to the destination buffer */
public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
if (srcLen < 0) {
throw new IllegalArgumentException("srcLen must be >= 0");
}
int codePointCount = 0;
for (int i = 0; i < srcLen; ) {
final int cp = codePointAt(src, srcOff + i, srcOff + srcLen);
final int charCount = Character.charCount(cp);
dest[destOff + codePointCount++] = cp;
i += charCount;
}
return codePointCount;
}
/** Converts a sequence of unicode code points to a sequence of Java characters.
* @return the number of chars written to the destination buffer */
public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
if (srcLen < 0) {
throw new IllegalArgumentException("srcLen must be >= 0");
}
int written = 0;
for (int i = 0; i < srcLen; ++i) {
written += Character.toChars(src[srcOff + i], dest, destOff + written);
}
return written;
}
/**
* Fills the {@link CharacterBuffer} with characters read from the given
* reader {@link Reader}. This method tries to read <code>numChars</code>
* characters into the {@link CharacterBuffer}, each call to fill will start
* filling the buffer from offset <code>0</code> up to <code>numChars</code>.
* In case code points can span across 2 java characters, this method may
* only fill <code>numChars - 1</code> characters in order not to split in
* the middle of a surrogate pair, even if there are remaining characters in
* the {@link Reader}.
* <p>
* Depending on the {@link Version} passed to
* {@link XCharacterUtils#getInstance(Version)} this method implements
* supplementary character awareness when filling the given buffer. For all
* {@link Version} &gt; 3.0 {@link #fill(CharacterBuffer, Reader, int)} guarantees
* that the given {@link CharacterBuffer} will never contain a high surrogate
* character as the last element in the buffer unless it is the last available
* character in the reader. In other words, high and low surrogate pairs will
* always be preserved across buffer borders.
* </p>
* <p>
* A return value of <code>false</code> means that this method call exhausted
* the reader, but there may be some bytes which have been read, which can be
* verified by checking whether <code>buffer.getLength() &gt; 0</code>.
* </p>
*
* @param buffer
* the buffer to fill.
* @param reader
* the reader to read characters from.
* @param numChars
* the number of chars to read
* @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
* @throws IOException
* if the reader throws an {@link IOException}.
*/
public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;
/** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
return fill(buffer, reader, buffer.buffer.length);
}
/** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
* code points from <code>index</code>. */
public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);
static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
int read = 0;
while (read < len) {
final int r = reader.read(dest, offset + read, len - read);
if (r == -1) {
break;
}
read += r;
}
return read;
}
private static final class Java5XCharacterUtils extends XCharacterUtils {
Java5XCharacterUtils() {
}
@Override
public int codePointAt(final CharSequence seq, final int offset) {
return Character.codePointAt(seq, offset);
}
@Override
public int codePointAt(final char[] chars, final int offset, final int limit) {
return Character.codePointAt(chars, offset, limit);
}
@Override
public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
assert buffer.buffer.length >= 2;
if (numChars < 2 || numChars > buffer.buffer.length) {
throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
}
final char[] charBuffer = buffer.buffer;
buffer.offset = 0;
final int offset;
// Install the previously saved ending high surrogate:
if (buffer.lastTrailingHighSurrogate != 0) {
charBuffer[0] = buffer.lastTrailingHighSurrogate;
buffer.lastTrailingHighSurrogate = 0;
offset = 1;
} else {
offset = 0;
}
final int read = readFully(reader, charBuffer, offset, numChars - offset);
buffer.length = offset + read;
final boolean result = buffer.length == numChars;
if (buffer.length < numChars) {
// We failed to fill the buffer. Even if the last char is a high
// surrogate, there is nothing we can do
return result;
}
if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
}
return result;
}
@Override
public int codePointCount(CharSequence seq) {
return Character.codePointCount(seq, 0, seq.length());
}
@Override
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
return Character.offsetByCodePoints(buf, start, count, index, offset);
}
}
private static final class Java4XCharacterUtils extends XCharacterUtils {
Java4XCharacterUtils() {
}
@Override
public int codePointAt(final CharSequence seq, final int offset) {
return seq.charAt(offset);
}
@Override
public int codePointAt(final char[] chars, final int offset, final int limit) {
if(offset >= limit)
throw new IndexOutOfBoundsException("offset must be less than limit");
return chars[offset];
}
@Override
public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
throws IOException {
assert buffer.buffer.length >= 1;
if (numChars < 1 || numChars > buffer.buffer.length) {
throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
}
buffer.offset = 0;
final int read = readFully(reader, buffer.buffer, 0, numChars);
buffer.length = read;
buffer.lastTrailingHighSurrogate = 0;
return read == numChars;
}
@Override
public int codePointCount(CharSequence seq) {
return seq.length();
}
@Override
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
final int result = index + offset;
if (result < 0 || result > count) {
throw new IndexOutOfBoundsException();
}
return result;
}
}
/**
* A simple IO buffer to use with
* {@link XCharacterUtils#fill(CharacterBuffer, Reader)}.
*/
public static final class CharacterBuffer {
private final char[] buffer;
private int offset;
private int length;
// NOTE: not private so outer class can access without
// $access methods:
char lastTrailingHighSurrogate;
CharacterBuffer(char[] buffer, int offset, int length) {
this.buffer = buffer;
this.offset = offset;
this.length = length;
}
/**
* Returns the internal buffer
*
* @return the buffer
*/
public char[] getBuffer() {
return buffer;
}
/**
* Returns the data offset in the internal buffer.
*
* @return the offset
*/
public int getOffset() {
return offset;
}
/**
* Return the length of the data in the internal buffer starting at
* {@link #getOffset()}
*
* @return the length
*/
public int getLength() {
return length;
}
/**
* Resets the CharacterBuffer. All internals are reset to its default
* values.
*/
public void reset() {
offset = 0;
length = 0;
lastTrailingHighSurrogate = 0;
}
}
}
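
A small sketch (not part of the commit) of how the fill/toCodePoints pair is typically used; the supplementary character below is an arbitrary example.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.util.XCharacterUtils;
import org.apache.lucene.util.Version;

public class XCharacterUtilsDemo {
    public static void main(String[] args) throws IOException {
        XCharacterUtils charUtils = XCharacterUtils.getInstance(Version.LUCENE_43);
        XCharacterUtils.CharacterBuffer buffer = XCharacterUtils.newCharacterBuffer(64);
        // "a" followed by U+10400 (a surrogate pair): 3 chars but only 2 code points
        charUtils.fill(buffer, new StringReader("a\uD801\uDC00"));
        int[] codePoints = new int[buffer.getLength()];
        int count = charUtils.toCodePoints(buffer.getBuffer(), buffer.getOffset(), buffer.getLength(), codePoints, 0);
        System.out.println(buffer.getLength() + " chars, " + count + " code points"); // 3 chars, 2 code points
    }
}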

View File

@@ -0,0 +1,137 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import java.util.HashSet;
import java.util.Set;
/**
* A class to match character code points.
*/
public interface CharMatcher {
public static class ByUnicodeCategory implements CharMatcher {
public static CharMatcher of(byte unicodeCategory) {
return new ByUnicodeCategory(unicodeCategory);
}
private final byte unicodeType;
ByUnicodeCategory(byte unicodeType) {
this.unicodeType = unicodeType;
}
@Override
public boolean isTokenChar(int c) {
return Character.getType(c) == unicodeType;
}
}
public enum Basic implements CharMatcher {
LETTER {
@Override
public boolean isTokenChar(int c) {
return Character.isLetter(c);
}
},
DIGIT {
@Override
public boolean isTokenChar(int c) {
return Character.isDigit(c);
}
},
WHITESPACE {
@Override
public boolean isTokenChar(int c) {
return Character.isWhitespace(c);
}
},
PUNCTUATION {
@Override
public boolean isTokenChar(int c) {
switch (Character.getType(c)) {
case Character.START_PUNCTUATION:
case Character.END_PUNCTUATION:
case Character.OTHER_PUNCTUATION:
case Character.CONNECTOR_PUNCTUATION:
case Character.DASH_PUNCTUATION:
case Character.INITIAL_QUOTE_PUNCTUATION:
case Character.FINAL_QUOTE_PUNCTUATION:
return true;
default:
return false;
}
}
},
SYMBOL {
@Override
public boolean isTokenChar(int c) {
switch (Character.getType(c)) {
case Character.CURRENCY_SYMBOL:
case Character.MATH_SYMBOL:
case Character.OTHER_SYMBOL:
return true;
default:
return false;
}
}
}
}
public final class Builder {
private final Set<CharMatcher> matchers;
Builder() {
matchers = new HashSet<CharMatcher>();
}
public Builder or(CharMatcher matcher) {
matchers.add(matcher);
return this;
}
public CharMatcher build() {
switch (matchers.size()) {
case 0:
return new CharMatcher() {
@Override
public boolean isTokenChar(int c) {
return false;
}
};
case 1:
return matchers.iterator().next();
default:
return new CharMatcher() {
@Override
public boolean isTokenChar(int c) {
for (CharMatcher matcher : matchers) {
if (matcher.isTokenChar(c)) {
return true;
}
}
return false;
}
};
}
}
}
/** Returns true if, and only if, the provided character matches this character class. */
public boolean isTokenChar(int c);
}
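For orientation, a minimal usage sketch; it assumes a caller in the same org.elasticsearch.index.analysis package, since the Builder constructor is package-private.

// Combine basic matchers with the Builder and test a few code points.
CharMatcher letterOrDigit = new CharMatcher.Builder()
        .or(CharMatcher.Basic.LETTER)
        .or(CharMatcher.Basic.DIGIT)
        .build();
boolean isLetter = letterOrDigit.isTokenChar('é');  // true: a letter
boolean isSymbol = letterOrDigit.isTokenChar('$');  // false: a currency symbol

// A matcher can also target a single java.lang.Character category.
CharMatcher currency = CharMatcher.ByUnicodeCategory.of(Character.CURRENCY_SYMBOL);
boolean matchesDollar = currency.isTokenChar('$');  // true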

View File

@ -20,9 +20,10 @@
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.ngram.*;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
@ -51,6 +52,19 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
@Override
public TokenStream create(TokenStream tokenStream) {
-return new EdgeNGramTokenFilter(tokenStream, side, minGram, maxGram);
+if (version.onOrAfter(Version.LUCENE_43)) {
+TokenStream result = tokenStream;
+// side=BACK is not supported anymore but applying ReverseStringFilter up-front and after the token filter has the same effect
+if (side == Side.BACK) {
+result = new ReverseStringFilter(version, result);
+}
+result = new XEdgeNGramTokenFilter(version, result, minGram, maxGram);
+if (side == Side.BACK) {
+result = new ReverseStringFilter(version, result);
+}
+return result;
+} else {
+return new EdgeNGramTokenFilter(tokenStream, side, minGram, maxGram);
+}
}
}
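A sketch of the side=BACK emulation above, filter chain only, with the upstream tokenizer assumed: reversing the stream before and after the front-edge filter reproduces what the removed BACK side used to emit.

// Back edge n-grams via reverse -> front edge n-grams -> reverse.
TokenStream stream = upstreamTokenizer;                               // assumed input
stream = new ReverseStringFilter(Version.LUCENE_43, stream);          // "brown" -> "nworb"
stream = new XEdgeNGramTokenFilter(Version.LUCENE_43, stream, 1, 2);  // "n", "nw"
stream = new ReverseStringFilter(Version.LUCENE_43, stream);          // "n", "wn"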

View File

@ -19,9 +19,13 @@
package org.elasticsearch.index.analysis;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.ngram.XEdgeNGramTokenizer;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
@ -30,6 +34,8 @@ import org.elasticsearch.index.settings.IndexSettings;
import java.io.Reader;
import static org.elasticsearch.index.analysis.NGramTokenizerFactory.parseTokenChars;
/**
*
*/
@ -41,16 +47,37 @@ public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {
private final EdgeNGramTokenizer.Side side;
private final CharMatcher matcher;
@Inject
public EdgeNGramTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
this.side = EdgeNGramTokenizer.Side.getSide(settings.get("side", EdgeNGramTokenizer.DEFAULT_SIDE.getLabel()));
this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
}
@Override
public Tokenizer create(Reader reader) {
-return new EdgeNGramTokenizer(reader, side, minGram, maxGram);
+if (version.onOrAfter(Version.LUCENE_43)) {
+if (side == EdgeNGramTokenizer.Side.BACK) {
+throw new ElasticSearchIllegalArgumentException("side=BACK is not supported anymore. Please fix your analysis chain or use an older compatibility version (<=4.2) but beware that it might cause highlighting bugs.");
+}
+// LUCENE MONITOR: this tokenizer is a copy from lucene trunk and should go away once we upgrade to lucene 4.4
+if (matcher == null) {
+return new XEdgeNGramTokenizer(version, reader, minGram, maxGram);
+} else {
+return new XEdgeNGramTokenizer(version, reader, minGram, maxGram) {
+@Override
+protected boolean isTokenChar(int chr) {
+return matcher.isTokenChar(chr);
+}
+};
+}
+} else {
+return new EdgeNGramTokenizer(reader, side, minGram, maxGram);
+}
}
}
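A hedged usage sketch, with index and indexSettings assumed to be set up as in the tests further down: on Lucene 4.3+ the factory pre-tokenizes on token_chars and rejects side=back outright.

Settings settings = ImmutableSettings.builder()
        .put("min_gram", 2).put("max_gram", 3)
        .put("token_chars", "letter,digit")
        .build();
Tokenizer tokenizer = new EdgeNGramTokenizerFactory(index, indexSettings, "autocomplete", settings)
        .create(new StringReader("Åbc déf"));
// Consuming it yields "Åb", "Åbc", "dé", "déf".

Settings back = ImmutableSettings.builder().put("side", "back").build();
new EdgeNGramTokenizerFactory(index, indexSettings, "ngr", back)
        .create(new StringReader(""));
// Throws ElasticSearchIllegalArgumentException on 4.3+.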

View File

@ -49,7 +49,7 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
@Override
public TokenStream create(TokenStream tokenStream) {
-if (this.version.onOrAfter(Version.LUCENE_42)) {
+if (this.version.onOrAfter(Version.LUCENE_43)) {
// LUCENE MONITOR: this token filter is a copy from lucene trunk and should go away once we upgrade to lucene 4.4
return new XNGramTokenFilter(version, tokenStream, minGram, maxGram);
}

View File

@ -19,10 +19,12 @@
package org.elasticsearch.index.analysis;
import com.google.common.collect.ImmutableMap;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.ngram.XNGramTokenizer;
import org.apache.lucene.util.Version;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
@ -30,6 +32,10 @@ import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.io.Reader;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.util.Locale;
import java.util.Map;
/**
*
@ -37,22 +43,75 @@ import java.io.Reader;
public class NGramTokenizerFactory extends AbstractTokenizerFactory {
private final int minGram;
private final int maxGram;
private final CharMatcher matcher;
static final Map<String, CharMatcher> MATCHERS;
static {
ImmutableMap.Builder<String, CharMatcher> builder = ImmutableMap.builder();
builder.put("letter", CharMatcher.Basic.LETTER);
builder.put("digit", CharMatcher.Basic.DIGIT);
builder.put("whitespace", CharMatcher.Basic.WHITESPACE);
builder.put("punctuation", CharMatcher.Basic.PUNCTUATION);
builder.put("symbol", CharMatcher.Basic.SYMBOL);
// Populate with unicode categories from java.lang.Character
for (Field field : Character.class.getFields()) {
if (!field.getName().startsWith("DIRECTIONALITY")
&& Modifier.isPublic(field.getModifiers())
&& Modifier.isStatic(field.getModifiers())
&& field.getType() == byte.class) {
try {
builder.put(field.getName().toLowerCase(Locale.ROOT), CharMatcher.ByUnicodeCategory.of(field.getByte(null)));
} catch (Exception e) {
// just ignore
continue;
}
}
}
MATCHERS = builder.build();
}
static CharMatcher parseTokenChars(String[] characterClasses) {
if (characterClasses == null || characterClasses.length == 0) {
return null;
}
CharMatcher.Builder builder = new CharMatcher.Builder();
for (String characterClass : characterClasses) {
characterClass = characterClass.toLowerCase(Locale.ROOT).trim();
CharMatcher matcher = MATCHERS.get(characterClass);
if (matcher == null) {
throw new ElasticSearchIllegalArgumentException("Unknown token type: '" + characterClass + "', must be one of " + MATCHERS.keySet());
}
builder.or(matcher);
}
return builder.build();
}
@Inject
public NGramTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
}
@Override
public Tokenizer create(Reader reader) {
-if (this.version.onOrAfter(Version.LUCENE_42)) {
-// LUCENE MONITOR: this token filter is a copy from lucene trunk and should go away once we upgrade to lucene 4.4
-return new XNGramTokenizer(version, reader, minGram, maxGram);
+if (this.version.onOrAfter(Version.LUCENE_43)) {
+// LUCENE MONITOR: this tokenizer is a copy from lucene trunk and should go away once we upgrade to lucene 4.4
+if (matcher == null) {
+return new XNGramTokenizer(version, reader, minGram, maxGram);
+} else {
+return new XNGramTokenizer(version, reader, minGram, maxGram) {
+@Override
+protected boolean isTokenChar(int chr) {
+return matcher.isTokenChar(chr);
+}
+};
+}
}
return new NGramTokenizer(reader, minGram, maxGram);
}
}
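A brief sketch of how token_chars values resolve through parseTokenChars (package-private, so a same-package caller is assumed): both the named classes above and lower-cased java.lang.Character category constants are accepted.

// "letter" and "digit" map to the CharMatcher.Basic constants.
CharMatcher letterOrDigit = NGramTokenizerFactory.parseTokenChars(new String[] {"letter", "digit"});
letterOrDigit.isTokenChar('9');   // true
letterOrDigit.isTokenChar('!');   // false

// Reflection over java.lang.Character also exposes category names such as
// "currency_symbol" or "dash_punctuation".
CharMatcher currency = NGramTokenizerFactory.parseTokenChars(new String[] {"currency_symbol"});
currency.isTokenChar('$');        // true

// Unknown names fail fast.
NGramTokenizerFactory.parseTokenChars(new String[] {"letters"});  // throws ElasticSearchIllegalArgumentException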

View File

@ -94,6 +94,7 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest {
.put("index.number_of_shards", 1)
.put("analysis.tokenizer.autocomplete.max_gram", 20)
.put("analysis.tokenizer.autocomplete.min_gram", 1)
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
.put("analysis.tokenizer.autocomplete.type", "nGram")
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
.putArray("analysis.filter.wordDelimiter.type_table",
@ -283,6 +284,7 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest {
.put("analysis.filter.my_ngram.type", "ngram")
.put("analysis.tokenizer.my_ngramt.max_gram", 20)
.put("analysis.tokenizer.my_ngramt.min_gram", 1)
.put("analysis.tokenizer.my_ngramt.token_chars", "letter,digit")
.put("analysis.tokenizer.my_ngramt.type", "ngram")
.put("analysis.analyzer.name_index_analyzer.tokenizer", "my_ngramt")
.put("analysis.analyzer.name2_index_analyzer.tokenizer", "whitespace")

View File

@ -0,0 +1,87 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.index.analysis;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
import org.testng.annotations.Test;
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import static org.testng.Assert.fail;
public class NGramTokenizerFactoryTests {
@Test
public void testParseTokenChars() {
final Index index = new Index("test");
final String name = "ngr";
final Settings indexSettings = ImmutableSettings.EMPTY;
for (String tokenChars : Arrays.asList("letters", "number", "DIRECTIONALITY_UNDEFINED")) {
final Settings settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", tokenChars).build();
try {
new NGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader(""));
fail();
} catch (ElasticSearchIllegalArgumentException expected) {
// OK
}
}
for (String tokenChars : Arrays.asList("letter", " digit ", "punctuation", "DIGIT", "CoNtRoL", "dash_punctuation")) {
final Settings settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", tokenChars).build();
new NGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader(""));
// no exception
}
}
@Test
public void testPreTokenization() throws IOException {
// Make sure that pretokenization works well, even when some of the token chars are supplementary characters
final Index index = new Index("test");
final String name = "ngr";
final Settings indexSettings = ImmutableSettings.EMPTY;
Settings settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
AnalysisTestsHelper.assertSimpleTSOutput(new NGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader("Åbc déf g\uD801\uDC00f ")),
new String[] {"Åb", "Åbc", "bc", "", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"});
settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
AnalysisTestsHelper.assertSimpleTSOutput(new NGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader(" a!$ 9")),
new String[] {" a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9"});
}
@Test
public void testPreTokenizationEdge() throws IOException {
// Make sure that pretokenization works well, even when some of the token chars are supplementary characters
final Index index = new Index("test");
final String name = "ngr";
final Settings indexSettings = ImmutableSettings.EMPTY;
Settings settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
AnalysisTestsHelper.assertSimpleTSOutput(new EdgeNGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader("Åbc déf g\uD801\uDC00f ")),
new String[] {"Åb", "Åbc", "", "déf", "g\uD801\uDC00", "g\uD801\uDC00f"});
settings = ImmutableSettings.builder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
AnalysisTestsHelper.assertSimpleTSOutput(new EdgeNGramTokenizerFactory(index, indexSettings, name, settings).create(new StringReader(" a!$ 9")),
new String[] {" a", " a!"});
}
}
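For readers checking the expected arrays above, a hypothetical helper (plain JDK, not part of the patch) that enumerates n-grams by Unicode code point the way the new tokenizers do; this is why "g\uD801\uDC00f" contributes exactly three grams for sizes 2 to 3.

import java.util.ArrayList;
import java.util.List;

class CodePointNGrams {
    // N-grams counted in code points, so a surrogate pair such as
    // \uD801\uDC00 (U+10400) is never split in half.
    static List<String> ngrams(String s, int minGram, int maxGram) {
        final int numCodePoints = s.codePointCount(0, s.length());
        final List<String> grams = new ArrayList<String>();
        for (int start = 0; start < numCodePoints; start++) {
            for (int len = minGram; len <= maxGram && start + len <= numCodePoints; len++) {
                final int from = s.offsetByCodePoints(0, start);
                final int to = s.offsetByCodePoints(from, len);
                grams.add(s.substring(from, to));
            }
        }
        // "g\uD801\uDC00f" -> ["g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"]
        return grams;
    }
}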