LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters

Robert Muir 2018-06-04 21:24:20 -04:00
parent 59087d148a
commit 2c1ab31b4e
13 changed files with 392 additions and 135 deletions

View File: CHANGES.txt

@ -203,6 +203,9 @@ New Features
now use to also take pending deletes into account which ensures that all file
generations per segment always go forward. (Simon Willnauer)
* LUCENE-7690: Add preserveOriginal option to the NGram and EdgeNGram filters.
(Ingomar Wesp, Shawn Heisey via Robert Muir)
* LUCENE-8335: Enforce soft-deletes field up-front. Soft deletes field must be marked
as such once it's introduced and can't be changed after the fact.
(Nhat Nguyen via Simon Willnauer)
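
A quick sketch of what the preserveOriginal option added by the LUCENE-7690 entry above does (illustration only, not part of the patch; assumed imports from org.apache.lucene.analysis, expected terms taken from EdgeNGramTokenFilterTest#testPreserveOriginal further down in this commit):

    // Assumed imports: org.apache.lucene.analysis.Tokenizer, org.apache.lucene.analysis.TokenStream,
    // org.apache.lucene.analysis.core.WhitespaceTokenizer,
    // org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter, java.io.StringReader
    Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("a bcd efghi jk"));
    // minGram=2, maxGram=3, preserveOriginal=true: terms outside the 2..3 range are kept as-is
    TokenStream ts = new EdgeNGramTokenFilter(source, 2, 3, true);
    // Expected terms: "a", "bc", "bcd", "ef", "efg", "efghi", "jk"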

View File: EdgeNGramFilterFactory.java

@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/&gt;
* &lt;filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class EdgeNGramFilterFactory extends TokenFilterFactory {
private final int maxGramSize;
private final int minGramSize;
private final boolean preserveOriginal;
/** Creates a new EdgeNGramFilterFactory */
public EdgeNGramFilterFactory(Map<String, String> args) {
super(args);
minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
preserveOriginal = getBoolean(args, "preserveOriginal", EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -49,6 +51,6 @@ public class EdgeNGramFilterFactory extends TokenFilterFactory {
@Override
public TokenFilter create(TokenStream input) {
return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize);
return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
}
}
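
For reference, a hedged sketch of driving this factory programmatically rather than from a Solr schema; the argument map mirrors the attributes shown in the javadoc above, and the surrounding setup is an assumption, not part of the patch:

    // Assumed imports: java.util.HashMap, java.util.Map, java.io.StringReader,
    // org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.Tokenizer,
    // org.apache.lucene.analysis.core.WhitespaceTokenizer
    Map<String, String> args = new HashMap<>();
    args.put("minGramSize", "2");
    args.put("maxGramSize", "3");
    args.put("preserveOriginal", "true");
    EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(args); // the constructor consumes the map
    Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("a bcd efghi jk"));
    TokenStream ts = factory.create(source); // same stream as configuring the filter in a schema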

View File: EdgeNGramTokenFilter.java

@ -32,29 +32,46 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
* supplementary characters.
*/
public final class EdgeNGramTokenFilter extends TokenFilter {
/**
* @deprecated since 7.4 - this value will be required.
*/
@Deprecated
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
/**
* @deprecated since 7.4 - this value will be required.
*/
@Deprecated
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
private final int minGram;
private final int maxGram;
private final boolean preserveOriginal;
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curTermCodePointCount;
private int curGramSize;
private int savePosIncr;
private int curPosIncr;
private State state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
* Creates an EdgeNGramTokenFilter that, for a given input term, produces all
* edge n-grams with lengths &gt;= minGram and &lt;= maxGram. Will
* optionally preserve the original term when its length is outside of the
* defined range.
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
* @param minGram the minimum length of the generated n-grams
* @param maxGram the maximum length of the generated n-grams
* @param preserveOriginal Whether or not to keep the original term when it
* is outside the min/max size range.
*/
public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
public EdgeNGramTokenFilter(
TokenStream input, int minGram, int maxGram, boolean preserveOriginal) {
super(input);
if (minGram < 1) {
@ -67,6 +84,39 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
this.minGram = minGram;
this.maxGram = maxGram;
this.preserveOriginal = preserveOriginal;
}
/**
* Creates an EdgeNGramTokenFilter that produces edge n-grams of the given
* size.
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param gramSize the n-gram size to generate.
*/
public EdgeNGramTokenFilter(TokenStream input, int gramSize) {
this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL);
}
/**
* Creates an EdgeNGramTokenFilter that, for a given input term, produces all
* edge n-grams with lengths &gt;= minGram and &lt;= maxGram.
*
* <p>
* Behaves the same as
* {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)
* NGramTokenFilter(input, minGram, maxGram, false)}
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the minimum length of the generated n-grams
* @param maxGram the maximum length of the generated n-grams
*
* @deprecated since 7.4. Use
* {@link #EdgeNGramTokenFilter(TokenStream, int, int, boolean)} instead.
*/
@Deprecated
public EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL);
}
@Override
@ -75,32 +125,46 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
if (curTermBuffer == null) {
if (!input.incrementToken()) {
return false;
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
}
state = captureState();
savePosIncr += posIncrAtt.getPositionIncrement();
curTermLength = termAtt.length();
curTermCodePointCount = Character.codePointCount(termAtt, 0, curTermLength);
curPosIncr += posIncrAtt.getPositionIncrement();
if (preserveOriginal && curTermCodePointCount < minGram) {
// Token is shorter than minGram, but we'd still like to keep it.
posIncrAtt.setPositionIncrement(curPosIncr);
curPosIncr = 0;
return true;
}
curTermBuffer = termAtt.buffer().clone();
curGramSize = minGram;
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
if (curGramSize <= curTermCodePointCount) {
if (curGramSize <= maxGram) { // curGramSize is between minGram and maxGram
restoreState(state);
// first ngram gets increment, others don't
if (curGramSize == minGram) {
posIncrAtt.setPositionIncrement(savePosIncr);
savePosIncr = 0;
} else {
posIncrAtt.setPositionIncrement(0);
}
posIncrAtt.setPositionIncrement(curPosIncr);
curPosIncr = 0;
final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
return true;
}
else if (preserveOriginal) {
// Token is longer than maxGram, but we'd still like to keep it.
restoreState(state);
posIncrAtt.setPositionIncrement(0);
termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
curTermBuffer = null;
return true;
}
}
// Done with this input token, get next token on the next iteration.
curTermBuffer = null;
}
}
@ -109,6 +173,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
public void reset() throws IOException {
super.reset();
curTermBuffer = null;
savePosIncr = 0;
curPosIncr = 0;
}
}
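
To make the two preserveOriginal branches above concrete, a small sketch (assumed imports; KeywordTokenizer is used so the whole input is a single term). The output follows from the incrementToken() logic shown above and matches the expectations in EdgeNGramTokenFilterTest:

    // Assumed imports: org.apache.lucene.analysis.Tokenizer, org.apache.lucene.analysis.TokenStream,
    // org.apache.lucene.analysis.core.KeywordTokenizer,
    // org.apache.lucene.analysis.tokenattributes.CharTermAttribute, java.io.StringReader
    Tokenizer kw = new KeywordTokenizer();
    kw.setReader(new StringReader("abcde"));
    TokenStream ts = new EdgeNGramTokenFilter(kw, 1, 3, true);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term); // prints: a, ab, abc, abcde ("abcde" kept because it is longer than maxGram)
    }
    ts.end();
    ts.close();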

View File: NGramFilterFactory.java

@ -29,19 +29,21 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/&gt;
* &lt;filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2" preserveOriginal="true"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class NGramFilterFactory extends TokenFilterFactory {
private final int maxGramSize;
private final int minGramSize;
private final boolean preserveOriginal;
/** Creates a new NGramFilterFactory */
public NGramFilterFactory(Map<String, String> args) {
super(args);
minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
preserveOriginal = getBoolean(args, "preserveOriginal", NGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -49,6 +51,6 @@ public class NGramFilterFactory extends TokenFilterFactory {
@Override
public TokenFilter create(TokenStream input) {
return new NGramTokenFilter(input, minGramSize, maxGramSize);
return new NGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
}
}
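
A hedged alternative to the Solr schema snippet in the javadoc above: wiring the same factory through CustomAnalyzer. The builder calls exist in Lucene's analysis-common module; the SPI name "nGram" is the usual derived factory name and is an assumption here, not something this patch touches:

    // Assumed imports: org.apache.lucene.analysis.Analyzer, org.apache.lucene.analysis.custom.CustomAnalyzer
    Analyzer analyzer = CustomAnalyzer.builder()   // build() may throw IOException
        .withTokenizer("whitespace")
        .addTokenFilter("nGram",
            "minGramSize", "1",
            "maxGramSize", "2",
            "preserveOriginal", "true")
        .build();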

View File: NGramTokenFilter.java

@ -21,7 +21,6 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@ -40,30 +39,52 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
* override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
*/
public final class NGramTokenFilter extends TokenFilter {
/**
* @deprecated since 7.4 - this value will be required.
*/
@Deprecated
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
private final int minGram, maxGram;
/**
* @deprecated since 7.4 - this value will be required.
*/
@Deprecated
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
private final int minGram;
private final int maxGram;
private final boolean preserveOriginal;
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curTermCodePointCount;
private int curGramSize;
private int curPos;
private int curPosInc;
private int curPosIncr;
private State state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Creates NGramTokenFilter with given min and max n-grams.
* Creates an NGramTokenFilter that, for a given input term, produces all
* contained n-grams with lengths &gt;= minGram and &lt;= maxGram. Will
* optionally preserve the original term when its length is outside of the
* defined range.
*
* Note: Care must be taken when choosing minGram and maxGram; depending
* on the input token size, this filter potentially produces a huge number
* of terms.
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
* @param minGram the minimum length of the generated n-grams
* @param maxGram the maximum length of the generated n-grams
* @param preserveOriginal Whether or not to keep the original term when it
* is shorter than minGram or longer than maxGram
*/
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
public NGramTokenFilter(TokenStream input, int minGram, int maxGram, boolean preserveOriginal) {
super(input);
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@ -72,50 +93,106 @@ public final class NGramTokenFilter extends TokenFilter {
}
this.minGram = minGram;
this.maxGram = maxGram;
this.preserveOriginal = preserveOriginal;
}
posIncAtt = addAttribute(PositionIncrementAttribute.class);
/**
* Creates an NGramTokenFilter that produces n-grams of the indicated size.
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param gramSize the size of n-grams to generate.
*/
public NGramTokenFilter(TokenStream input, int gramSize) {
this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL);
}
/**
* Creates an NGramTokenFilter that, for a given input term, produces all
* contained n-grams with lengths &gt;= minGram and &lt;= maxGram.
*
* <p>
* Behaves the same as
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)
* NGramTokenFilter(input, minGram, maxGram, false)}
*
* @param input {@link TokenStream} holding the input to be tokenized
* @param minGram the minimum length of the generated n-grams
* @param maxGram the maximum length of the generated n-grams
*
* @deprecated since 7.4. Use
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead.
*/
@Deprecated
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
this(input, minGram, maxGram, DEFAULT_PRESERVE_ORIGINAL);
}
/**
* Creates NGramTokenFilter with default min and max n-grams.
*
* <p>
* Behaves the same as
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)
* NGramTokenFilter(input, 1, 2, false)}
*
* @param input {@link TokenStream} holding the input to be tokenized
* @deprecated since 7.4. Use
* {@link #NGramTokenFilter(TokenStream, int, int, boolean)} instead.
*/
@Deprecated
public NGramTokenFilter(TokenStream input) {
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE, DEFAULT_PRESERVE_ORIGINAL);
}
/** Returns true if there is a next token in the stream, or false at EOS. */
@Override
public final boolean incrementToken() throws IOException {
while (true) {
if (curTermBuffer == null) {
if (!input.incrementToken()) {
return false;
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
state = captureState();
}
state = captureState();
curTermLength = termAtt.length();
curTermCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curPosIncr += posIncrAtt.getPositionIncrement();
curPos = 0;
if (preserveOriginal && curTermCodePointCount < minGram) {
// Token is shorter than minGram, but we'd still like to keep it.
posIncrAtt.setPositionIncrement(curPosIncr);
curPosIncr = 0;
return true;
}
if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
curTermBuffer = termAtt.buffer().clone();
curGramSize = minGram;
}
if (curGramSize > maxGram || (curPos + curGramSize) > curTermCodePointCount) {
++curPos;
curGramSize = minGram;
}
if ((curPos + curGramSize) <= curCodePointCount) {
if ((curPos + curGramSize) <= curTermCodePointCount) {
restoreState(state);
final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;
posIncrAtt.setPositionIncrement(curPosIncr);
curPosIncr = 0;
curGramSize++;
return true;
}
else if (preserveOriginal && curTermCodePointCount > maxGram) {
// Token is longer than maxGram, but we'd still like to keep it.
restoreState(state);
posIncrAtt.setPositionIncrement(0);
termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
curTermBuffer = null;
return true;
}
// Done with this input token, get next token on next iteration.
curTermBuffer = null;
}
}
@ -124,5 +201,6 @@ public final class NGramTokenFilter extends TokenFilter {
public void reset() throws IOException {
super.reset();
curTermBuffer = null;
curPosIncr = 0;
}
}
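
Pulling the pieces together, a self-contained sketch (class and field names are illustrative only) that consumes the filter through the standard reset/incrementToken/end/close contract and shows where the preserved originals land relative to the n-grams:

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.ngram.NGramTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

    public class NGramPreserveOriginalDemo {
      public static void main(String[] args) throws IOException {
        Tokenizer source = new WhitespaceTokenizer();
        source.setReader(new StringReader("a bcd efghi jk"));
        try (TokenStream ts = new NGramTokenFilter(source, 2, 3, true)) {
          CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
          PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            // "a" and "efghi" fall outside the 2..3 range but are emitted anyway;
            // all grams of one input term share its position (increment 0 after the first).
            System.out.println(term + " posIncr=" + posIncr.getPositionIncrement());
          }
          ts.end();
        }
      }
    }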

View File: TestBugInSomething.java

@ -236,7 +236,7 @@ public class TestBugInSomething extends BaseTokenStreamTestCase {
//TokenStream stream = new SopTokenFilter(tokenizer);
TokenStream stream = new ShingleFilter(tokenizer, 5);
//stream = new SopTokenFilter(stream);
stream = new NGramTokenFilter(stream, 55, 83);
stream = new NGramTokenFilter(stream, 55, 83, false);
//stream = new SopTokenFilter(stream);
return new TokenStreamComponents(tokenizer, stream);
}

View File: EdgeNGramTokenFilterTest.java

@ -50,49 +50,73 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testInvalidInput() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new EdgeNGramTokenFilter(input, 0, 0);
new EdgeNGramTokenFilter(input, 0, 0, false);
});
}
public void testInvalidInput2() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new EdgeNGramTokenFilter(input, 2, 1);
new EdgeNGramTokenFilter(input, 2, 1, false);
});
}
public void testInvalidInput3() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new EdgeNGramTokenFilter(input, -1, 2);
new EdgeNGramTokenFilter(input, -1, 2, false);
});
}
public void testFrontUnigram() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 1, false);
assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{5});
}
public void testOversizedNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, false);
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
}
public void testOversizedNgramsPreserveOriginal() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 6, 6, true);
assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5});
}
public void testPreserveOriginal() throws Exception {
final String inputString = "a bcd efghi jk";
{ // preserveOriginal = false
TokenStream ts = whitespaceMockTokenizer(inputString);
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false);
assertTokenStreamContents(filter,
new String[] { "bc", "bcd", "ef", "efg", "jk" },
new int[] { 2, 2, 6, 6, 12 },
new int[] { 5, 5, 11, 11, 14 },
new int[] { 2, 0, 1, 0, 1 });
}
{ // preserveOriginal = true
TokenStream ts = whitespaceMockTokenizer(inputString);
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, true);
assertTokenStreamContents(filter,
new String[] { "a", "bc", "bcd", "ef", "efg", "efghi", "jk" },
new int[] { 0, 2, 2, 6, 6, 6, 12 },
new int[] { 1, 5, 5, 11, 11, 11, 14 },
new int[] { 1, 1, 0, 1, 0, 0, 1 });
}
}
public void testFrontRangeOfNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 1, 3, false);
assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
}
public void testFilterPositions() throws Exception {
TokenStream ts = whitespaceMockTokenizer("abcde vwxyz");
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(ts, 1, 3, false);
assertTokenStreamContents(tokenizer,
new String[]{"a","ab","abc","v","vw","vwx"},
new int[]{0,0,0,6,6,6},
new int[]{5,5,5,11,11,11},
null,
new int[]{1,0,0,1,0,0},
null,
null,
false);
new String[] {"a","ab","abc","v","vw","vwx"},
new int[] {0, 0, 0, 6, 6, 6},
new int[] {5, 5, 5, 11, 11, 11});
}
private static class PositionFilter extends TokenFilter {
@ -128,7 +152,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testFirstTokenPositionIncrement() throws Exception {
TokenStream ts = whitespaceMockTokenizer("a abc");
ts = new PositionFilter(ts); // All but first token will get 0 position increment
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3);
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(ts, 2, 3, false);
// The first token "a" will not be output, since it's smaller than the minGram size of 2.
// The second token on input to EdgeNGramTokenFilter will have position increment of 0,
// which should be increased to 1, since this is the first output token in the stream.
@ -142,14 +166,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testSmallTokenInStream() throws Exception {
input = whitespaceMockTokenizer("abc de fgh");
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3);
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, 3, 3, false);
assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
public void testReset() throws Exception {
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("abcde"));
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3);
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3, false);
assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
tokenizer.setReader(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
@ -160,13 +184,14 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
for (int i = 0; i < 10; i++) {
final int min = TestUtil.nextInt(random(), 2, 10);
final int max = TestUtil.nextInt(random(), min, 20);
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
new EdgeNGramTokenFilter(tokenizer, min, max));
new EdgeNGramTokenFilter(tokenizer, min, max, preserveOriginal));
}
};
checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);
@ -181,7 +206,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer,
new EdgeNGramTokenFilter(tokenizer, 2, 15));
new EdgeNGramTokenFilter(tokenizer, 2, 15, false));
}
};
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
@ -192,7 +217,7 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
TokenStream tk = new LetterTokenizer();
((Tokenizer)tk).setReader(new StringReader("abc d efgh ij klmno p q"));
tk = new ShingleFilter(tk);
tk = new EdgeNGramTokenFilter(tk, 7, 10);
tk = new EdgeNGramTokenFilter(tk, 7, 10, false);
assertTokenStreamContents(tk,
new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
new int[] { 6,11,11,14 },
@ -204,23 +229,44 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testSupplementaryCharacters() throws IOException {
for (int i = 0; i < 20; i++) {
final String s = TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = TestUtil.nextInt(random(), 1, 3);
final int maxGram = TestUtil.nextInt(random(), minGram, 10);
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
TokenStream tk = new KeywordTokenizer();
((Tokenizer)tk).setReader(new StringReader(s));
tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
tk = new EdgeNGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
if (codePointCount < minGram && preserveOriginal) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int end = Character.offsetByCodePoints(s, 0, i);
assertEquals(s, termAtt.toString());
}
for (int j = minGram; j <= Math.min(codePointCount, maxGram); j++) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int end = Character.offsetByCodePoints(s, 0, j);
assertEquals(s.substring(0, end), termAtt.toString());
}
if (codePointCount > maxGram && preserveOriginal) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
assertEquals(s, termAtt.toString());
}
assertFalse(tk.incrementToken());
tk.close();
}
}
}

View File: NGramTokenFilterTest.java

@ -48,28 +48,28 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
public void testInvalidInput() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new NGramTokenFilter(input, 2, 1);
new NGramTokenFilter(input, 2, 1, false);
});
}
public void testInvalidInput2() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new NGramTokenFilter(input, 0, 1);
new NGramTokenFilter(input, 0, 1, false);
});
}
public void testUnigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1, false);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
}
public void testBigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2, false);
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,0,0,0}, new int[]{5,5,5,5}, new int[]{1,0,0,0});
}
public void testNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false);
assertTokenStreamContents(filter,
new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
@ -81,7 +81,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testNgramsNoIncrement() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3, false);
assertTokenStreamContents(filter,
new String[]{"a","ab","abc","b","bc","bcd","c","cd","cde","d","de","e"},
new int[]{0,0,0,0,0,0,0,0,0,0,0,0},
@ -93,25 +93,61 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testOversizedNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7, false);
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
}
public void testOversizedNgramsPreserveOriginal() throws Exception {
NGramTokenFilter tokenizer = new NGramTokenFilter(input, 6, 6, true);
assertTokenStreamContents(tokenizer, new String[] {"abcde"}, new int[] {0}, new int[] {5});
}
public void testSmallTokenInStream() throws Exception {
input = whitespaceMockTokenizer("abc de fgh");
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, false);
assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10}, new int[] {1, 2});
}
public void testSmallTokenInStreamPreserveOriginal() throws Exception {
input = whitespaceMockTokenizer("abc de fgh");
NGramTokenFilter tokenizer = new NGramTokenFilter(input, 3, 3, true);
assertTokenStreamContents(tokenizer, new String[]{"abc","de","fgh"}, new int[]{0,4,7}, new int[]{3,6,10}, new int[] {1, 1, 1});
}
public void testReset() throws Exception {
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("abcde"));
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1, false);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
tokenizer.setReader(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
}
public void testKeepShortTermKeepLongTerm() throws Exception {
final String inputString = "a bcd efghi jk";
{ // preserveOriginal = false
TokenStream ts = whitespaceMockTokenizer(inputString);
NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, false);
assertTokenStreamContents(filter,
new String[] { "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "jk" },
new int[] { 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 12 },
new int[] { 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 14 },
new int[] { 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 });
}
{ // preserveOriginal = true
TokenStream ts = whitespaceMockTokenizer(inputString);
NGramTokenFilter filter = new NGramTokenFilter(ts, 2, 3, true);
assertTokenStreamContents(filter,
new String[] { "a", "bc", "bcd", "cd", "ef", "efg", "fg", "fgh", "gh", "ghi", "hi", "efghi", "jk" },
new int[] { 0, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 12 },
new int[] { 1, 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 14 },
new int[] { 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 });
}
}
// LUCENE-3642
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
@ -122,7 +158,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
filters = new NGramTokenFilter(filters, 2, 2);
filters = new NGramTokenFilter(filters, 2, 2, false);
return new TokenStreamComponents(tokenizer, filters);
}
};
@ -139,12 +175,14 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
for (int i = 0; i < 10; i++) {
final int min = TestUtil.nextInt(random(), 2, 10);
final int max = TestUtil.nextInt(random(), min, 20);
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
new NGramTokenFilter(tokenizer, min, max));
new NGramTokenFilter(tokenizer, min, max, preserveOriginal));
}
};
checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
@ -159,7 +197,7 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer,
new NGramTokenFilter(tokenizer, 2, 15));
new NGramTokenFilter(tokenizer, 2, 15, false));
}
};
checkAnalysisConsistency(random, a, random.nextBoolean(), "");
@ -167,16 +205,27 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testSupplementaryCharacters() throws IOException {
for (int i = 0; i < 20; i++) {
final String s = TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = TestUtil.nextInt(random(), 1, 3);
final int maxGram = TestUtil.nextInt(random(), minGram, 10);
final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
TokenStream tk = new KeywordTokenizer();
((Tokenizer)tk).setReader(new StringReader(s));
tk = new NGramTokenFilter(tk, minGram, maxGram);
tk = new NGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
if (codePointCount < minGram && preserveOriginal) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
assertEquals(s, termAtt.toString());
}
for (int start = 0; start < codePointCount; ++start) {
for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
assertTrue(tk.incrementToken());
@ -187,7 +236,16 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
}
}
assertFalse(tk.incrementToken());
if (codePointCount > maxGram && preserveOriginal) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
assertEquals(s, termAtt.toString());
}
assertFalse(tk.incrementToken());
tk.close();
}
}
}

View File: TestNGramFilters.java

@ -56,12 +56,14 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
}
/**
* Test the NGramFilterFactory
* Test the NGramFilterFactory with old defaults
*/
public void testNGramFilter() throws Exception {
Reader reader = new StringReader("test");
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("NGram").create(stream);
stream = tokenFilterFactory("NGram",
"minGramSize", "1",
"maxGramSize", "2").create(stream);
assertTokenStreamContents(stream,
new String[] { "t", "te", "e", "es", "s", "st", "t" });
}
@ -126,12 +128,13 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
}
/**
* Test EdgeNGramFilterFactory
* Test EdgeNGramFilterFactory with old defaults
*/
public void testEdgeNGramFilter() throws Exception {
Reader reader = new StringReader("test");
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("EdgeNGram").create(stream);
stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1",
"maxGramSize", "1").create(stream);
assertTokenStreamContents(stream,
new String[] { "t" });
}
@ -173,7 +176,8 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
IllegalArgumentException expected = null;
expected = expectThrows(IllegalArgumentException.class, () -> {
tokenizerFactory("NGram", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
@ -184,12 +188,12 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
assertTrue(expected.getMessage().contains("Unknown parameters"));
expected = expectThrows(IllegalArgumentException.class, () -> {
tokenFilterFactory("NGram", "bogusArg", "bogusValue");
tokenFilterFactory("NGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
expected = expectThrows(IllegalArgumentException.class, () -> {
tokenFilterFactory("EdgeNGram", "bogusArg", "bogusValue");
tokenFilterFactory("EdgeNGram", "minGramSize", "2", "maxGramSize", "5", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
}

View File: BM25NBClassifierTest.java

@ -87,7 +87,7 @@ public class BM25NBClassifierTest extends ClassificationTestBase<BytesRef> {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
}
}

View File: CachingNaiveBayesClassifierTest.java

@ -86,7 +86,7 @@ public class CachingNaiveBayesClassifierTest extends ClassificationTestBase<Byte
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
}
}

View File: SimpleNaiveBayesClassifierTest.java

@ -89,7 +89,7 @@ public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase<Bytes
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20, false)));
}
}

View File: AnalyzingInfixSuggester.java

@ -359,7 +359,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
: "no need \"textgrams\" when minPrefixChars="+minPrefixChars;
if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) {
// TODO: should use an EdgeNGramTokenFilterFactory here
TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars);
TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false);
return new TokenStreamComponents(components.getTokenizer(), filter);
} else {
return components;