Updated Analyzing/FuzzySuggester from Lucene trunk

* Minor alignments (like moving a setter into the ctor)
* FuzzySuggester has a unicode aware flag, which is now exposed in the fuzzy completion request parameters
* Made the XAnalyzingSuggester flags (PAYLOAD_SEP, END_BYTE, SEP_LABEL) be written into the postings format, so we can retain backwards compatibility
* The above change also implies that these flags can be set per instantiated XAnalyzingSuggester
* CompletionPostingsFormatTest now uses a randomly chosen provider for writing data, to check for backwards compatibility
Alexander Reelsen 2013-11-25 18:22:34 +01:00
parent 9f5d01ca4c
commit bf74f49fdd
11 changed files with 605 additions and 80 deletions
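
The backwards compatibility hinges on the marker values (sepLabel, payloadSep, endByte) becoming constructor arguments instead of hard-coded constants, so a reader can pass along whatever values were stored in the postings format. A minimal sketch, assuming an already-loaded fst plus index/query analyzers (this is not the shipped wiring), of instantiating a suggester with the old v1 marker values:

    // Sketch only: fst, indexAnalyzer and queryAnalyzer are assumed to exist.
    // Indices written with codec version 1 used SEP_LABEL 0xFF; newer ones use
    // '\u001F', so the values read back from the postings format are passed through.
    XAnalyzingSuggester suggester = new XAnalyzingSuggester(
        indexAnalyzer, queryAnalyzer,
        XAnalyzingSuggester.PRESERVE_SEP, // options
        256,                              // maxSurfaceFormsPerAnalyzedForm
        -1,                               // maxGraphExpansions (no limit)
        true,                             // preservePositionIncrements
        fst, false, 1,                    // fst, hasPayloads, maxAnalyzedPathsForOneInput
        0xFF,                             // sepLabel as written by the v1 codec
        '\u001f',                         // payloadSep
        0x0);                             // endByte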

View File

@ -218,6 +218,11 @@ The following parameters are supported:
Minimum length of the input, which is not
checked for fuzzy alternatives, defaults to `1`
`unicode_aware`::
Sets all measurements (like edit distance,
transpositions and lengths) in unicode code points
(actual letters) instead of bytes. Defaults to `false`.
NOTE: If you want to stick with the default values, but
still use fuzzy, you can either use `fuzzy: {}`
or `fuzzy: true`.
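
For illustration, the flag can also be set through the Java API builder added in this commit; a minimal sketch, assuming the integration-test context (client(), the index name and the field name are placeholders):

    // Sketch based on the new CompletionSuggestionFuzzyBuilder; names are hypothetical.
    CompletionSuggestionFuzzyBuilder fuzzyBuilder = new CompletionSuggestionFuzzyBuilder("my-suggestion")
        .field("suggest")       // completion field to query
        .text("öööи")           // lookup text containing multi-byte characters
        .size(10)
        .setUnicodeAware(true); // measure edits in unicode code points, not UTF-8 bytes
    SuggestResponse response = client().prepareSuggest("myindex")
        .addSuggestion(fuzzyBuilder).execute().actionGet();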

View File

@ -26,6 +26,8 @@ import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.Sort;
import org.apache.lucene.store.*;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.*;
import org.apache.lucene.util.automaton.*;
import org.apache.lucene.util.fst.*;
@ -34,10 +36,7 @@ import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.Util.MinResult;
import org.elasticsearch.common.collect.HppcMaps;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.*;
import java.util.*;
/**
@ -53,8 +52,9 @@ import java.util.*;
* then the partial text "ghost chr..." could see the
* suggestion "The Ghost of Christmas Past". Note that
* position increments MUST NOT be preserved for this example
* to work, so you should call
* {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}.
* to work, so you should call the constructor with the
* <code>preservePositionIncrements</code> parameter set to
* <code>false</code>.
*
* <p>
* If SynonymFilter is used to map wifi and wireless network to
@ -124,24 +124,24 @@ public class XAnalyzingSuggester extends Lookup {
private final boolean preserveSep;
/** Include this flag in the options parameter to {@link
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)} to always
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)} to always
* return the exact match first, regardless of score. This
* has no performance impact but could result in
* low-quality suggestions. */
public static final int EXACT_FIRST = 1;
/** Include this flag in the options parameter to {@link
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)} to preserve
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)} to preserve
* token separators when matching. */
public static final int PRESERVE_SEP = 2;
/** Represents the separation between tokens, if
* PRESERVE_SEP was specified */
private static final int SEP_LABEL = 0xFF;
public static final int SEP_LABEL = '\u001F';
/** Marks end of the analyzed input and start of dedup
* byte. */
private static final int END_BYTE = 0x0;
public static final int END_BYTE = 0x0;
/** Maximum number of dup surface forms (different surface
* forms for the same analyzed form). */
@ -160,27 +160,31 @@ public class XAnalyzingSuggester extends Lookup {
private boolean hasPayloads;
private static final int PAYLOAD_SEP = '\u001f';
private final int sepLabel;
private final int payloadSep;
private final int endByte;
public static final int PAYLOAD_SEP = '\u001f';
/** Whether position holes should appear in the automaton. */
private boolean preservePositionIncrements;
/**
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
* PRESERVE_SEP, 256, -1)}
*/
public XAnalyzingSuggester(Analyzer analyzer) {
this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, null, false, 0);
this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
}
/**
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)
* AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST |
* PRESERVE_SEP, 256, -1)}
*/
public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, null, false, 0);
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
}
/**
@ -199,8 +203,9 @@ public class XAnalyzingSuggester extends Lookup {
* to expand from the analyzed form. Set this to -1 for
* no limit.
*/
public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions
, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput) {
public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
int sepLabel, int payloadSep, int endByte) {
// SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput
this.indexAnalyzer = indexAnalyzer;
this.queryAnalyzer = queryAnalyzer;
@ -226,16 +231,13 @@ public class XAnalyzingSuggester extends Lookup {
}
this.maxGraphExpansions = maxGraphExpansions;
this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
this.preservePositionIncrements = true;
}
/** Whether to take position holes (position increment > 1) into account when
* building the automaton, <code>true</code> by default. */
public void setPreservePositionIncrements(boolean preservePositionIncrements) {
this.preservePositionIncrements = preservePositionIncrements;
this.sepLabel = sepLabel;
this.payloadSep = payloadSep;
this.endByte = endByte;
}
/** Returns byte size of the underlying FST. */
public long sizeInBytes() {
return fst == null ? 0 : fst.sizeInBytes();
}
@ -251,7 +253,7 @@ public class XAnalyzingSuggester extends Lookup {
// Replaces SEP with epsilon or remaps them if
// we were asked to preserve them:
private static void replaceSep(Automaton a, boolean preserveSep) {
private static void replaceSep(Automaton a, boolean preserveSep, int replaceSep) {
State[] states = a.getNumberedStates();
@ -265,7 +267,7 @@ public class XAnalyzingSuggester extends Lookup {
if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {
if (preserveSep) {
// Remap to SEP_LABEL:
newTransitions.add(new Transition(SEP_LABEL, t.getDest()));
newTransitions.add(new Transition(replaceSep, t.getDest()));
} else {
copyDestTransitions(state, t.getDest(), newTransitions);
a.setDeterministic(false);
@ -289,21 +291,30 @@ public class XAnalyzingSuggester extends Lookup {
}
}
protected Automaton convertAutomaton(Automaton a) {
return a;
}
/** Just escapes the 0xff byte (which we still use for SEP). */
private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
final BytesRef spare = new BytesRef();
private char sepLabel;
public EscapingTokenStreamToAutomaton(char sepLabel) {
this.sepLabel = sepLabel;
}
@Override
protected BytesRef changeToken(BytesRef in) {
int upto = 0;
for(int i=0;i<in.length;i++) {
byte b = in.bytes[in.offset+i];
if (b == (byte) SEP_LABEL) {
if (b == (byte) sepLabel) {
if (spare.bytes.length == upto) {
spare.grow(upto+2);
}
spare.bytes[upto++] = (byte) SEP_LABEL;
spare.bytes[upto++] = (byte) sepLabel;
spare.bytes[upto++] = b;
} else {
if (spare.bytes.length == upto) {
@ -321,7 +332,7 @@ public class XAnalyzingSuggester extends Lookup {
public TokenStreamToAutomaton getTokenStreamToAutomaton() {
final TokenStreamToAutomaton tsta;
if (preserveSep) {
tsta = new EscapingTokenStreamToAutomaton();
tsta = new EscapingTokenStreamToAutomaton((char) sepLabel);
} else {
// When we're not preserving sep, we don't steal 0xff
// byte, so we don't need to do any escaping:
@ -387,7 +398,7 @@ public class XAnalyzingSuggester extends Lookup {
}
return scratchA.compareTo(scratchB);
}
};
}
@Override
public void build(InputIterator iterator) throws IOException {
@ -454,7 +465,7 @@ public class XAnalyzingSuggester extends Lookup {
if (hasPayloads) {
for(int i=0;i<surfaceForm.length;i++) {
if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
if (surfaceForm.bytes[i] == payloadSep) {
throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
}
}
@ -558,7 +569,7 @@ public class XAnalyzingSuggester extends Lookup {
int payloadLength = scratch.length - payloadOffset;
BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
br.bytes[surface.length] = PAYLOAD_SEP;
br.bytes[surface.length] = (byte) payloadSep;
System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
br.length = br.bytes.length;
builder.add(scratchInts, outputs.newPair(cost, br));
@ -566,8 +577,10 @@ public class XAnalyzingSuggester extends Lookup {
}
fst = builder.finish();
//Util.dotToFile(fst, "/tmp/suggest.dot");
//PrintWriter pw = new PrintWriter("/tmp/out.dot");
//Util.toDot(fst, pw, true, true);
//pw.close();
success = true;
} finally {
if (success) {
@ -616,7 +629,7 @@ public class XAnalyzingSuggester extends Lookup {
if (hasPayloads) {
int sepIndex = -1;
for(int i=0;i<output2.length;i++) {
if (output2.bytes[output2.offset+i] == PAYLOAD_SEP) {
if (output2.bytes[output2.offset+i] == payloadSep) {
sepIndex = i;
break;
}
@ -649,7 +662,7 @@ public class XAnalyzingSuggester extends Lookup {
return false;
}
}
return output2.bytes[output2.offset + key.length] == PAYLOAD_SEP;
return output2.bytes[output2.offset + key.length] == payloadSep;
} else {
return key.bytesEquals(output2);
}
@ -667,6 +680,14 @@ public class XAnalyzingSuggester extends Lookup {
}
//System.out.println("lookup key=" + key + " num=" + num);
for (int i = 0; i < key.length(); i++) {
if (key.charAt(i) == 0x1E) {
throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
}
if (key.charAt(i) == 0x1F) {
throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
}
}
final BytesRef utf8Key = new BytesRef(key);
try {
@ -688,13 +709,13 @@ public class XAnalyzingSuggester extends Lookup {
final List<LookupResult> results = new ArrayList<LookupResult>();
List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);
List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
if (exactFirst) {
int count = 0;
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
count++;
@ -712,7 +733,7 @@ public class XAnalyzingSuggester extends Lookup {
// pruned our exact match from one of these nodes
// ...:
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
@ -820,13 +841,12 @@ public class XAnalyzingSuggester extends Lookup {
throws IOException {
return prefixPaths;
}
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
// Analyze surface form:
public final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
// Analyze surface form:
TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());
return toFiniteStrings(ts2a, ts);
}
public final Set<IntsRef> toFiniteStrings(final TokenStreamToAutomaton ts2a, TokenStream ts) throws IOException {
// Analyze surface form:
@ -836,7 +856,7 @@ public class XAnalyzingSuggester extends Lookup {
Automaton automaton = ts2a.toAutomaton(ts);
ts.close();
replaceSep(automaton, preserveSep);
replaceSep(automaton, preserveSep, sepLabel);
assert SpecialOperations.isFinite(automaton);
@ -862,7 +882,7 @@ public class XAnalyzingSuggester extends Lookup {
// This way we could eg differentiate "net" from "net ",
// which we can't today...
replaceSep(automaton, preserveSep);
replaceSep(automaton, preserveSep, sepLabel);
// TODO: we can optimize this somewhat by determinizing
// while we convert
@ -903,7 +923,6 @@ public class XAnalyzingSuggester extends Lookup {
public static class XBuilder {
private Builder<Pair<Long, BytesRef>> builder;
BytesRef previousAnalyzed = null;
private int maxSurfaceFormsPerAnalyzedForm;
private IntsRef scratchInts = new IntsRef();
private final PairOutputs<Long, BytesRef> outputs;
@ -912,8 +931,10 @@ public class XAnalyzingSuggester extends Lookup {
private final SurfaceFormAndPayload[] surfaceFormsAndPayload;
private int count;
private ObjectIntOpenHashMap<BytesRef> seenSurfaceForms = HppcMaps.Object.Integer.ensureNoNullKeys(256, 0.75f);
private int payloadSep;
public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads) {
public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) {
this.payloadSep = payloadSep;
this.outputs = new PairOutputs<Long, BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
this.builder = new Builder<Pair<Long, BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
@ -983,7 +1004,7 @@ public class XAnalyzingSuggester extends Lookup {
int len = surface.length + 1 + payload.length;
final BytesRef br = new BytesRef(len);
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
br.bytes[surface.length] = PAYLOAD_SEP;
br.bytes[surface.length] = (byte) payloadSep;
System.arraycopy(payload.bytes, payload.offset, br.bytes, surface.length + 1, payload.length);
br.length = len;
payloadRef = br;

View File

@ -19,6 +19,7 @@
package org.apache.lucene.search.suggest.analyzing;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.*;
@ -48,6 +49,9 @@ import java.util.Set;
* #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be
* edited. We allow up to 1 {@link
* #DEFAULT_MAX_EDITS} edit.
* If the {@link #unicodeAware} parameter in the constructor is set to true, maxEdits,
* minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code
* points (actual letters) instead of bytes.
*
* <p>
* NOTE: This suggester does not boost suggestions that
@ -60,12 +64,22 @@ import java.util.Set;
* like synonyms to keep the complexity of the prefix intersection low for good
* lookup performance. At index time, complex analyzers can safely be used.
* </p>
*
* @lucene.experimental
*/
public final class XFuzzySuggester extends XAnalyzingSuggester {
private final int maxEdits;
private final boolean transpositions;
private final int nonFuzzyPrefix;
private final int minFuzzyLength;
private final boolean unicodeAware;
/**
* Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
* parameters in Unicode code points (actual letters)
* instead of bytes.
*/
public static final boolean DEFAULT_UNICODE_AWARE = false;
/**
* The default minimum length of the key passed to {@link
@ -108,7 +122,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
*/
public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, null, false, 0);
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
}
@ -133,11 +147,15 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
* Levenshtein algorithm.
* @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
* @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
* @param sepLabel separation label
* @param payloadSep payload separator byte
* @param endByte end byte marker
*/
public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength,
FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput) {
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, fst, hasPayloads, maxAnalyzedPathsForOneInput);
int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware,
FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
int sepLabel, int payloadSep, int endByte) {
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, true, fst, hasPayloads, maxAnalyzedPathsForOneInput, sepLabel, payloadSep, endByte);
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
}
@ -152,6 +170,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
this.transpositions = transpositions;
this.nonFuzzyPrefix = nonFuzzyPrefix;
this.minFuzzyLength = minFuzzyLength;
this.unicodeAware = unicodeAware;
}
@Override
@ -170,7 +189,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
// "compete") ... in which case I think the wFST needs
// to be log weights or something ...
Automaton levA = toLevenshteinAutomata(lookupAutomaton);
Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
w.write(levA.toDot());
@ -180,6 +199,24 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
return FSTUtil.intersectPrefixPaths(levA, fst);
}
@Override
protected Automaton convertAutomaton(Automaton a) {
if (unicodeAware) {
Automaton utf8automaton = new UTF32ToUTF8().convert(a);
BasicOperations.determinize(utf8automaton);
return utf8automaton;
} else {
return a;
}
}
@Override
public TokenStreamToAutomaton getTokenStreamToAutomaton() {
final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton();
tsta.setUnicodeArcs(unicodeAware);
return tsta;
}
Automaton toLevenshteinAutomata(Automaton automaton) {
final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
Automaton subs[] = new Automaton[ref.size()];
@ -197,7 +234,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
Automaton levAutomaton = lev.toAutomaton(maxEdits);
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already

View File

@ -55,7 +55,8 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
private static final int MAX_GRAPH_EXPANSIONS = -1;
public static final String CODEC_NAME = "analyzing";
public static final int CODEC_VERSION = 1;
public static final int CODEC_VERSION_START = 1;
public static final int CODEC_VERSION_LATEST = 2;
private boolean preserveSep;
private boolean preservePositionIncrements;
@ -73,8 +74,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
// needs to be fixed in the suggester first before it can be supported
//options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, null, false, 1);
prototype.setPreservePositionIncrements(preservePositionIncrements);
prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
}
@Override
@ -84,7 +84,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION);
CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
return new FieldsConsumer() {
private Map<FieldInfo, Long> fieldOffsets = new HashMap<FieldInfo, Long>();
@ -111,7 +111,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
public TermsConsumer addField(final FieldInfo field) throws IOException {
return new TermsConsumer() {
final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads);
final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
final CompletionPostingsConsumer postingsConsumer = new CompletionPostingsConsumer(AnalyzingCompletionLookupProvider.this, builder);
@Override
@ -156,6 +156,9 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
output.writeVInt(options);
output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
output.writeVInt(XAnalyzingSuggester.END_BYTE);
output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
}
}
};
@ -200,7 +203,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
@Override
public LookupFactory load(IndexInput input) throws IOException {
CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION, CODEC_VERSION);
int version = CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION_START, CODEC_VERSION_LATEST);
final Map<String, AnalyzingSuggestHolder> lookupMap = new HashMap<String, AnalyzingSuggestHolder>();
input.seek(input.length() - 8);
long metaPointer = input.readLong();
@ -225,8 +228,23 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
boolean preserveSep = (options & SERIALIZE_PRESERVE_SEPERATORS) != 0;
boolean hasPayloads = (options & SERIALIZE_HAS_PAYLOADS) != 0;
boolean preservePositionIncrements = (options & SERIALIZE_PRESERVE_POSITION_INCREMENTS) != 0;
lookupMap.put(entry.getValue(), new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
hasPayloads, maxAnalyzedPathsForOneInput, fst));
// the first version did not include these three fields, so fall back to the old defaults (from before the
// AnalyzingSuggester was updated in Lucene; we cannot use the current suggester defaults)
int sepLabel, payloadSep, endByte;
if (version == CODEC_VERSION_START) {
sepLabel = 0xFF;
payloadSep = '\u001f';
endByte = 0x0;
} else {
sepLabel = input.readVInt();
endByte = input.readVInt();
payloadSep = input.readVInt();
}
AnalyzingSuggestHolder holder = new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
hasPayloads, maxAnalyzedPathsForOneInput, fst, sepLabel, payloadSep, endByte);
lookupMap.put(entry.getValue(), holder);
}
return new LookupFactory() {
@Override
@ -242,17 +260,16 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
suggester = new XFuzzySuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
suggestionContext.getFuzzyEditDistance(), suggestionContext.isFuzzyTranspositions(),
suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(),
suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(), suggestionContext.isFuzzyUnicodeAware(),
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput);
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte);
} else {
suggester = new XAnalyzingSuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput);
analyzingSuggestHolder.preservePositionIncrements, analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte);
}
suggester.setPreservePositionIncrements(analyzingSuggestHolder.preservePositionIncrements);
return suggester;
}
@ -280,6 +297,11 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
return new CompletionStats(sizeInBytes, completionFields);
}
@Override
AnalyzingSuggestHolder getAnalyzingSuggestHolder(FieldMapper<?> mapper) {
return lookupMap.get(mapper.names().indexName());
}
};
}
@ -291,9 +313,16 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
final boolean hasPayloads;
final int maxAnalyzedPathsForOneInput;
final FST<Pair<Long, BytesRef>> fst;
final int sepLabel;
final int payloadSep;
final int endByte;
public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst) {
this(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, hasPayloads, maxAnalyzedPathsForOneInput, fst, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
}
public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst, int sepLabel, int payloadSep, int endByte) {
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
@ -301,8 +330,10 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
this.hasPayloads = hasPayloads;
this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
this.fst = fst;
this.sepLabel = sepLabel;
this.payloadSep = payloadSep;
this.endByte = endByte;
}
}
@Override

View File

@ -362,5 +362,6 @@ public class Completion090PostingsFormat extends PostingsFormat {
public static abstract class LookupFactory {
public abstract Lookup getLookup(FieldMapper<?> mapper, CompletionSuggestionContext suggestionContext);
public abstract CompletionStats stats(String ... fields);
abstract AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder getAnalyzingSuggestHolder(FieldMapper<?> mapper);
}
}

View File

@ -68,6 +68,8 @@ public class CompletionSuggestParser implements SuggestContextParser {
suggestion.setFuzzyMinLength(parser.intValue());
} else if ("prefix_length".equals(fuzzyConfigName) || "prefixLength".equals(fuzzyConfigName)) {
suggestion.setFuzzyPrefixLength(parser.intValue());
} else if ("unicode_aware".equals(fuzzyConfigName) || "unicodeAware".equals(fuzzyConfigName)) {
suggestion.setFuzzyUnicodeAware(parser.booleanValue());
}
}
}

View File

@ -34,6 +34,7 @@ public class CompletionSuggestionContext extends SuggestionSearchContext.Suggest
private int fuzzyMinLength = XFuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH;
private int fuzzyPrefixLength = XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX;
private boolean fuzzy = false;
private boolean fuzzyUnicodeAware = XFuzzySuggester.DEFAULT_UNICODE_AWARE;
public CompletionSuggestionContext(Suggester suggester) {
super(suggester);
@ -86,4 +87,12 @@ public class CompletionSuggestionContext extends SuggestionSearchContext.Suggest
public boolean isFuzzy() {
return fuzzy;
}
public void setFuzzyUnicodeAware(boolean fuzzyUnicodeAware) {
this.fuzzyUnicodeAware = fuzzyUnicodeAware;
}
public boolean isFuzzyUnicodeAware() {
return fuzzyUnicodeAware;
}
}

View File

@ -38,6 +38,7 @@ public class CompletionSuggestionFuzzyBuilder extends SuggestBuilder.SuggestionB
private boolean fuzzyTranspositions = XFuzzySuggester.DEFAULT_TRANSPOSITIONS;
private int fuzzyMinLength = XFuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH;
private int fuzzyPrefixLength = XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX;
private boolean unicodeAware = XFuzzySuggester.DEFAULT_UNICODE_AWARE;
public int getFuzzyEditDistance() {
return fuzzyEditDistance;
@ -75,6 +76,15 @@ public class CompletionSuggestionFuzzyBuilder extends SuggestBuilder.SuggestionB
return this;
}
public boolean isUnicodeAware() {
return unicodeAware;
}
public CompletionSuggestionFuzzyBuilder setUnicodeAware(boolean unicodeAware) {
this.unicodeAware = unicodeAware;
return this;
}
@Override
protected XContentBuilder innerToXContent(XContentBuilder builder, ToXContent.Params params) throws IOException {
builder.startObject("fuzzy");
@ -91,6 +101,9 @@ public class CompletionSuggestionFuzzyBuilder extends SuggestBuilder.SuggestionB
if (fuzzyPrefixLength != XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX) {
builder.field("prefix_length", fuzzyPrefixLength);
}
if (unicodeAware != XFuzzySuggester.DEFAULT_UNICODE_AWARE) {
builder.field("unicode_aware", unicodeAware);
}
builder.endObject();
return builder;

View File

@ -566,6 +566,36 @@ public class CompletionSuggestSearchTests extends ElasticsearchIntegrationTest {
assertSuggestions(suggestResponse, false, "foo", "Nirvana");
}
@Test
public void testThatFuzzySuggesterIsUnicodeAware() throws Exception {
createIndexAndMapping("simple", "simple", true, true, true);
client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder()
.startObject().startObject(FIELD)
.startArray("input").value("ööööö").endArray()
.endObject().endObject()
).get();
refresh();
// suggestion with a character which needs unicode awareness
CompletionSuggestionFuzzyBuilder completionSuggestionBuilder =
new CompletionSuggestionFuzzyBuilder("foo").field(FIELD).text("öööи").size(10).setUnicodeAware(true);
SuggestResponse suggestResponse = client().prepareSuggest(INDEX).addSuggestion(completionSuggestionBuilder).execute().actionGet();
assertSuggestions(suggestResponse, false, "foo", "ööööö");
// removing unicode awareness leads to no result
completionSuggestionBuilder.setUnicodeAware(false);
suggestResponse = client().prepareSuggest(INDEX).addSuggestion(completionSuggestionBuilder).execute().actionGet();
assertSuggestions(suggestResponse, false, "foo");
// increasing edit distance instead of unicode awareness works again, as this is only a single character
completionSuggestionBuilder.setFuzzyEditDistance(2);
suggestResponse = client().prepareSuggest(INDEX).addSuggestion(completionSuggestionBuilder).execute().actionGet();
assertSuggestions(suggestResponse, false, "foo", "ööööö");
}
@Test
public void testThatStatsAreWorking() throws Exception {
String otherField = "testOtherField";
@ -650,8 +680,11 @@ public class CompletionSuggestSearchTests extends ElasticsearchIntegrationTest {
refresh();
assertSuggestions("f", "Feed the trolls", "Feed trolls");
assertSuggestions("fe", "Feed the trolls", "Feed trolls");
assertSuggestions("fee", "Feed the trolls", "Feed trolls");
assertSuggestions("feed", "Feed the trolls", "Feed trolls");
assertSuggestions("feed t", "Feed the trolls", "Feed trolls");
assertSuggestions("feed th", "Feed the trolls");
assertSuggestions("feed the", "Feed the trolls");
// stop word is complete, gets ignored at query time, making it "feed" only
assertSuggestions("feed the ", "Feed the trolls", "Feed trolls");

View File

@ -0,0 +1,330 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.suggest.completion;
import com.carrotsearch.hppc.ObjectLongOpenHashMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.codecs.*;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester;
import org.apache.lucene.search.suggest.analyzing.XFuzzySuggester;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.CompletionLookupProvider;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.LookupFactory;
import org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder;
import java.io.IOException;
import java.util.*;
/**
* This is an older implementation of the AnalyzingCompletionLookupProvider class.
* We use this to test for backwards compatibility in our tests, namely
* CompletionPostingsFormatTest. This ensures upgrades between versions work smoothly.
*/
public class AnalyzingCompletionLookupProviderV1 extends CompletionLookupProvider {
// for serialization
public static final int SERIALIZE_PRESERVE_SEPERATORS = 1;
public static final int SERIALIZE_HAS_PAYLOADS = 2;
public static final int SERIALIZE_PRESERVE_POSITION_INCREMENTS = 4;
private static final int MAX_SURFACE_FORMS_PER_ANALYZED_FORM = 256;
private static final int MAX_GRAPH_EXPANSIONS = -1;
public static final String CODEC_NAME = "analyzing";
public static final int CODEC_VERSION = 1;
private boolean preserveSep;
private boolean preservePositionIncrements;
private int maxSurfaceFormsPerAnalyzedForm;
private int maxGraphExpansions;
private boolean hasPayloads;
private final XAnalyzingSuggester prototype;
// important: these are the settings from the old XAnalyzingSuggester
public static final int SEP_LABEL = 0xFF;
public static final int END_BYTE = 0x0;
public static final int PAYLOAD_SEP = '\u001f';
public AnalyzingCompletionLookupProviderV1(boolean preserveSep, boolean exactFirst, boolean preservePositionIncrements, boolean hasPayloads) {
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.hasPayloads = hasPayloads;
this.maxSurfaceFormsPerAnalyzedForm = MAX_SURFACE_FORMS_PER_ANALYZED_FORM;
this.maxGraphExpansions = MAX_GRAPH_EXPANSIONS;
int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
// needs to be fixed in the suggester first before it can be supported
//options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements,
null, false, 1, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
}
@Override
public String getName() {
return "analyzing";
}
@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION);
return new FieldsConsumer() {
private Map<FieldInfo, Long> fieldOffsets = new HashMap<FieldInfo, Long>();
@Override
public void close() throws IOException {
try { /*
* write the offsets per field such that we know where
* we need to load the FSTs from
*/
long pointer = output.getFilePointer();
output.writeVInt(fieldOffsets.size());
for (Map.Entry<FieldInfo, Long> entry : fieldOffsets.entrySet()) {
output.writeString(entry.getKey().name);
output.writeVLong(entry.getValue());
}
output.writeLong(pointer);
output.flush();
} finally {
IOUtils.close(output);
}
}
@Override
public TermsConsumer addField(final FieldInfo field) throws IOException {
return new TermsConsumer() {
final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads, PAYLOAD_SEP);
final CompletionPostingsConsumer postingsConsumer = new CompletionPostingsConsumer(AnalyzingCompletionLookupProviderV1.this, builder);
@Override
public PostingsConsumer startTerm(BytesRef text) throws IOException {
builder.startTerm(text);
return postingsConsumer;
}
@Override
public Comparator<BytesRef> getComparator() throws IOException {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
builder.finishTerm(stats.docFreq); // use doc freq as a fallback
}
@Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
/*
* Here we are done processing the field and we can
* build the FST and write it to disk.
*/
FST<Pair<Long, BytesRef>> build = builder.build();
assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
/*
* it's possible that the FST is null if we have 2 segments that get merged
* and all docs that have a value in this field are deleted. This will cause
* a consumer to be created but it doesn't consume any values causing the FSTBuilder
* to return null.
*/
if (build != null) {
fieldOffsets.put(field, output.getFilePointer());
build.save(output);
/* write some more meta-info */
output.writeVInt(postingsConsumer.getMaxAnalyzedPathsForOneInput());
output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
output.writeInt(maxGraphExpansions); // can be negative
int options = 0;
options |= preserveSep ? SERIALIZE_PRESERVE_SEPERATORS : 0;
options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
output.writeVInt(options);
}
}
};
}
};
}
private static final class CompletionPostingsConsumer extends PostingsConsumer {
private final SuggestPayload spare = new SuggestPayload();
private AnalyzingCompletionLookupProviderV1 analyzingSuggestLookupProvider;
private XAnalyzingSuggester.XBuilder builder;
private int maxAnalyzedPathsForOneInput = 0;
public CompletionPostingsConsumer(AnalyzingCompletionLookupProviderV1 analyzingSuggestLookupProvider, XAnalyzingSuggester.XBuilder builder) {
this.analyzingSuggestLookupProvider = analyzingSuggestLookupProvider;
this.builder = builder;
}
@Override
public void startDoc(int docID, int freq) throws IOException {
}
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
analyzingSuggestLookupProvider.parsePayload(payload, spare);
builder.addSurface(spare.surfaceForm, spare.payload, spare.weight);
// multi fields have the same surface form so we sum up here
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
}
@Override
public void finishDoc() throws IOException {
}
public int getMaxAnalyzedPathsForOneInput() {
return maxAnalyzedPathsForOneInput;
}
}
;
@Override
public LookupFactory load(IndexInput input) throws IOException {
CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION, CODEC_VERSION);
final Map<String, AnalyzingSuggestHolder> lookupMap = new HashMap<String, AnalyzingSuggestHolder>();
input.seek(input.length() - 8);
long metaPointer = input.readLong();
input.seek(metaPointer);
int numFields = input.readVInt();
Map<Long, String> meta = new TreeMap<Long, String>();
for (int i = 0; i < numFields; i++) {
String name = input.readString();
long offset = input.readVLong();
meta.put(offset, name);
}
for (Map.Entry<Long, String> entry : meta.entrySet()) {
input.seek(entry.getKey());
FST<Pair<Long, BytesRef>> fst = new FST<Pair<Long, BytesRef>>(input, new PairOutputs<Long, BytesRef>(
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
int maxAnalyzedPathsForOneInput = input.readVInt();
int maxSurfaceFormsPerAnalyzedForm = input.readVInt();
int maxGraphExpansions = input.readInt();
int options = input.readVInt();
boolean preserveSep = (options & SERIALIZE_PRESERVE_SEPERATORS) != 0;
boolean hasPayloads = (options & SERIALIZE_HAS_PAYLOADS) != 0;
boolean preservePositionIncrements = (options & SERIALIZE_PRESERVE_POSITION_INCREMENTS) != 0;
lookupMap.put(entry.getValue(), new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
hasPayloads, maxAnalyzedPathsForOneInput, fst));
}
return new LookupFactory() {
@Override
public Lookup getLookup(FieldMapper<?> mapper, CompletionSuggestionContext suggestionContext) {
AnalyzingSuggestHolder analyzingSuggestHolder = lookupMap.get(mapper.names().indexName());
if (analyzingSuggestHolder == null) {
return null;
}
int flags = analyzingSuggestHolder.preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
XAnalyzingSuggester suggester;
if (suggestionContext.isFuzzy()) {
suggester = new XFuzzySuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
suggestionContext.getFuzzyEditDistance(), suggestionContext.isFuzzyTranspositions(),
suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(), false,
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
} else {
suggester = new XAnalyzingSuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
analyzingSuggestHolder.preservePositionIncrements,
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
}
return suggester;
}
@Override
public CompletionStats stats(String... fields) {
long sizeInBytes = 0;
ObjectLongOpenHashMap<String> completionFields = null;
if (fields != null && fields.length > 0) {
completionFields = new ObjectLongOpenHashMap<String>(fields.length);
}
for (Map.Entry<String, AnalyzingSuggestHolder> entry : lookupMap.entrySet()) {
sizeInBytes += entry.getValue().fst.sizeInBytes();
if (fields == null || fields.length == 0) {
continue;
}
for (String field : fields) {
// support for getting fields by regex as in fielddata
if (Regex.simpleMatch(field, entry.getKey())) {
long fstSize = entry.getValue().fst.sizeInBytes();
completionFields.addTo(field, fstSize);
}
}
}
return new CompletionStats(sizeInBytes, completionFields);
}
@Override
AnalyzingSuggestHolder getAnalyzingSuggestHolder(FieldMapper<?> mapper) {
return lookupMap.get(mapper.names().indexName());
}
};
}
/*
// might be re-added when we change the current impl, right now not needed
static class AnalyzingSuggestHolder {
final boolean preserveSep;
final boolean preservePositionIncrements;
final int maxSurfaceFormsPerAnalyzedForm;
final int maxGraphExpansions;
final boolean hasPayloads;
final int maxAnalyzedPathsForOneInput;
final FST<Pair<Long, BytesRef>> fst;
public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst) {
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
this.maxGraphExpansions = maxGraphExpansions;
this.hasPayloads = hasPayloads;
this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
this.fst = fst;
}
}
*/
@Override
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
return prototype.toFiniteStrings(prototype.getTokenStreamToAutomaton(), stream);
}
}

View File

@ -17,8 +17,9 @@
* under the License.
*/
package org.elasticsearch.search.suggest;
package org.elasticsearch.search.suggest.completion;
import com.google.common.collect.Lists;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.*;
import org.apache.lucene.document.Document;
@ -42,10 +43,8 @@ import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
import org.elasticsearch.index.codec.postingsformat.PreBuiltPostingsFormatProvider;
import org.elasticsearch.index.mapper.FieldMapper.Names;
import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
import org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.LookupFactory;
import org.elasticsearch.search.suggest.completion.CompletionSuggestionContext;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
@ -56,28 +55,33 @@ import java.util.HashMap;
import java.util.List;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
@Test
public void testCompletionPostingsFormat() throws IOException {
AnalyzingCompletionLookupProvider provider = new AnalyzingCompletionLookupProvider(true, false, true, true);
AnalyzingCompletionLookupProviderV1 providerV1 = new AnalyzingCompletionLookupProviderV1(true, false, true, true);
AnalyzingCompletionLookupProvider currentProvider = new AnalyzingCompletionLookupProvider(true, false, true, true);
List<Completion090PostingsFormat.CompletionLookupProvider> providers = Lists.newArrayList(providerV1, currentProvider);
Completion090PostingsFormat.CompletionLookupProvider randomProvider = providers.get(getRandom().nextInt(providers.size()));
RAMDirectory dir = new RAMDirectory();
IndexOutput output = dir.createOutput("foo.txt", IOContext.DEFAULT);
FieldsConsumer consumer = provider.consumer(output);
FieldsConsumer consumer = randomProvider.consumer(output);
FieldInfo fieldInfo = new FieldInfo("foo", true, 1, false, true, true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
DocValuesType.SORTED, DocValuesType.BINARY, new HashMap<String, String>());
TermsConsumer addField = consumer.addField(fieldInfo);
PostingsConsumer postingsConsumer = addField.startTerm(new BytesRef("foofightersgenerator"));
postingsConsumer.startDoc(0, 1);
postingsConsumer.addPosition(256 - 2, provider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
postingsConsumer.addPosition(256 - 2, randomProvider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
1);
postingsConsumer.finishDoc();
addField.finishTerm(new BytesRef("foofightersgenerator"), new TermStats(1, 1));
addField.startTerm(new BytesRef("generator"));
postingsConsumer.startDoc(0, 1);
postingsConsumer.addPosition(256 - 1, provider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
postingsConsumer.addPosition(256 - 1, randomProvider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
1);
postingsConsumer.finishDoc();
addField.finishTerm(new BytesRef("generator"), new TermStats(1, 1));
@ -86,7 +90,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
output.close();
IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
LookupFactory load = provider.load(input);
LookupFactory load = currentProvider.load(input);
PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat());
NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
Lookup lookup = load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE), new CompletionSuggestionContext(null));
@ -96,6 +100,46 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
dir.close();
}
@Test
public void testProviderBackwardCompatibilityForVersion1() throws IOException {
AnalyzingCompletionLookupProviderV1 providerV1 = new AnalyzingCompletionLookupProviderV1(true, false, true, true);
AnalyzingCompletionLookupProvider currentProvider = new AnalyzingCompletionLookupProvider(true, false, true, true);
RAMDirectory dir = new RAMDirectory();
IndexOutput output = dir.createOutput("foo.txt", IOContext.DEFAULT);
FieldsConsumer consumer = providerV1.consumer(output);
FieldInfo fieldInfo = new FieldInfo("foo", true, 1, false, true, true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
DocValuesType.SORTED, DocValuesType.BINARY, new HashMap<String, String>());
TermsConsumer addField = consumer.addField(fieldInfo);
PostingsConsumer postingsConsumer = addField.startTerm(new BytesRef("foofightersgenerator"));
postingsConsumer.startDoc(0, 1);
postingsConsumer.addPosition(256 - 2, providerV1.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
1);
postingsConsumer.finishDoc();
addField.finishTerm(new BytesRef("foofightersgenerator"), new TermStats(1, 1));
addField.startTerm(new BytesRef("generator"));
postingsConsumer.startDoc(0, 1);
postingsConsumer.addPosition(256 - 1, providerV1.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
1);
postingsConsumer.finishDoc();
addField.finishTerm(new BytesRef("generator"), new TermStats(1, 1));
addField.finish(1, 1, 1);
consumer.close();
output.close();
IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
LookupFactory load = currentProvider.load(input);
PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat());
NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder analyzingSuggestHolder = load.getAnalyzingSuggestHolder(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE));
assertThat(analyzingSuggestHolder.sepLabel, is(AnalyzingCompletionLookupProviderV1.SEP_LABEL));
assertThat(analyzingSuggestHolder.payloadSep, is(AnalyzingCompletionLookupProviderV1.PAYLOAD_SEP));
assertThat(analyzingSuggestHolder.endByte, is(AnalyzingCompletionLookupProviderV1.END_BYTE));
dir.close();
}
@Test
public void testDuellCompletions() throws IOException, NoSuchFieldException, SecurityException, IllegalArgumentException,
IllegalAccessException {
@ -105,8 +149,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
final int options = preserveSeparators ? AnalyzingSuggester.PRESERVE_SEP : 0;
XAnalyzingSuggester reference = new XAnalyzingSuggester(new StandardAnalyzer(TEST_VERSION_CURRENT), new StandardAnalyzer(
TEST_VERSION_CURRENT), options, 256, -1, null, false, 1);
reference.setPreservePositionIncrements(preservePositionIncrements);
TEST_VERSION_CURRENT), options, 256, -1, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
LineFileDocs docs = new LineFileDocs(getRandom());
int num = atLeast(150);
final String[] titles = new String[num];