LUCENE-5752: cutover suggesters

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5752@1602031 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2014-06-11 21:56:21 +00:00
parent 7339f52d92
commit c98b9a8d52
10 changed files with 259 additions and 200 deletions
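
The cutover replaces the object-graph Automaton (linked State and Transition instances) with the int-based LightAutomaton: states are plain ints, state 0 is always the initial state, and -1 stands in for "no state" where the old code used null. Construction goes through LightAutomaton.Builder; a minimal sketch of the Builder calls as they appear in the hunks below (not the complete API):

    LightAutomaton.Builder builder = new LightAutomaton.Builder();
    int start = builder.createState();       // the first created state is 0, the initial state
    int end = builder.createState();
    builder.addTransition(start, end, 'a');  // transitions are (from, to, label) ints
    builder.setAccept(end, true);
    LightAutomaton a = builder.finish();     // freeze into the finished automaton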

TokenStreamToAutomaton.java

@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RollingBuffer;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
@@ -61,15 +62,15 @@ public class TokenStreamToAutomaton {
private static class Position implements RollingBuffer.Resettable {
// Any tokens that ended at our position arrive at this state:
State arriving;
int arriving = -1;
// Any tokens that start at our position leave from this state:
State leaving;
int leaving = -1;
@Override
public void reset() {
arriving = null;
leaving = null;
arriving = -1;
leaving = -1;
}
}
@@ -98,9 +99,9 @@ public class TokenStreamToAutomaton {
* TokenStream}, and creates the corresponding
* automaton where arcs are bytes (or Unicode code points
* if unicodeArcs = true) from each term. */
public Automaton toAutomaton(TokenStream in) throws IOException {
final Automaton a = new Automaton();
boolean deterministic = true;
public LightAutomaton toAutomaton(TokenStream in) throws IOException {
final LightAutomaton.Builder builder = new LightAutomaton.Builder();
builder.createState();
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
@@ -132,34 +133,29 @@ public class TokenStreamToAutomaton {
pos += posInc;
posData = positions.get(pos);
assert posData.leaving == null;
assert posData.leaving == -1;
if (posData.arriving == null) {
if (posData.arriving == -1) {
// No token ever arrived at this position
if (pos == 0) {
// OK: this is the first token
posData.leaving = a.getInitialState();
posData.leaving = 0;
} else {
// This means there's a hole (eg, StopFilter
// does this):
posData.leaving = new State();
addHoles(a.getInitialState(), positions, pos);
posData.leaving = builder.createState();
addHoles(builder, positions, pos);
}
} else {
posData.leaving = new State();
posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
posData.leaving = builder.createState();
builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
if (posInc > 1) {
// A token spanned over a hole; add holes
// "under" it:
addHoles(a.getInitialState(), positions, pos);
addHoles(builder, positions, pos);
}
}
positions.freeBefore(pos);
} else {
// note: this isn't necessarily true. it's just that we aren't sure it's deterministic.
// we could optimize this further (e.g. buffer and sort synonyms at a position)
// but that's probably overkill. this is cheap and dirty
deterministic = false;
}
final int endPos = pos + posLengthAtt.getPositionLength();
@@ -168,31 +164,33 @@ public class TokenStreamToAutomaton {
final BytesRef termUTF8 = changeToken(term);
int[] termUnicode = null;
final Position endPosData = positions.get(endPos);
if (endPosData.arriving == null) {
endPosData.arriving = new State();
if (endPosData.arriving == -1) {
endPosData.arriving = builder.createState();
}
State state = posData.leaving;
int termLen;
if (unicodeArcs) {
final String utf16 = termUTF8.utf8ToString();
termUnicode = new int[utf16.codePointCount(0, utf16.length())];
termLen = termUnicode.length;
for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp))
for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
termUnicode[j++] = cp = utf16.codePointAt(i);
}
} else {
termLen = termUTF8.length;
}
int state = posData.leaving;
for(int byteIDX=0;byteIDX<termLen;byteIDX++) {
final State nextState = byteIDX == termLen-1 ? endPosData.arriving : new State();
final int nextState = byteIDX == termLen-1 ? endPosData.arriving : builder.createState();
int c;
if (unicodeArcs) {
c = termUnicode[byteIDX];
} else {
c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
}
state.addTransition(new Transition(c, nextState));
builder.addTransition(state, nextState, c);
state = nextState;
}
@@ -200,28 +198,26 @@ public class TokenStreamToAutomaton {
}
in.end();
State endState = null;
int endState = -1;
if (offsetAtt.endOffset() > maxOffset) {
endState = new State();
endState.setAccept(true);
endState = builder.createState();
builder.setAccept(endState, true);
}
pos++;
while (pos <= positions.getMaxPos()) {
posData = positions.get(pos);
if (posData.arriving != null) {
if (endState != null) {
posData.arriving.addTransition(new Transition(POS_SEP, endState));
if (posData.arriving != -1) {
if (endState != -1) {
builder.addTransition(posData.arriving, endState, POS_SEP);
} else {
posData.arriving.setAccept(true);
builder.setAccept(posData.arriving, true);
}
}
pos++;
}
//toDot(a);
a.setDeterministic(deterministic);
return a;
return builder.finish();
}
// for debugging!
@@ -235,26 +231,26 @@ public class TokenStreamToAutomaton {
}
*/
private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {
private static void addHoles(LightAutomaton.Builder builder, RollingBuffer<Position> positions, int pos) {
Position posData = positions.get(pos);
Position prevPosData = positions.get(pos-1);
while(posData.arriving == null || prevPosData.leaving == null) {
if (posData.arriving == null) {
posData.arriving = new State();
posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
while(posData.arriving == -1 || prevPosData.leaving == -1) {
if (posData.arriving == -1) {
posData.arriving = builder.createState();
builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
}
if (prevPosData.leaving == null) {
if (prevPosData.leaving == -1) {
if (pos == 1) {
prevPosData.leaving = startState;
prevPosData.leaving = 0;
} else {
prevPosData.leaving = new State();
prevPosData.leaving = builder.createState();
}
if (prevPosData.arriving != null) {
prevPosData.arriving.addTransition(new Transition(POS_SEP, prevPosData.leaving));
if (prevPosData.arriving != -1) {
builder.addTransition(prevPosData.arriving, prevPosData.leaving, POS_SEP);
}
}
prevPosData.leaving.addTransition(new Transition(HOLE, posData.arriving));
builder.addTransition(prevPosData.leaving, posData.arriving, HOLE);
pos--;
if (pos <= 0) {
break;

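For orientation, the calling pattern for the new signature (a sketch; the analyzer and text are placeholders):

    TokenStreamToAutomaton ts2a = new TokenStreamToAutomaton();
    try (TokenStream ts = analyzer.tokenStream("", "wi fi")) {
      LightAutomaton a = ts2a.toAutomaton(ts);  // arcs are bytes, or code points if unicodeArcs
      // the token graph arrives with POS_SEP between positions and HOLE marking gaps
    }
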
BasicAutomata.java

@@ -483,6 +483,21 @@ final public class BasicAutomata {
return a;
}
public static LightAutomaton makeStringLight(int[] word, int offset, int length) {
LightAutomaton a = new LightAutomaton();
a.createState();
int s = 0;
for (int i = offset; i < offset+length; i++) {
int s2 = a.createState();
a.addTransition(s, s2, word[i]);
s = s2;
}
a.setAccept(s, true);
a.finish();
return a;
}
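
A quick illustration of what makeStringLight builds, using the int-based step/isAccept accessors that appear later in this change (the chain automaton is deterministic by construction):

    int[] word = new int[] {'a', 'b'};
    LightAutomaton a = BasicAutomata.makeStringLight(word, 0, word.length);
    int s = a.step(0, 'a');           // 0 is the initial state
    s = a.step(s, 'b');
    assert s != -1 && a.isAccept(s);  // exactly "ab" is accepted
    assert a.step(0, 'x') == -1;      // no such transition: reject
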
/**
* Returns a new (deterministic and minimal) automaton that accepts the union
* of the given collection of {@link BytesRef}s representing UTF-8 encoded

LightAutomaton.java

@@ -35,7 +35,7 @@ import org.apache.lucene.util.Sorter;
// - could use packed int arrays instead
// - could encode dest w/ delta from to?
// nocommit should we keep determinized bit?
// nocommit should we keep determinized bit? it could be entirely privately computed now?
/** Uses only int[]s to represent the automaton, but requires that all
* transitions for each state are added at once. If this is too restrictive,

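In practice the restriction means a state's outgoing transitions are written together, in state order, when filling in a LightAutomaton directly, as makeStringLight does above; anything out of order goes through LightAutomaton.Builder instead. A sketch of the direct style (API names as used elsewhere in this diff):

    LightAutomaton a = new LightAutomaton();
    int s0 = a.createState();
    int s1 = a.createState();
    a.addTransition(s0, s1, 'x');  // all of s0's transitions before any later state's
    a.setAccept(s1, true);
    a.finish();                    // no further mutation after this
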
UTF32ToUTF8Light.java

@@ -274,6 +274,9 @@ public final class UTF32ToUTF8Light {
//if (utf32.isSingleton()) {
//utf32 = utf32.cloneExpanded();
//}
if (utf32.getNumStates() == 0) {
return utf32;
}
int[] map = new int[utf32.getNumStates()];
Arrays.fill(map, -1);

TestGraphTokenizers.java

@@ -18,8 +18,8 @@ package org.apache.lucene.analysis;
*/
import java.io.IOException;
import java.io.StringWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LightAutomaton;
public class TestGraphTokenizers extends BaseTokenStreamTestCase {
@@ -409,9 +410,10 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
new Token[] {
token("abc", 1, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = BasicAutomata.makeString("abc");
assertTrue(BasicOperations.sameLanguage(expected, actual));
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = s2a("abc");
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testMultipleHoles() throws Exception {
@@ -420,9 +422,10 @@
token("a", 1, 1),
token("b", 3, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
assertTrue(BasicOperations.sameLanguage(expected, actual));
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testSynOverMultipleHoles() throws Exception {
@@ -432,11 +435,12 @@
token("x", 0, 3),
token("b", 3, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
final Automaton expected = BasicOperations.union(a1, a2);
assertTrue(BasicOperations.sameLanguage(expected, actual));
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
final LightAutomaton a2 = join(s2a("x"), SEP_A, s2a("b"));
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
// for debugging!
@@ -450,25 +454,25 @@
}
*/
private static final Automaton SEP_A = BasicAutomata.makeChar(TokenStreamToAutomaton.POS_SEP);
private static final Automaton HOLE_A = BasicAutomata.makeChar(TokenStreamToAutomaton.HOLE);
private static final LightAutomaton SEP_A = BasicAutomata.makeCharLight(TokenStreamToAutomaton.POS_SEP);
private static final LightAutomaton HOLE_A = BasicAutomata.makeCharLight(TokenStreamToAutomaton.HOLE);
private Automaton join(String ... strings) {
List<Automaton> as = new ArrayList<>();
private LightAutomaton join(String ... strings) {
List<LightAutomaton> as = new ArrayList<>();
for(String s : strings) {
as.add(BasicAutomata.makeString(s));
as.add(s2a(s));
as.add(SEP_A);
}
as.remove(as.size()-1);
return BasicOperations.concatenate(as);
return BasicOperations.concatenateLight(as);
}
private Automaton join(Automaton ... as) {
return BasicOperations.concatenate(Arrays.asList(as));
private LightAutomaton join(LightAutomaton ... as) {
return BasicOperations.concatenateLight(Arrays.asList(as));
}
private Automaton s2a(String s) {
return BasicAutomata.makeString(s);
private LightAutomaton s2a(String s) {
return BasicAutomata.makeStringLight(s);
}
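
Spelled out, join places the POS_SEP automaton between consecutive tokens; an equivalent manual construction (redundant, but it shows the shape the assertions below compare against):

    LightAutomaton manual = BasicOperations.concatenateLight(Arrays.asList(
        s2a("abc"), SEP_A, s2a("def")));
    assertTrue(BasicOperations.sameLanguage(
        BasicOperations.determinize(manual),
        BasicOperations.determinize(join("abc", "def"))));
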
public void testTwoTokens() throws Exception {
@@ -478,11 +482,12 @@
token("abc", 1, 1),
token("def", 1, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = join("abc", "def");
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = join("abc", "def");
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(expected, actual));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testHole() throws Exception {
@@ -492,12 +497,13 @@
token("abc", 1, 1),
token("def", 2, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
final LightAutomaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(expected, actual));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testOverlappedTokensSausage() throws Exception {
@@ -508,11 +514,12 @@
token("abc", 1, 1),
token("xyz", 0, 1)
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton a1 = BasicAutomata.makeString("abc");
final Automaton a2 = BasicAutomata.makeString("xyz");
final Automaton expected = BasicOperations.union(a1, a2);
assertTrue(BasicOperations.sameLanguage(expected, actual));
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton a1 = s2a("abc");
final LightAutomaton a2 = s2a("xyz");
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testOverlappedTokensLattice() throws Exception {
@@ -523,13 +530,14 @@
token("xyz", 0, 2),
token("def", 1, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton a1 = BasicAutomata.makeString("xyz");
final Automaton a2 = join("abc", "def");
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton a1 = s2a("xyz");
final LightAutomaton a2 = join("abc", "def");
final Automaton expected = BasicOperations.union(a1, a2);
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(expected, actual));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testSynOverHole() throws Exception {
@@ -540,14 +548,15 @@
token("X", 0, 2),
token("b", 2, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton a1 = BasicOperations.union(
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton a1 = BasicOperations.unionLight(
join(s2a("a"), SEP_A, HOLE_A),
BasicAutomata.makeString("X"));
final Automaton expected = BasicOperations.concatenate(a1,
s2a("X"));
final LightAutomaton expected = BasicOperations.concatenateLight(a1,
join(SEP_A, s2a("b")));
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(expected, actual));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testSynOverHole2() throws Exception {
@@ -558,11 +567,12 @@
token("abc", 0, 3),
token("def", 2, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = BasicOperations.union(
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = BasicOperations.unionLight(
join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")),
BasicAutomata.makeString("abc"));
assertTrue(BasicOperations.sameLanguage(expected, actual));
s2a("abc"));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testOverlappedTokensLattice2() throws Exception {
@@ -574,12 +584,13 @@
token("def", 1, 1),
token("ghi", 1, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton a1 = BasicAutomata.makeString("xyz");
final Automaton a2 = join("abc", "def", "ghi");
final Automaton expected = BasicOperations.union(a1, a2);
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton a1 = s2a("xyz");
final LightAutomaton a2 = join("abc", "def", "ghi");
final LightAutomaton expected = BasicOperations.unionLight(a1, a2);
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(expected, actual));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
public void testToDot() throws Exception {
@@ -594,10 +605,11 @@
new Token[] {
token("abc", 2, 1),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = join(HOLE_A, SEP_A, s2a("abc"));
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = join(HOLE_A, SEP_A, s2a("abc"));
//toDot(actual);
assertTrue(BasicOperations.sameLanguage(expected, actual));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
// TODO: testEndsWithHole... but we need posInc to be set in TS.end()
@@ -608,9 +620,10 @@
token("a", 1, 1),
token("X", 0, 10),
});
final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final Automaton expected = BasicOperations.union(BasicAutomata.makeString("a"),
BasicAutomata.makeString("X"));
assertTrue(BasicOperations.sameLanguage(expected, actual));
final LightAutomaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
final LightAutomaton expected = BasicOperations.unionLight(s2a("a"),
s2a("X"));
assertTrue(BasicOperations.sameLanguage(BasicOperations.determinize(expected),
BasicOperations.determinize(actual)));
}
}

AnalyzingSuggester.java

@@ -17,6 +17,16 @@ package org.apache.lucene.search.suggest.analyzing;
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
@@ -35,28 +45,20 @@ import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.util.fst.Util;
/**
* Suggester that first analyzes the surface form, adds the
@@ -255,37 +257,65 @@ public class AnalyzingSuggester extends Lookup {
return fst == null ? 0 : fst.ramBytesUsed();
}
private void copyDestTransitions(State from, State to, List<Transition> transitions) {
if (to.isAccept()) {
from.setAccept(true);
}
for(Transition t : to.getTransitions()) {
transitions.add(t);
private int[] topoSortStates(LightAutomaton a) {
int[] states = new int[a.getNumStates()];
final Set<Integer> visited = new HashSet<>();
final LinkedList<Integer> worklist = new LinkedList<>();
worklist.add(0);
visited.add(0);
int upto = 0;
states[upto] = 0;
upto++;
LightAutomaton.Transition t = new LightAutomaton.Transition();
while (worklist.size() > 0) {
int s = worklist.removeFirst();
int count = a.initTransition(s, t);
for (int i=0;i<count;i++) {
a.getNextTransition(t);
if (!visited.contains(t.dest)) {
visited.add(t.dest);
worklist.add(t.dest);
states[upto++] = t.dest;
}
}
}
return states;
}
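
Despite its name, topoSortStates is a breadth-first walk from state 0 that records states in discovery order; replaceSep below iterates the result backwards, relying on these token automata being acyclic so a single pass suffices (the "reverse topo sort" of the original comment). A small hypothetical graph to show the order:

    // 0 -a-> 1, 0 -b-> 2, 1 -c-> 3
    LightAutomaton.Builder b = new LightAutomaton.Builder();
    int s0 = b.createState(), s1 = b.createState(), s2 = b.createState(), s3 = b.createState();
    b.addTransition(s0, s1, 'a');
    b.addTransition(s0, s2, 'b');
    b.addTransition(s1, s3, 'c');
    b.setAccept(s3, true);
    LightAutomaton a = b.finish();
    // topoSortStates(a) returns {0, 1, 2, 3}; replaceSep then visits 3, 2, 1, 0
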
// Replaces SEP transitions with epsilon, or remaps them if
// we were asked to preserve them:
private void replaceSep(Automaton a) {
private LightAutomaton replaceSep(LightAutomaton a) {
State[] states = a.getNumberedStates();
LightAutomaton result = new LightAutomaton();
// Copy all states over
int numStates = a.getNumStates();
for(int s=0;s<numStates;s++) {
result.createState();
result.setAccept(s, a.isAccept(s));
}
// Go in reverse topo sort so we know we only have to
// make one pass:
for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) {
final State state = states[stateNumber];
LightAutomaton.Transition t = new LightAutomaton.Transition();
int[] topoSortStates = topoSortStates(a);
for(int i=0;i<topoSortStates.length;i++) {
int state = topoSortStates[topoSortStates.length-1-i];
List<Transition> newTransitions = new ArrayList<>();
for(Transition t : state.getTransitions()) {
assert t.getMin() == t.getMax();
if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {
int count = a.initTransition(state, t);
for(int j=0;j<count;j++) {
a.getNextTransition(t);
if (t.min == TokenStreamToAutomaton.POS_SEP) {
assert t.max == TokenStreamToAutomaton.POS_SEP;
if (preserveSep) {
// Remap to SEP_LABEL:
newTransitions.add(new Transition(SEP_LABEL, t.getDest()));
result.addTransition(state, t.dest, SEP_LABEL);
} else {
copyDestTransitions(state, t.getDest(), newTransitions);
a.setDeterministic(false);
result.addEpsilon(state, t.dest);
}
} else if (t.getMin() == TokenStreamToAutomaton.HOLE) {
} else if (t.min == TokenStreamToAutomaton.HOLE) {
assert t.max == TokenStreamToAutomaton.HOLE;
// Just remove the hole: there will then be two
// SEP tokens next to each other, which will only
@@ -294,19 +324,21 @@
// that's somehow a problem we can always map HOLE
// to a dedicated byte (and escape it in the
// input).
copyDestTransitions(state, t.getDest(), newTransitions);
a.setDeterministic(false);
result.addEpsilon(state, t.dest);
} else {
newTransitions.add(t);
result.addTransition(state, t.dest, t.min, t.max);
}
}
state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()]));
}
result.finish();
return result;
}
/** Used by subclass to change the lookup automaton, if
* necessary. */
protected Automaton convertAutomaton(Automaton a) {
protected LightAutomaton convertAutomaton(LightAutomaton a) {
return a;
}
@@ -665,8 +697,7 @@
}
final BytesRef utf8Key = new BytesRef(key);
try {
Automaton lookupAutomaton = toLookupAutomaton(key);
LightAutomaton lookupAutomaton = toLookupAutomaton(key);
final CharsRef spare = new CharsRef();
@@ -818,7 +849,7 @@
/** Returns all prefix paths to initialize the search. */
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
Automaton lookupAutomaton,
LightAutomaton lookupAutomaton,
FST<Pair<Long,BytesRef>> fst)
throws IOException {
return prefixPaths;
@@ -826,7 +857,7 @@
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
// Analyze surface form:
Automaton automaton = null;
LightAutomaton automaton = null;
try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
// Create corresponding automaton: labels are bytes
@ -835,7 +866,7 @@ public class AnalyzingSuggester extends Lookup {
automaton = ts2a.toAutomaton(ts);
}
replaceSep(automaton);
automaton = replaceSep(automaton);
automaton = convertAutomaton(automaton);
// TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
@@ -848,32 +879,25 @@
// TODO: we could walk & add simultaneously, so we
// don't have to alloc [possibly biggish]
// intermediate HashSet in RAM:
return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
}
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
final LightAutomaton toLookupAutomaton(final CharSequence key) throws IOException {
// TODO: is there a Reader from a CharSequence?
// Turn tokenstream into automaton:
Automaton automaton = null;
LightAutomaton automaton = null;
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
automaton = getTokenStreamToAutomaton().toAutomaton(ts);
}
// TODO: we could use the end offset to "guess"
// whether the final token was a partial token; this
// would only be a heuristic ... but maybe an OK one.
// This way we could eg differentiate "net" from "net ",
// which we can't today...
replaceSep(automaton);
automaton = replaceSep(automaton);
// TODO: we can optimize this somewhat by determinizing
// while we convert
BasicOperations.determinize(automaton);
automaton = BasicOperations.determinize(automaton);
return automaton;
}
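
Worth calling out in both methods above: the old replaceSep and determinize mutated their argument in place, so their return values (if any) were ignored; the LightAutomaton versions return new automata and must be reassigned. The before/after pattern, as a sketch:

    replaceSep(automaton);                               // old: mutates in place
    BasicOperations.determinize(automaton);              // old: mutates in place

    automaton = replaceSep(automaton);                   // new: returns a fresh automaton
    automaton = BasicOperations.determinize(automaton);  // new: likewise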
/**
* Returns the weight associated with an input string,

FSTUtil.java

@@ -17,12 +17,14 @@ package org.apache.lucene.search.suggest.analyzing;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.io.IOException;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.FST;
@@ -43,7 +45,7 @@ public class FSTUtil {
public static final class Path<T> {
/** Node in the automaton where path ends: */
public final State state;
public final int state;
/** Node in the FST where path ends: */
public final FST.Arc<T> fstNode;
@@ -55,7 +57,7 @@
public final IntsRef input;
/** Sole constructor. */
public Path(State state, FST.Arc<T> fstNode, T output, IntsRef input) {
public Path(int state, FST.Arc<T> fstNode, T output, IntsRef input) {
this.state = state;
this.fstNode = fstNode;
this.output = output;
@@ -67,21 +69,23 @@
* Enumerates all minimal prefix paths in the automaton that also intersect the FST,
* accumulating the FST end node and output for each path.
*/
public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst)
public static <T> List<Path<T>> intersectPrefixPaths(LightAutomaton a, FST<T> fst)
throws IOException {
assert a.isDeterministic();
assert BasicOperations.isDeterministic(a);
final List<Path<T>> queue = new ArrayList<>();
final List<Path<T>> endNodes = new ArrayList<>();
queue.add(new Path<>(a.getInitialState(), fst
queue.add(new Path<>(0, fst
.getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(),
new IntsRef()));
final FST.Arc<T> scratchArc = new FST.Arc<>();
final FST.BytesReader fstReader = fst.getBytesReader();
LightAutomaton.Transition t = new LightAutomaton.Transition();
while (queue.size() != 0) {
final Path<T> path = queue.remove(queue.size() - 1);
if (path.state.isAccept()) {
if (a.isAccept(path.state)) {
endNodes.add(path);
// we can stop here if we accept this path,
// we accept all further paths too
@@ -89,18 +93,20 @@
}
IntsRef currentInput = path.input;
for (Transition t : path.state.getTransitions()) {
final int min = t.getMin();
final int max = t.getMax();
int count = a.initTransition(path.state, t);
for (int i=0;i<count;i++) {
a.getNextTransition(t);
final int min = t.min;
final int max = t.max;
if (min == max) {
final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(),
final FST.Arc<T> nextArc = fst.findTargetArc(t.min,
path.fstNode, scratchArc, fstReader);
if (nextArc != null) {
final IntsRef newInput = new IntsRef(currentInput.length + 1);
newInput.copyInts(currentInput);
newInput.ints[currentInput.length] = t.getMin();
newInput.ints[currentInput.length] = t.min;
newInput.length = currentInput.length + 1;
queue.add(new Path<>(t.getDest(), new FST.Arc<T>()
queue.add(new Path<>(t.dest, new FST.Arc<T>()
.copyFrom(nextArc), fst.outputs
.add(path.output, nextArc.output), newInput));
}
@@ -122,7 +128,7 @@
newInput.copyInts(currentInput);
newInput.ints[currentInput.length] = nextArc.label;
newInput.length = currentInput.length + 1;
queue.add(new Path<>(t.getDest(), new FST.Arc<T>()
queue.add(new Path<>(t.dest, new FST.Arc<T>()
.copyFrom(nextArc), fst.outputs
.add(path.output, nextArc.output), newInput));
final int label = nextArc.label; // used in assert

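A sketch of intersectPrefixPaths in isolation (hypothetical data; the FST side uses the stock Builder/PositiveIntOutputs API, and the automaton must be deterministic per the assert above):

    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    fstBuilder.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRef()), 42L);
    FST<Long> fst = fstBuilder.finish();

    int[] foo = new int[] {'f', 'o', 'o'};
    LightAutomaton prefix = BasicAutomata.makeStringLight(foo, 0, foo.length);

    // One path ends where the automaton accepts: input "foo", parked at the FST
    // node reached after f-o-o, carrying the output accumulated along the way.
    List<FSTUtil.Path<Long>> paths = FSTUtil.intersectPrefixPaths(prefix, fst);
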
FuzzySuggester.java

@@ -32,8 +32,10 @@ import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;
import org.apache.lucene.util.automaton.UTF32ToUTF8Light;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
@@ -177,7 +179,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
@Override
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
Automaton lookupAutomaton,
LightAutomaton lookupAutomaton,
FST<Pair<Long,BytesRef>> fst)
throws IOException {
@@ -191,7 +193,7 @@
// "compete") ... in which case I think the wFST needs
// to be log weights or something ...
Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
LightAutomaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
w.write(levA.toDot());
@@ -202,10 +204,10 @@
}
@Override
protected Automaton convertAutomaton(Automaton a) {
protected LightAutomaton convertAutomaton(LightAutomaton a) {
if (unicodeAware) {
Automaton utf8automaton = new UTF32ToUTF8().convert(a);
BasicOperations.determinize(utf8automaton);
LightAutomaton utf8automaton = new UTF32ToUTF8Light().convert(a);
utf8automaton = BasicOperations.determinize(utf8automaton);
return utf8automaton;
} else {
return a;
@@ -219,16 +221,16 @@
return tsta;
}
Automaton toLevenshteinAutomata(Automaton automaton) {
LightAutomaton toLevenshteinAutomata(LightAutomaton automaton) {
final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
Automaton subs[] = new Automaton[ref.size()];
LightAutomaton subs[] = new LightAutomaton[ref.size()];
int upto = 0;
for (IntsRef path : ref) {
if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {
subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);
subs[upto] = BasicAutomata.makeStringLight(path.ints, path.offset, path.length);
upto++;
} else {
Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);
LightAutomaton prefix = BasicAutomata.makeStringLight(path.ints, path.offset, nonFuzzyPrefix);
int ints[] = new int[path.length-nonFuzzyPrefix];
System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);
// TODO: maybe add alphaMin to LevenshteinAutomata,
@@ -237,9 +239,8 @@
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
Automaton levAutomaton = lev.toAutomaton(maxEdits);
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
LightAutomaton levAutomaton = lev.toLightAutomaton(maxEdits);
LightAutomaton combined = BasicOperations.concatenateLight(prefix, levAutomaton);
subs[upto] = combined;
upto++;
}
@@ -247,19 +248,17 @@
if (subs.length == 0) {
// automaton is empty, there are no accepted paths through it
return BasicAutomata.makeEmpty(); // matches nothing
return BasicAutomata.makeEmptyLight(); // matches nothing
} else if (subs.length == 1) {
// no synonyms or anything: just a single path through the tokenstream
return subs[0];
} else {
// multiple paths: this is really scary! is it slow?
// maybe we should not do this and throw UOE?
Automaton a = BasicOperations.union(Arrays.asList(subs));
LightAutomaton a = BasicOperations.unionLight(Arrays.asList(subs));
// TODO: we could call toLevenshteinAutomata() before det?
// this only happens if you have multiple paths anyway (e.g. synonyms)
BasicOperations.determinize(a);
return a;
return BasicOperations.determinize(a);
}
}
}
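
Concretely, each analyzed path keeps its first nonFuzzyPrefix code points exact and allows edits only on the remainder. A sketch with hypothetical values (maxEdits=1, nonFuzzyPrefix=1, byte labels so alphaMax is 255):

    int[] path = new int[] {'s', 'h', 'o', 'p'};
    LightAutomaton prefix = BasicAutomata.makeStringLight(path, 0, 1);   // exact "s"
    int[] rest = new int[] {'h', 'o', 'p'};
    LevenshteinAutomata lev = new LevenshteinAutomata(rest, 255, true);  // transpositions on
    LightAutomaton fuzzy = lev.toLightAutomaton(1);                      // within 1 edit of "hop"
    LightAutomaton combined = BasicOperations.concatenateLight(prefix, fuzzy);
    // combined matches e.g. "shop", "shp", "sohp", but nothing that edits the "s"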

InputArrayIterator.java

@@ -92,4 +92,4 @@ public final class InputArrayIterator implements InputIterator {
public boolean hasContexts() {
return hasContexts;
}
}
}

View File

@@ -40,14 +40,16 @@ import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.Input;
import org.apache.lucene.search.suggest.InputArrayIterator;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LightAutomaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.fst.Util;
@@ -752,29 +754,30 @@ public class FuzzySuggesterTest extends LuceneTestCase {
// us the "answer key" (ie maybe we have a bug in
// suggester.toLevA ...) ... but testRandom2() fixes
// this:
Automaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
assertTrue(automaton.isDeterministic());
LightAutomaton automaton = suggester.convertAutomaton(suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)));
assertTrue(BasicOperations.isDeterministic(automaton));
// TODO: could be faster... but it's slowCompletor for a reason
BytesRef spare = new BytesRef();
for (TermFreqPayload2 e : slowCompletor) {
spare.copyChars(e.analyzedForm);
Set<IntsRef> finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);
for (IntsRef intsRef : finiteStrings) {
State p = automaton.getInitialState();
int p = 0;
BytesRef ref = Util.toBytesRef(intsRef, spare);
boolean added = false;
for (int i = ref.offset; i < ref.length; i++) {
State q = p.step(ref.bytes[i] & 0xff);
if (q == null) {
int q = automaton.step(p, ref.bytes[i] & 0xff);
if (q == -1) {
break;
} else if (q.isAccept()) {
} else if (automaton.isAccept(q)) {
matches.add(new LookupResult(e.surfaceForm, e.weight));
added = true;
break;
}
p = q;
}
if (!added && p.isAccept()) {
if (!added && automaton.isAccept(p)) {
matches.add(new LookupResult(e.surfaceForm, e.weight));
}
}
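
The byte-at-a-time loop above is the int-based replacement for the old State.step walk; condensed into a helper it would read (hypothetical, not part of this change):

    // True if the deterministic, byte-labeled automaton accepts ref's bytes.
    static boolean accepts(LightAutomaton a, BytesRef ref) {
      int state = 0;                                  // the initial state is always 0
      for (int i = 0; i < ref.length; i++) {
        state = a.step(state, ref.bytes[ref.offset + i] & 0xff);
        if (state == -1) {
          return false;                               // dead end
        }
      }
      return a.isAccept(state);
    }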