LUCENE-5628: change getFiniteStrings to iterative impl

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1592362 13f79535-47bb-0310-9956-ffa450edef68
2014-05-04 15:47:59 +00:00 · 2014-05-04 15:47:59 +00:00 · 8c4e826818
parent 6e9cbf3986
commit 8c4e826818
5 changed files with 387 additions and 55 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -152,6 +152,12 @@ Bug fixes
 * LUCENE-5639: Fix PositionLengthAttribute implementation in Token class.
  (Uwe Schindler, Robert Muir)

+* LUCENE-5628: Change getFiniteStrings to iterative not recursive
+  implementation, so that building suggesters on a long suggestion
+  doesn't risk overflowing the stack; previously it consumed one Java
+  stack frame per character in the expanded suggestion (Robert Muir,
+  Simon Willnauer, Mike McCandless).
+
 Test Framework

 * LUCENE-5622: Fail tests if they print over the given limit of bytes to 
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/SpecialOperations.java
@ -30,12 +30,16 @@
 package org.apache.lucene.util.automaton;

 import java.util.BitSet;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.IdentityHashMap;
 import java.util.Set;

+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.fst.Util;

 /**
@ -212,57 +216,159 @@ final public class SpecialOperations {
    return accept;
  }

+  private static class PathNode {
+
+    /** Which state the path node ends on, whose
+     *  transitions we are enumerating. */
+    public State state;
+
+    /** Which state the current transition leads to. */
+    public State to;
+
+    /** Which transition we are on. */
+    public int transition;
+
+    /** Which label we are on, in the min-max range of the
+     *  current Transition */
+    public int label;
+
+    public void resetState(State state) {
+      assert state.numTransitions() != 0;
+      this.state = state;
+      transition = 0;
+      Transition t = state.transitionsArray[transition];
+      label = t.min;
+      to = t.to;
+    }
+
+    /** Returns next label of current transition, or
+     *  advances to next transition and returns its first
+     *  label, if current one is exhausted.  If there are
+     *  no more transitions, returns -1. */
+    public int nextLabel() {
+      if (label > state.transitionsArray[transition].max) {
+        // We've exhaused the current transition's labels;
+        // move to next transitions:
+        transition++;
+        if (transition >= state.numTransitions()) {
+          // We're done iterating transitions leaving this state
+          return -1;
+        }
+        Transition t = state.transitionsArray[transition];
+        label = t.min;
+        to = t.to;
+      }
+      return label++;
+    }
+  }
+
+  private static PathNode getNode(PathNode[] nodes, int index) {
+    assert index < nodes.length;
+    if (nodes[index] == null) {
+      nodes[index] = new PathNode();
+    }
+    return nodes[index];
+  }
+
  // TODO: this is a dangerous method ... Automaton could be
  // huge ... and it's better in general for caller to
  // enumerate & process in a single walk:

-  /**
-   * Returns the set of accepted strings, assuming that at most
-   * <code>limit</code> strings are accepted. If more than <code>limit</code> 
-   * strings are accepted, the first limit strings found are returned. If <code>limit</code>&lt;0, then 
-   * the limit is infinite.
-   */
+  /** Returns the set of accepted strings, up to at most
+   *  <code>limit</code> strings. If more than <code>limit</code> 
+   *  strings are accepted, the first limit strings found are returned. If <code>limit</code> == -1, then 
+   *  the limit is infinite.  If the {@link Automaton} has
+   *  cycles then this method might throw {@code
+   *  IllegalArgumentException} but that is not guaranteed
+   *  when the limit is set. */
  public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) {
-    HashSet<IntsRef> strings = new HashSet<>();
-    if (a.isSingleton()) {
-      if (limit > 0) {
-        strings.add(Util.toUTF32(a.singleton, new IntsRef()));
-      }
-    } else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit)) {
-      return strings;
+    Set<IntsRef> results = new HashSet<>();
+
+    if (limit == -1 || limit > 0) {
+      // OK
+    } else {
+      throw new IllegalArgumentException("limit must be -1 (which means no limit), or > 0; got: " + limit);
    }
-    return strings;
-  }
-  
-  /**
-   * Returns the strings that can be produced from the given state, or
-   * false if more than <code>limit</code> strings are found. 
-   * <code>limit</code>&lt;0 means "infinite".
-   */
-  private static boolean getFiniteStrings(State s, HashSet<State> pathstates, 
-      HashSet<IntsRef> strings, IntsRef path, int limit) {
-    pathstates.add(s);
-    for (Transition t : s.getTransitions()) {
-      if (pathstates.contains(t.to)) {
-        return false;
+
+    if (a.isSingleton()) {
+      // Easy case: automaton accepts only 1 string
+      results.add(Util.toUTF32(a.singleton, new IntsRef()));
+    } else {
+
+      if (a.initial.accept) {
+        // Special case the empty string, as usual:
+        results.add(new IntsRef());
      }
-      for (int n = t.min; n <= t.max; n++) {
-        path.grow(path.length+1);
-        path.ints[path.length] = n;
-        path.length++;
-        if (t.to.accept) {
-          strings.add(IntsRef.deepCopyOf(path));
-          if (limit >= 0 && strings.size() > limit) {
-            return false;
+
+      if (a.initial.numTransitions() > 0 && (limit == -1 || results.size() < limit)) {
+
+        // TODO: we could use state numbers here and just
+        // alloc array, but asking for states array can be
+        // costly (it's lazily computed):
+
+        // Tracks which states are in the current path, for
+        // cycle detection:
+        Set<State> pathStates = Collections.newSetFromMap(new IdentityHashMap<State,Boolean>());
+
+        // Stack to hold our current state in the
+        // recursion/iteration:
+        PathNode[] nodes = new PathNode[4];
+
+        pathStates.add(a.initial);
+        PathNode root = getNode(nodes, 0);
+        root.resetState(a.initial);
+
+        IntsRef string = new IntsRef(1);
+        string.length = 1;
+
+        while (string.length > 0) {
+
+          PathNode node = nodes[string.length-1];
+
+          // Get next label leaving the current node:
+          int label = node.nextLabel();
+
+          if (label != -1) {
+            string.ints[string.length-1] = label;
+
+            if (node.to.accept) {
+              // This transition leads to an accept state,
+              // so we save the current string:
+              results.add(IntsRef.deepCopyOf(string));
+              if (results.size() == limit) {
+                break;
+              }
+            }
+
+            if (node.to.numTransitions() != 0) {
+              // Now recurse: the destination of this transition has
+              // outgoing transitions:
+              if (pathStates.contains(node.to)) {
+                throw new IllegalArgumentException("automaton has cycles");
+              }
+              pathStates.add(node.to);
+
+              // Push node onto stack:
+              if (nodes.length == string.length) {
+                PathNode[] newNodes = new PathNode[ArrayUtil.oversize(nodes.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+                System.arraycopy(nodes, 0, newNodes, 0, nodes.length);
+                nodes = newNodes;
+              }
+              getNode(nodes, string.length).resetState(node.to);
+              string.length++;
+              string.grow(string.length);
+            }
+          } else {
+            // No more transitions leaving this state,
+            // pop/return back to previous state:
+            assert pathStates.contains(node.state);
+            pathStates.remove(node.state);
+            string.length--;
          }
        }
-        if (!getFiniteStrings(t.to, pathstates, strings, path, limit)) {
-          return false;
-        }
-        path.length--;
      }
    }
-    pathstates.remove(s);
-    return true;
+
+    return results;
  }
 }
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/State.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/State.java
@ -277,9 +277,4 @@ public class State implements Comparable<State> {
  public int compareTo(State s) {
    return s.id - id;
  }
-
-  @Override
-  public int hashCode() {
-    return id;
-  }  
 }
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestSpecialOperations.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestSpecialOperations.java
@ -1,12 +1,5 @@
 package org.apache.lucene.util.automaton;

-import java.util.Set;
-
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.fst.Util;
-
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
@ -24,6 +17,18 @@ import org.apache.lucene.util.fst.Util;
 * limitations under the License.
 */

+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.fst.Util;
+
 public class TestSpecialOperations extends LuceneTestCase {
  /**
   * tests against the original brics implementation.
@ -36,14 +41,24 @@ public class TestSpecialOperations extends LuceneTestCase {
      assertEquals(AutomatonTestUtil.isFiniteSlow(a), SpecialOperations.isFinite(b));
    }
  }
+
+  /** Pass false for testRecursive if the expected strings
+   *  may be too long */
+  private Set<IntsRef> getFiniteStrings(Automaton a, int limit, boolean testRecursive) {
+    Set<IntsRef> result = SpecialOperations.getFiniteStrings(a, limit);
+    if (testRecursive) {
+      assertEquals(AutomatonTestUtil.getFiniteStringsRecursive(a, limit), result);
+    }
+    return result;
+  }
  
  /**
   * Basic test for getFiniteStrings
   */
-  public void testFiniteStrings() {
+  public void testFiniteStringsBasic() {
    Automaton a = BasicOperations.union(BasicAutomata.makeString("dog"), BasicAutomata.makeString("duck"));
    MinimizationOperations.minimize(a);
-    Set<IntsRef> strings = SpecialOperations.getFiniteStrings(a, -1);
+    Set<IntsRef> strings = getFiniteStrings(a, -1, true);
    assertEquals(2, strings.size());
    IntsRef dog = new IntsRef();
    Util.toIntsRef(new BytesRef("dog"), dog);
@ -52,4 +67,156 @@ public class TestSpecialOperations extends LuceneTestCase {
    Util.toIntsRef(new BytesRef("duck"), duck);
    assertTrue(strings.contains(duck));
  }
+
+  public void testFiniteStringsEatsStack() {
+    char[] chars = new char[50000];
+    TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
+    String bigString1 = new String(chars);
+    TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
+    String bigString2 = new String(chars);
+    Automaton a = BasicOperations.union(BasicAutomata.makeString(bigString1), BasicAutomata.makeString(bigString2));
+    Set<IntsRef> strings = getFiniteStrings(a, -1, false);
+    assertEquals(2, strings.size());
+    IntsRef scratch = new IntsRef();
+    Util.toUTF32(bigString1.toCharArray(), 0, bigString1.length(), scratch);
+    assertTrue(strings.contains(scratch));
+    Util.toUTF32(bigString2.toCharArray(), 0, bigString2.length(), scratch);
+    assertTrue(strings.contains(scratch));
+  }
+
+  public void testRandomFiniteStrings1() {
+
+    int numStrings = atLeast(500);
+    if (VERBOSE) {
+      System.out.println("TEST: numStrings=" + numStrings);
+    }
+
+    Set<IntsRef> strings = new HashSet<IntsRef>();
+    List<Automaton> automata = new ArrayList<Automaton>();
+    for(int i=0;i<numStrings;i++) {
+      String s = TestUtil.randomSimpleString(random(), 1, 200);
+      automata.add(BasicAutomata.makeString(s));
+      IntsRef scratch = new IntsRef();
+      Util.toUTF32(s.toCharArray(), 0, s.length(), scratch);
+      strings.add(scratch);
+      if (VERBOSE) {
+        System.out.println("  add string=" + s);
+      }
+    }
+
+    // TODO: we could sometimes use
+    // DaciukMihovAutomatonBuilder here
+
+    // TODO: what other random things can we do here...
+    Automaton a = BasicOperations.union(automata);
+    if (random().nextBoolean()) {
+      Automaton.minimize(a);
+      if (VERBOSE) {
+        System.out.println("TEST: a.minimize numStates=" + a.getNumberOfStates());
+      }
+    } else if (random().nextBoolean()) {
+      if (VERBOSE) {
+        System.out.println("TEST: a.determinize");
+      }
+      a.determinize();
+    } else if (random().nextBoolean()) {
+      if (VERBOSE) {
+        System.out.println("TEST: a.reduce");
+      }
+      a.reduce();
+    } else if (random().nextBoolean()) {
+      if (VERBOSE) {
+        System.out.println("TEST: a.getNumberedStates");
+      }
+      a.getNumberedStates();
+    }
+
+    Set<IntsRef> actual = getFiniteStrings(a, -1, true);
+    if (strings.equals(actual) == false) {
+      System.out.println("strings.size()=" + strings.size() + " actual.size=" + actual.size());
+      List<IntsRef> x = new ArrayList<>(strings);
+      Collections.sort(x);
+      List<IntsRef> y = new ArrayList<>(actual);
+      Collections.sort(y);
+      int end = Math.min(x.size(), y.size());
+      for(int i=0;i<end;i++) {
+        System.out.println("  i=" + i + " string=" + toString(x.get(i)) + " actual=" + toString(y.get(i)));
+      }
+      fail("wrong strings found");
+    }
+  }
+
+  // ascii only!
+  private static String toString(IntsRef ints) {
+    BytesRef br = new BytesRef(ints.length);
+    for(int i=0;i<ints.length;i++) {
+      br.bytes[i] = (byte) ints.ints[i];
+    }
+    br.length = ints.length;
+    return br.utf8ToString();
+  }
+
+  public void testWithCycle() throws Exception {
+    try {
+      SpecialOperations.getFiniteStrings(new RegExp("abc.*", RegExp.NONE).toAutomaton(), -1);
+      fail("did not hit exception");
+    } catch (IllegalArgumentException iae) {
+      // expected
+    }
+  }
+
+  public void testRandomFiniteStrings2() {
+    // Just makes sure we can run on any random finite
+    // automaton:
+    int iters = atLeast(100);
+    for(int i=0;i<iters;i++) {
+      Automaton a = AutomatonTestUtil.randomAutomaton(random());
+      try {
+        // Must pass a limit because the random automaton
+        // can accept MANY strings:
+        SpecialOperations.getFiniteStrings(a, TestUtil.nextInt(random(), 1, 1000));
+        // NOTE: cannot do this, because the method is not
+        // guaranteed to detect cycles when you have a limit
+        //assertTrue(SpecialOperations.isFinite(a));
+      } catch (IllegalArgumentException iae) {
+        assertFalse(SpecialOperations.isFinite(a));
+      }
+    }
+  }
+
+  public void testInvalidLimit() {
+    Automaton a = AutomatonTestUtil.randomAutomaton(random());
+    try {
+      SpecialOperations.getFiniteStrings(a, -7);
+      fail("did not hit exception");
+    } catch (IllegalArgumentException iae) {
+      // expected
+    }
+  }
+
+  public void testInvalidLimit2() {
+    Automaton a = AutomatonTestUtil.randomAutomaton(random());
+    try {
+      SpecialOperations.getFiniteStrings(a, 0);
+      fail("did not hit exception");
+    } catch (IllegalArgumentException iae) {
+      // expected
+    }
+  }
+
+  public void testSingletonNoLimit() {
+    Set<IntsRef> result = SpecialOperations.getFiniteStrings(BasicAutomata.makeString("foobar"), -1);
+    assertEquals(1, result.size());
+    IntsRef scratch = new IntsRef();
+    Util.toUTF32("foobar".toCharArray(), 0, 6, scratch);
+    assertTrue(result.contains(scratch));
+  }
+
+  public void testSingletonLimit1() {
+    Set<IntsRef> result = SpecialOperations.getFiniteStrings(BasicAutomata.makeString("foobar"), 1);
+    assertEquals(1, result.size());
+    IntsRef scratch = new IntsRef();
+    Util.toUTF32("foobar".toCharArray(), 0, 6, scratch);
+    assertTrue(result.contains(scratch));
+  }
 }
--- a/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/automaton/AutomatonTestUtil.java
@ -28,8 +28,10 @@ import java.util.Random;
 import java.util.Set;

 import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.fst.Util;

 /**
 * Utilities for testing automata.
@ -387,6 +389,62 @@ public class AutomatonTestUtil {
    a.removeDeadTransitions();
  }

+  /**
+   * Simple, original implementation of getFiniteStrings.
+   *
+   * <p>Returns the set of accepted strings, assuming that at most
+   * <code>limit</code> strings are accepted. If more than <code>limit</code> 
+   * strings are accepted, the first limit strings found are returned. If <code>limit</code>&lt;0, then 
+   * the limit is infinite.
+   *
+   * <p>This implementation is recursive: it uses one stack
+   * frame for each digit in the returned strings (ie, max
+   * is the max length returned string).
+   */
+  public static Set<IntsRef> getFiniteStringsRecursive(Automaton a, int limit) {
+    HashSet<IntsRef> strings = new HashSet<>();
+    if (a.isSingleton()) {
+      if (limit > 0) {
+        strings.add(Util.toUTF32(a.singleton, new IntsRef()));
+      }
+    } else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit)) {
+      return strings;
+    }
+    return strings;
+  }
+
+  /**
+   * Returns the strings that can be produced from the given state, or
+   * false if more than <code>limit</code> strings are found. 
+   * <code>limit</code>&lt;0 means "infinite".
+   */
+  private static boolean getFiniteStrings(State s, HashSet<State> pathstates, 
+      HashSet<IntsRef> strings, IntsRef path, int limit) {
+    pathstates.add(s);
+    for (Transition t : s.getTransitions()) {
+      if (pathstates.contains(t.to)) {
+        return false;
+      }
+      for (int n = t.min; n <= t.max; n++) {
+        path.grow(path.length+1);
+        path.ints[path.length] = n;
+        path.length++;
+        if (t.to.accept) {
+          strings.add(IntsRef.deepCopyOf(path));
+          if (limit >= 0 && strings.size() > limit) {
+            return false;
+          }
+        }
+        if (!getFiniteStrings(t.to, pathstates, strings, path, limit)) {
+          return false;
+        }
+        path.length--;
+      }
+    }
+    pathstates.remove(s);
+    return true;
+  }
+
  /**
   * Returns true if the language of this automaton is finite.
   * <p>