Fix StackOverflowException for long suggestion strings

Changed getFiniteStrings to use an iterative implementation instead of recursive, so we don't use a Java stack-frame per character for each suggestion at build & query time.
2014-05-04 13:35:05 -04:00 · 2014-05-04 13:35:05 -04:00 · 6bc3a744a1
parent c9f1792c81
commit 6bc3a744a1
4 changed files with 202 additions and 9 deletions
--- a/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java
+++ b/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java
@ -912,7 +912,7 @@ public class XAnalyzingSuggester extends Lookup {
      // TODO: we could walk & add simultaneously, so we
      // don't have to alloc [possibly biggish]
      // intermediate HashSet in RAM:
-      return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
+      return XSpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
  }
  final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
--- a/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java
+++ b/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java
@ -219,7 +219,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
    }
    Automaton toLevenshteinAutomata(Automaton automaton) {
-        final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
+        final Set<IntsRef> ref = XSpecialOperations.getFiniteStrings(automaton, -1);
        Automaton subs[] = new Automaton[ref.size()];
        int upto = 0;
        for (IntsRef path : ref) {
--- a/src/main/java/org/apache/lucene/search/suggest/analyzing/XSpecialOperations.java
+++ b/src/main/java/org/apache/lucene/search/suggest/analyzing/XSpecialOperations.java
@ -0,0 +1,200 @@
 /*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.apache.lucene.search.suggest.analyzing;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.IdentityHashMap;
 import java.util.Set;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.State;
 import org.apache.lucene.util.automaton.Transition;
 import org.apache.lucene.util.fst.Util;
 import org.elasticsearch.Version;
 class XSpecialOperations {
  // TODO Lucene 4.9: remove this once we upgrade; see
  // LUCENE-5628
  static {
    assert Version.CURRENT.luceneVersion == org.apache.lucene.util.Version.LUCENE_48: "Remove this code once we upgrade to Lucene 4.9 where LUCENE-5628 is fixed";
  }
  private static class PathNode {
    /** Which state the path node ends on, whose
     *  transitions we are enumerating. */
    public State state;
    /** Which state the current transition leads to. */
    public State to;
    /** Which transition we are on. */
    public int transition;
    /** Which label we are on, in the min-max range of the
     *  current Transition */
    public int label;
    public void resetState(State state) {
      assert state.numTransitions() != 0;
      this.state = state;
      transition = 0;
      Transition t = state.transitionsArray[transition];
      label = t.getMin();
      to = t.getDest();
    }
    /** Returns next label of current transition, or
     *  advances to next transition and returns its first
     *  label, if current one is exhausted.  If there are
     *  no more transitions, returns -1. */
    public int nextLabel() {
      if (label > state.transitionsArray[transition].getMax()) {
        // We've exhaused the current transition's labels;
        // move to next transitions:
        transition++;
        if (transition >= state.numTransitions()) {
          // We're done iterating transitions leaving this state
          return -1;
        }
        Transition t = state.transitionsArray[transition];
        label = t.getMin();
        to = t.getDest();
      }
      return label++;
    }
  }
  private static PathNode getNode(PathNode[] nodes, int index) {
    assert index < nodes.length;
    if (nodes[index] == null) {
      nodes[index] = new PathNode();
    }
    return nodes[index];
  }
  // TODO: this is a dangerous method ... Automaton could be
  // huge ... and it's better in general for caller to
  // enumerate & process in a single walk:
  /** Returns the set of accepted strings, up to at most
   *  <code>limit</code> strings. If more than <code>limit</code> 
   *  strings are accepted, the first limit strings found are returned. If <code>limit</code> == -1, then 
   *  the limit is infinite.  If the {@link Automaton} has
   *  cycles then this method might throw {@code
   *  IllegalArgumentException} but that is not guaranteed
   *  when the limit is set. */
  public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) {
    Set<IntsRef> results = new HashSet<>();
    if (limit == -1 || limit > 0) {
      // OK
    } else {
      throw new IllegalArgumentException("limit must be -1 (which means no limit), or > 0; got: " + limit);
    }
    if (a.getSingleton() != null) {
      // Easy case: automaton accepts only 1 string
      results.add(Util.toUTF32(a.getSingleton(), new IntsRef()));
    } else {
      if (a.getInitialState().isAccept()) {
        // Special case the empty string, as usual:
        results.add(new IntsRef());
      }
      if (a.getInitialState().numTransitions() > 0 && (limit == -1 || results.size() < limit)) {
        // TODO: we could use state numbers here and just
        // alloc array, but asking for states array can be
        // costly (it's lazily computed):
        // Tracks which states are in the current path, for
        // cycle detection:
        Set<State> pathStates = Collections.newSetFromMap(new IdentityHashMap<State,Boolean>());
        // Stack to hold our current state in the
        // recursion/iteration:
        PathNode[] nodes = new PathNode[4];
        pathStates.add(a.getInitialState());
        PathNode root = getNode(nodes, 0);
        root.resetState(a.getInitialState());
        IntsRef string = new IntsRef(1);
        string.length = 1;
        while (string.length > 0) {
          PathNode node = nodes[string.length-1];
          // Get next label leaving the current node:
          int label = node.nextLabel();
          if (label != -1) {
            string.ints[string.length-1] = label;
            if (node.to.isAccept()) {
              // This transition leads to an accept state,
              // so we save the current string:
              results.add(IntsRef.deepCopyOf(string));
              if (results.size() == limit) {
                break;
              }
            }
            if (node.to.numTransitions() != 0) {
              // Now recurse: the destination of this transition has
              // outgoing transitions:
              if (pathStates.contains(node.to)) {
                throw new IllegalArgumentException("automaton has cycles");
              }
              pathStates.add(node.to);
              // Push node onto stack:
              if (nodes.length == string.length) {
                PathNode[] newNodes = new PathNode[ArrayUtil.oversize(nodes.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
                System.arraycopy(nodes, 0, newNodes, 0, nodes.length);
                nodes = newNodes;
              }
              getNode(nodes, string.length).resetState(node.to);
              string.length++;
              string.grow(string.length);
            }
          } else {
            // No more transitions leaving this state,
            // pop/return back to previous state:
            assert pathStates.contains(node.state);
            pathStates.remove(node.state);
            string.length--;
          }
        }
      }
    }
    return results;
  }
 }
--- a/src/test/java/org/elasticsearch/search/suggest/completion/CompletionPostingsFormatTest.java
+++ b/src/test/java/org/elasticsearch/search/suggest/completion/CompletionPostingsFormatTest.java
@ -126,13 +126,6 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
        final boolean usePayloads = getRandom().nextBoolean();
        final int options = preserveSeparators ? AnalyzingSuggester.PRESERVE_SEP : 0;
        // NOTE: remove once we fix getFiniteStrings to not
        // recurse; this is just a stopgap to mute the test: 
        // This test fails on Java8, I think because that
        // version allocates less stack in the Jenkins envs
        // where we run tests
        assumeFalse(Constants.JRE_IS_MINIMUM_JAVA8);
        XAnalyzingSuggester reference = new XAnalyzingSuggester(new StandardAnalyzer(TEST_VERSION_CURRENT), null, new StandardAnalyzer(
                TEST_VERSION_CURRENT), options, 256, -1, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
        LineFileDocs docs = new LineFileDocs(getRandom());