Fix StackOverflowError for long suggestion strings
Changed getFiniteStrings to use an iterative implementation instead of a recursive one, so that we no longer use one Java stack frame per character of each suggestion at build and query time.
This commit is contained in:
parent
c9f1792c81
commit
6bc3a744a1
|
@ -912,7 +912,7 @@ public class XAnalyzingSuggester extends Lookup {
|
||||||
// TODO: we could walk & add simultaneously, so we
|
// TODO: we could walk & add simultaneously, so we
|
||||||
// don't have to alloc [possibly biggish]
|
// don't have to alloc [possibly biggish]
|
||||||
// intermediate HashSet in RAM:
|
// intermediate HashSet in RAM:
|
||||||
return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
|
return XSpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
|
||||||
}
|
}
|
||||||
|
|
||||||
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
|
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
|
||||||
|
|
|
@ -219,7 +219,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
|
||||||
}
|
}
|
||||||
|
|
||||||
Automaton toLevenshteinAutomata(Automaton automaton) {
|
Automaton toLevenshteinAutomata(Automaton automaton) {
|
||||||
final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
|
final Set<IntsRef> ref = XSpecialOperations.getFiniteStrings(automaton, -1);
|
||||||
Automaton subs[] = new Automaton[ref.size()];
|
Automaton subs[] = new Automaton[ref.size()];
|
||||||
int upto = 0;
|
int upto = 0;
|
||||||
for (IntsRef path : ref) {
|
for (IntsRef path : ref) {
|
||||||
|
|
|
@ -0,0 +1,200 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.search.suggest.analyzing;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.IdentityHashMap;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
|
import org.apache.lucene.util.automaton.Automaton;
|
||||||
|
import org.apache.lucene.util.automaton.State;
|
||||||
|
import org.apache.lucene.util.automaton.Transition;
|
||||||
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
import org.elasticsearch.Version;
|
||||||
|
|
||||||
|
class XSpecialOperations {
|
||||||
|
|
||||||
|
// TODO Lucene 4.9: remove this once we upgrade; see
|
||||||
|
// LUCENE-5628
|
||||||
|
|
||||||
|
static {
|
||||||
|
assert Version.CURRENT.luceneVersion == org.apache.lucene.util.Version.LUCENE_48: "Remove this code once we upgrade to Lucene 4.9 where LUCENE-5628 is fixed";
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class PathNode {
|
||||||
|
|
||||||
|
/** Which state the path node ends on, whose
|
||||||
|
* transitions we are enumerating. */
|
||||||
|
public State state;
|
||||||
|
|
||||||
|
/** Which state the current transition leads to. */
|
||||||
|
public State to;
|
||||||
|
|
||||||
|
/** Which transition we are on. */
|
||||||
|
public int transition;
|
||||||
|
|
||||||
|
/** Which label we are on, in the min-max range of the
|
||||||
|
* current Transition */
|
||||||
|
public int label;
|
||||||
|
|
||||||
|
public void resetState(State state) {
|
||||||
|
assert state.numTransitions() != 0;
|
||||||
|
this.state = state;
|
||||||
|
transition = 0;
|
||||||
|
Transition t = state.transitionsArray[transition];
|
||||||
|
label = t.getMin();
|
||||||
|
to = t.getDest();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns next label of current transition, or
|
||||||
|
* advances to next transition and returns its first
|
||||||
|
* label, if current one is exhausted. If there are
|
||||||
|
* no more transitions, returns -1. */
|
||||||
|
public int nextLabel() {
|
||||||
|
if (label > state.transitionsArray[transition].getMax()) {
|
||||||
|
// We've exhausted the current transition's labels;
|
||||||
|
// move to the next transition:
|
||||||
|
transition++;
|
||||||
|
if (transition >= state.numTransitions()) {
|
||||||
|
// We're done iterating transitions leaving this state
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
Transition t = state.transitionsArray[transition];
|
||||||
|
label = t.getMin();
|
||||||
|
to = t.getDest();
|
||||||
|
}
|
||||||
|
return label++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static PathNode getNode(PathNode[] nodes, int index) {
|
||||||
|
assert index < nodes.length;
|
||||||
|
if (nodes[index] == null) {
|
||||||
|
nodes[index] = new PathNode();
|
||||||
|
}
|
||||||
|
return nodes[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: this is a dangerous method ... Automaton could be
|
||||||
|
// huge ... and it's better in general for caller to
|
||||||
|
// enumerate & process in a single walk:
|
||||||
|
|
||||||
|
/** Returns the set of accepted strings, up to at most
|
||||||
|
* <code>limit</code> strings. If more than <code>limit</code>
|
||||||
|
* strings are accepted, the first limit strings found are returned. If <code>limit</code> == -1, then
|
||||||
|
* the limit is infinite. If the {@link Automaton} has
|
||||||
|
* cycles then this method might throw {@code
|
||||||
|
* IllegalArgumentException} but that is not guaranteed
|
||||||
|
* when the limit is set. */
|
||||||
|
public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) {
|
||||||
|
Set<IntsRef> results = new HashSet<>();
|
||||||
|
|
||||||
|
if (limit == -1 || limit > 0) {
|
||||||
|
// OK
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException("limit must be -1 (which means no limit), or > 0; got: " + limit);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (a.getSingleton() != null) {
|
||||||
|
// Easy case: automaton accepts only 1 string
|
||||||
|
results.add(Util.toUTF32(a.getSingleton(), new IntsRef()));
|
||||||
|
} else {
|
||||||
|
|
||||||
|
if (a.getInitialState().isAccept()) {
|
||||||
|
// Special case the empty string, as usual:
|
||||||
|
results.add(new IntsRef());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (a.getInitialState().numTransitions() > 0 && (limit == -1 || results.size() < limit)) {
|
||||||
|
|
||||||
|
// TODO: we could use state numbers here and just
|
||||||
|
// alloc array, but asking for states array can be
|
||||||
|
// costly (it's lazily computed):
|
||||||
|
|
||||||
|
// Tracks which states are in the current path, for
|
||||||
|
// cycle detection:
|
||||||
|
Set<State> pathStates = Collections.newSetFromMap(new IdentityHashMap<State,Boolean>());
|
||||||
|
|
||||||
|
// Stack to hold our current state in the
|
||||||
|
// recursion/iteration:
|
||||||
|
PathNode[] nodes = new PathNode[4];
|
||||||
|
|
||||||
|
pathStates.add(a.getInitialState());
|
||||||
|
PathNode root = getNode(nodes, 0);
|
||||||
|
root.resetState(a.getInitialState());
|
||||||
|
|
||||||
|
IntsRef string = new IntsRef(1);
|
||||||
|
string.length = 1;
|
||||||
|
|
||||||
|
while (string.length > 0) {
|
||||||
|
|
||||||
|
PathNode node = nodes[string.length-1];
|
||||||
|
|
||||||
|
// Get next label leaving the current node:
|
||||||
|
int label = node.nextLabel();
|
||||||
|
|
||||||
|
if (label != -1) {
|
||||||
|
string.ints[string.length-1] = label;
|
||||||
|
|
||||||
|
if (node.to.isAccept()) {
|
||||||
|
// This transition leads to an accept state,
|
||||||
|
// so we save the current string:
|
||||||
|
results.add(IntsRef.deepCopyOf(string));
|
||||||
|
if (results.size() == limit) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (node.to.numTransitions() != 0) {
|
||||||
|
// Now recurse: the destination of this transition has
|
||||||
|
// outgoing transitions:
|
||||||
|
if (pathStates.contains(node.to)) {
|
||||||
|
throw new IllegalArgumentException("automaton has cycles");
|
||||||
|
}
|
||||||
|
pathStates.add(node.to);
|
||||||
|
|
||||||
|
// Push node onto stack:
|
||||||
|
if (nodes.length == string.length) {
|
||||||
|
PathNode[] newNodes = new PathNode[ArrayUtil.oversize(nodes.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||||
|
System.arraycopy(nodes, 0, newNodes, 0, nodes.length);
|
||||||
|
nodes = newNodes;
|
||||||
|
}
|
||||||
|
getNode(nodes, string.length).resetState(node.to);
|
||||||
|
string.length++;
|
||||||
|
string.grow(string.length);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No more transitions leaving this state,
|
||||||
|
// pop/return back to previous state:
|
||||||
|
assert pathStates.contains(node.state);
|
||||||
|
pathStates.remove(node.state);
|
||||||
|
string.length--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
}
|
|
@ -126,13 +126,6 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
|
||||||
final boolean usePayloads = getRandom().nextBoolean();
|
final boolean usePayloads = getRandom().nextBoolean();
|
||||||
final int options = preserveSeparators ? AnalyzingSuggester.PRESERVE_SEP : 0;
|
final int options = preserveSeparators ? AnalyzingSuggester.PRESERVE_SEP : 0;
|
||||||
|
|
||||||
// NOTE: remove once we fix getFiniteStrings to not
|
|
||||||
// recurse; this is just a stopgap to mute the test:
|
|
||||||
// This test fails on Java8, I think because that
|
|
||||||
// version allocates less stack in the Jenkins envs
|
|
||||||
// where we run tests
|
|
||||||
assumeFalse(Constants.JRE_IS_MINIMUM_JAVA8);
|
|
||||||
|
|
||||||
XAnalyzingSuggester reference = new XAnalyzingSuggester(new StandardAnalyzer(TEST_VERSION_CURRENT), null, new StandardAnalyzer(
|
XAnalyzingSuggester reference = new XAnalyzingSuggester(new StandardAnalyzer(TEST_VERSION_CURRENT), null, new StandardAnalyzer(
|
||||||
TEST_VERSION_CURRENT), options, 256, -1, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
|
TEST_VERSION_CURRENT), options, 256, -1, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
|
||||||
LineFileDocs docs = new LineFileDocs(getRandom());
|
LineFileDocs docs = new LineFileDocs(getRandom());
|
||||||
|
|
Loading…
Reference in New Issue