LUCENE-5628: change getFiniteStrings to iterative impl

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1592362 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2014-05-04 15:47:59 +00:00
parent 6e9cbf3986
commit 8c4e826818
5 changed files with 387 additions and 55 deletions

View File

@ -152,6 +152,12 @@ Bug fixes
* LUCENE-5639: Fix PositionLengthAttribute implementation in Token class.
(Uwe Schindler, Robert Muir)
* LUCENE-5628: Change getFiniteStrings to iterative not recursive
implementation, so that building suggesters on a long suggestion
doesn't risk overflowing the stack; previously it consumed one Java
stack frame per character in the expanded suggestion (Robert Muir,
Simon Willnauer, Mike McCandless).
Test Framework
* LUCENE-5622: Fail tests if they print over the given limit of bytes to

View File

@ -30,12 +30,16 @@
package org.apache.lucene.util.automaton;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Set;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.Util;
/**
@ -212,57 +216,159 @@ final public class SpecialOperations {
return accept;
}
private static class PathNode {
/** Which state the path node ends on, whose
* transitions we are enumerating. */
public State state;
/** Which state the current transition leads to. */
public State to;
/** Which transition we are on. */
public int transition;
/** Which label we are on, in the min-max range of the
* current Transition */
public int label;
public void resetState(State state) {
assert state.numTransitions() != 0;
this.state = state;
transition = 0;
Transition t = state.transitionsArray[transition];
label = t.min;
to = t.to;
}
/** Returns next label of current transition, or
* advances to next transition and returns its first
* label, if current one is exhausted. If there are
* no more transitions, returns -1. */
public int nextLabel() {
if (label > state.transitionsArray[transition].max) {
// We've exhaused the current transition's labels;
// move to next transitions:
transition++;
if (transition >= state.numTransitions()) {
// We're done iterating transitions leaving this state
return -1;
}
Transition t = state.transitionsArray[transition];
label = t.min;
to = t.to;
}
return label++;
}
}
private static PathNode getNode(PathNode[] nodes, int index) {
assert index < nodes.length;
if (nodes[index] == null) {
nodes[index] = new PathNode();
}
return nodes[index];
}
// TODO: this is a dangerous method ... Automaton could be
// huge ... and it's better in general for caller to
// enumerate & process in a single walk:
/**
* Returns the set of accepted strings, assuming that at most
* <code>limit</code> strings are accepted. If more than <code>limit</code>
* strings are accepted, the first limit strings found are returned. If <code>limit</code>&lt;0, then
* the limit is infinite.
*/
/** Returns the set of accepted strings, up to at most
* <code>limit</code> strings. If more than <code>limit</code>
* strings are accepted, the first limit strings found are returned. If <code>limit</code> == -1, then
* the limit is infinite. If the {@link Automaton} has
* cycles then this method might throw {@code
* IllegalArgumentException} but that is not guaranteed
* when the limit is set. */
public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) {
HashSet<IntsRef> strings = new HashSet<>();
if (a.isSingleton()) {
if (limit > 0) {
strings.add(Util.toUTF32(a.singleton, new IntsRef()));
}
} else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit)) {
return strings;
Set<IntsRef> results = new HashSet<>();
if (limit == -1 || limit > 0) {
// OK
} else {
throw new IllegalArgumentException("limit must be -1 (which means no limit), or > 0; got: " + limit);
}
return strings;
}
/**
* Returns the strings that can be produced from the given state, or
* false if more than <code>limit</code> strings are found.
* <code>limit</code>&lt;0 means "infinite".
*/
private static boolean getFiniteStrings(State s, HashSet<State> pathstates,
HashSet<IntsRef> strings, IntsRef path, int limit) {
pathstates.add(s);
for (Transition t : s.getTransitions()) {
if (pathstates.contains(t.to)) {
return false;
if (a.isSingleton()) {
// Easy case: automaton accepts only 1 string
results.add(Util.toUTF32(a.singleton, new IntsRef()));
} else {
if (a.initial.accept) {
// Special case the empty string, as usual:
results.add(new IntsRef());
}
for (int n = t.min; n <= t.max; n++) {
path.grow(path.length+1);
path.ints[path.length] = n;
path.length++;
if (t.to.accept) {
strings.add(IntsRef.deepCopyOf(path));
if (limit >= 0 && strings.size() > limit) {
return false;
if (a.initial.numTransitions() > 0 && (limit == -1 || results.size() < limit)) {
// TODO: we could use state numbers here and just
// alloc array, but asking for states array can be
// costly (it's lazily computed):
// Tracks which states are in the current path, for
// cycle detection:
Set<State> pathStates = Collections.newSetFromMap(new IdentityHashMap<State,Boolean>());
// Stack to hold our current state in the
// recursion/iteration:
PathNode[] nodes = new PathNode[4];
pathStates.add(a.initial);
PathNode root = getNode(nodes, 0);
root.resetState(a.initial);
IntsRef string = new IntsRef(1);
string.length = 1;
while (string.length > 0) {
PathNode node = nodes[string.length-1];
// Get next label leaving the current node:
int label = node.nextLabel();
if (label != -1) {
string.ints[string.length-1] = label;
if (node.to.accept) {
// This transition leads to an accept state,
// so we save the current string:
results.add(IntsRef.deepCopyOf(string));
if (results.size() == limit) {
break;
}
}
if (node.to.numTransitions() != 0) {
// Now recurse: the destination of this transition has
// outgoing transitions:
if (pathStates.contains(node.to)) {
throw new IllegalArgumentException("automaton has cycles");
}
pathStates.add(node.to);
// Push node onto stack:
if (nodes.length == string.length) {
PathNode[] newNodes = new PathNode[ArrayUtil.oversize(nodes.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(nodes, 0, newNodes, 0, nodes.length);
nodes = newNodes;
}
getNode(nodes, string.length).resetState(node.to);
string.length++;
string.grow(string.length);
}
} else {
// No more transitions leaving this state,
// pop/return back to previous state:
assert pathStates.contains(node.state);
pathStates.remove(node.state);
string.length--;
}
}
if (!getFiniteStrings(t.to, pathstates, strings, path, limit)) {
return false;
}
path.length--;
}
}
pathstates.remove(s);
return true;
return results;
}
}

View File

@ -277,9 +277,4 @@ public class State implements Comparable<State> {
public int compareTo(State s) {
return s.id - id;
}
@Override
public int hashCode() {
return id;
}
}

View File

@ -1,12 +1,5 @@
package org.apache.lucene.util.automaton;
import java.util.Set;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.fst.Util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -24,6 +17,18 @@ import org.apache.lucene.util.fst.Util;
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.fst.Util;
public class TestSpecialOperations extends LuceneTestCase {
/**
* tests against the original brics implementation.
@ -36,14 +41,24 @@ public class TestSpecialOperations extends LuceneTestCase {
assertEquals(AutomatonTestUtil.isFiniteSlow(a), SpecialOperations.isFinite(b));
}
}
/** Pass false for testRecursive if the expected strings
* may be too long */
private Set<IntsRef> getFiniteStrings(Automaton a, int limit, boolean testRecursive) {
Set<IntsRef> result = SpecialOperations.getFiniteStrings(a, limit);
if (testRecursive) {
assertEquals(AutomatonTestUtil.getFiniteStringsRecursive(a, limit), result);
}
return result;
}
/**
* Basic test for getFiniteStrings
*/
public void testFiniteStrings() {
public void testFiniteStringsBasic() {
Automaton a = BasicOperations.union(BasicAutomata.makeString("dog"), BasicAutomata.makeString("duck"));
MinimizationOperations.minimize(a);
Set<IntsRef> strings = SpecialOperations.getFiniteStrings(a, -1);
Set<IntsRef> strings = getFiniteStrings(a, -1, true);
assertEquals(2, strings.size());
IntsRef dog = new IntsRef();
Util.toIntsRef(new BytesRef("dog"), dog);
@ -52,4 +67,156 @@ public class TestSpecialOperations extends LuceneTestCase {
Util.toIntsRef(new BytesRef("duck"), duck);
assertTrue(strings.contains(duck));
}
public void testFiniteStringsEatsStack() {
char[] chars = new char[50000];
TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
String bigString1 = new String(chars);
TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
String bigString2 = new String(chars);
Automaton a = BasicOperations.union(BasicAutomata.makeString(bigString1), BasicAutomata.makeString(bigString2));
Set<IntsRef> strings = getFiniteStrings(a, -1, false);
assertEquals(2, strings.size());
IntsRef scratch = new IntsRef();
Util.toUTF32(bigString1.toCharArray(), 0, bigString1.length(), scratch);
assertTrue(strings.contains(scratch));
Util.toUTF32(bigString2.toCharArray(), 0, bigString2.length(), scratch);
assertTrue(strings.contains(scratch));
}
public void testRandomFiniteStrings1() {
int numStrings = atLeast(500);
if (VERBOSE) {
System.out.println("TEST: numStrings=" + numStrings);
}
Set<IntsRef> strings = new HashSet<IntsRef>();
List<Automaton> automata = new ArrayList<Automaton>();
for(int i=0;i<numStrings;i++) {
String s = TestUtil.randomSimpleString(random(), 1, 200);
automata.add(BasicAutomata.makeString(s));
IntsRef scratch = new IntsRef();
Util.toUTF32(s.toCharArray(), 0, s.length(), scratch);
strings.add(scratch);
if (VERBOSE) {
System.out.println(" add string=" + s);
}
}
// TODO: we could sometimes use
// DaciukMihovAutomatonBuilder here
// TODO: what other random things can we do here...
Automaton a = BasicOperations.union(automata);
if (random().nextBoolean()) {
Automaton.minimize(a);
if (VERBOSE) {
System.out.println("TEST: a.minimize numStates=" + a.getNumberOfStates());
}
} else if (random().nextBoolean()) {
if (VERBOSE) {
System.out.println("TEST: a.determinize");
}
a.determinize();
} else if (random().nextBoolean()) {
if (VERBOSE) {
System.out.println("TEST: a.reduce");
}
a.reduce();
} else if (random().nextBoolean()) {
if (VERBOSE) {
System.out.println("TEST: a.getNumberedStates");
}
a.getNumberedStates();
}
Set<IntsRef> actual = getFiniteStrings(a, -1, true);
if (strings.equals(actual) == false) {
System.out.println("strings.size()=" + strings.size() + " actual.size=" + actual.size());
List<IntsRef> x = new ArrayList<>(strings);
Collections.sort(x);
List<IntsRef> y = new ArrayList<>(actual);
Collections.sort(y);
int end = Math.min(x.size(), y.size());
for(int i=0;i<end;i++) {
System.out.println(" i=" + i + " string=" + toString(x.get(i)) + " actual=" + toString(y.get(i)));
}
fail("wrong strings found");
}
}
// ascii only!
private static String toString(IntsRef ints) {
BytesRef br = new BytesRef(ints.length);
for(int i=0;i<ints.length;i++) {
br.bytes[i] = (byte) ints.ints[i];
}
br.length = ints.length;
return br.utf8ToString();
}
public void testWithCycle() throws Exception {
try {
SpecialOperations.getFiniteStrings(new RegExp("abc.*", RegExp.NONE).toAutomaton(), -1);
fail("did not hit exception");
} catch (IllegalArgumentException iae) {
// expected
}
}
public void testRandomFiniteStrings2() {
// Just makes sure we can run on any random finite
// automaton:
int iters = atLeast(100);
for(int i=0;i<iters;i++) {
Automaton a = AutomatonTestUtil.randomAutomaton(random());
try {
// Must pass a limit because the random automaton
// can accept MANY strings:
SpecialOperations.getFiniteStrings(a, TestUtil.nextInt(random(), 1, 1000));
// NOTE: cannot do this, because the method is not
// guaranteed to detect cycles when you have a limit
//assertTrue(SpecialOperations.isFinite(a));
} catch (IllegalArgumentException iae) {
assertFalse(SpecialOperations.isFinite(a));
}
}
}
public void testInvalidLimit() {
Automaton a = AutomatonTestUtil.randomAutomaton(random());
try {
SpecialOperations.getFiniteStrings(a, -7);
fail("did not hit exception");
} catch (IllegalArgumentException iae) {
// expected
}
}
public void testInvalidLimit2() {
Automaton a = AutomatonTestUtil.randomAutomaton(random());
try {
SpecialOperations.getFiniteStrings(a, 0);
fail("did not hit exception");
} catch (IllegalArgumentException iae) {
// expected
}
}
public void testSingletonNoLimit() {
Set<IntsRef> result = SpecialOperations.getFiniteStrings(BasicAutomata.makeString("foobar"), -1);
assertEquals(1, result.size());
IntsRef scratch = new IntsRef();
Util.toUTF32("foobar".toCharArray(), 0, 6, scratch);
assertTrue(result.contains(scratch));
}
public void testSingletonLimit1() {
Set<IntsRef> result = SpecialOperations.getFiniteStrings(BasicAutomata.makeString("foobar"), 1);
assertEquals(1, result.size());
IntsRef scratch = new IntsRef();
Util.toUTF32("foobar".toCharArray(), 0, 6, scratch);
assertTrue(result.contains(scratch));
}
}

View File

@ -28,8 +28,10 @@ import java.util.Random;
import java.util.Set;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.Util;
/**
* Utilities for testing automata.
@ -387,6 +389,62 @@ public class AutomatonTestUtil {
a.removeDeadTransitions();
}
/**
* Simple, original implementation of getFiniteStrings.
*
* <p>Returns the set of accepted strings, assuming that at most
* <code>limit</code> strings are accepted. If more than <code>limit</code>
* strings are accepted, the first limit strings found are returned. If <code>limit</code>&lt;0, then
* the limit is infinite.
*
* <p>This implementation is recursive: it uses one stack
* frame for each digit in the returned strings (ie, max
* is the max length returned string).
*/
public static Set<IntsRef> getFiniteStringsRecursive(Automaton a, int limit) {
HashSet<IntsRef> strings = new HashSet<>();
if (a.isSingleton()) {
if (limit > 0) {
strings.add(Util.toUTF32(a.singleton, new IntsRef()));
}
} else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit)) {
return strings;
}
return strings;
}
/**
* Returns the strings that can be produced from the given state, or
* false if more than <code>limit</code> strings are found.
* <code>limit</code>&lt;0 means "infinite".
*/
private static boolean getFiniteStrings(State s, HashSet<State> pathstates,
HashSet<IntsRef> strings, IntsRef path, int limit) {
pathstates.add(s);
for (Transition t : s.getTransitions()) {
if (pathstates.contains(t.to)) {
return false;
}
for (int n = t.min; n <= t.max; n++) {
path.grow(path.length+1);
path.ints[path.length] = n;
path.length++;
if (t.to.accept) {
strings.add(IntsRef.deepCopyOf(path));
if (limit >= 0 && strings.size() > limit) {
return false;
}
}
if (!getFiniteStrings(t.to, pathstates, strings, path, limit)) {
return false;
}
path.length--;
}
}
pathstates.remove(s);
return true;
}
/**
* Returns true if the language of this automaton is finite.
* <p>