mirror of https://github.com/apache/lucene.git
LUCENE-5628: change getFiniteStrings to iterative impl
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1592362 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6e9cbf3986
commit
8c4e826818
|
@ -152,6 +152,12 @@ Bug fixes
|
|||
* LUCENE-5639: Fix PositionLengthAttribute implementation in Token class.
|
||||
(Uwe Schindler, Robert Muir)
|
||||
|
||||
* LUCENE-5628: Change getFiniteStrings to iterative not recursive
|
||||
implementation, so that building suggesters on a long suggestion
|
||||
doesn't risk overflowing the stack; previously it consumed one Java
|
||||
stack frame per character in the expanded suggestion (Robert Muir,
|
||||
Simon Willnauer, Mike McCandless).
|
||||
|
||||
Test Framework
|
||||
|
||||
* LUCENE-5622: Fail tests if they print over the given limit of bytes to
|
||||
|
|
|
@ -30,12 +30,16 @@
|
|||
package org.apache.lucene.util.automaton;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
/**
|
||||
|
@ -212,57 +216,159 @@ final public class SpecialOperations {
|
|||
return accept;
|
||||
}
|
||||
|
||||
private static class PathNode {
|
||||
|
||||
/** Which state the path node ends on, whose
|
||||
* transitions we are enumerating. */
|
||||
public State state;
|
||||
|
||||
/** Which state the current transition leads to. */
|
||||
public State to;
|
||||
|
||||
/** Which transition we are on. */
|
||||
public int transition;
|
||||
|
||||
/** Which label we are on, in the min-max range of the
|
||||
* current Transition */
|
||||
public int label;
|
||||
|
||||
public void resetState(State state) {
|
||||
assert state.numTransitions() != 0;
|
||||
this.state = state;
|
||||
transition = 0;
|
||||
Transition t = state.transitionsArray[transition];
|
||||
label = t.min;
|
||||
to = t.to;
|
||||
}
|
||||
|
||||
/** Returns next label of current transition, or
|
||||
* advances to next transition and returns its first
|
||||
* label, if current one is exhausted. If there are
|
||||
* no more transitions, returns -1. */
|
||||
public int nextLabel() {
|
||||
if (label > state.transitionsArray[transition].max) {
|
||||
// We've exhaused the current transition's labels;
|
||||
// move to next transitions:
|
||||
transition++;
|
||||
if (transition >= state.numTransitions()) {
|
||||
// We're done iterating transitions leaving this state
|
||||
return -1;
|
||||
}
|
||||
Transition t = state.transitionsArray[transition];
|
||||
label = t.min;
|
||||
to = t.to;
|
||||
}
|
||||
return label++;
|
||||
}
|
||||
}
|
||||
|
||||
private static PathNode getNode(PathNode[] nodes, int index) {
|
||||
assert index < nodes.length;
|
||||
if (nodes[index] == null) {
|
||||
nodes[index] = new PathNode();
|
||||
}
|
||||
return nodes[index];
|
||||
}
|
||||
|
||||
// TODO: this is a dangerous method ... Automaton could be
|
||||
// huge ... and it's better in general for caller to
|
||||
// enumerate & process in a single walk:
|
||||
|
||||
/**
|
||||
* Returns the set of accepted strings, assuming that at most
|
||||
* <code>limit</code> strings are accepted. If more than <code>limit</code>
|
||||
* strings are accepted, the first limit strings found are returned. If <code>limit</code><0, then
|
||||
* the limit is infinite.
|
||||
*/
|
||||
/** Returns the set of accepted strings, up to at most
|
||||
* <code>limit</code> strings. If more than <code>limit</code>
|
||||
* strings are accepted, the first limit strings found are returned. If <code>limit</code> == -1, then
|
||||
* the limit is infinite. If the {@link Automaton} has
|
||||
* cycles then this method might throw {@code
|
||||
* IllegalArgumentException} but that is not guaranteed
|
||||
* when the limit is set. */
|
||||
public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) {
|
||||
HashSet<IntsRef> strings = new HashSet<>();
|
||||
if (a.isSingleton()) {
|
||||
if (limit > 0) {
|
||||
strings.add(Util.toUTF32(a.singleton, new IntsRef()));
|
||||
}
|
||||
} else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit)) {
|
||||
return strings;
|
||||
Set<IntsRef> results = new HashSet<>();
|
||||
|
||||
if (limit == -1 || limit > 0) {
|
||||
// OK
|
||||
} else {
|
||||
throw new IllegalArgumentException("limit must be -1 (which means no limit), or > 0; got: " + limit);
|
||||
}
|
||||
return strings;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the strings that can be produced from the given state, or
|
||||
* false if more than <code>limit</code> strings are found.
|
||||
* <code>limit</code><0 means "infinite".
|
||||
*/
|
||||
private static boolean getFiniteStrings(State s, HashSet<State> pathstates,
|
||||
HashSet<IntsRef> strings, IntsRef path, int limit) {
|
||||
pathstates.add(s);
|
||||
for (Transition t : s.getTransitions()) {
|
||||
if (pathstates.contains(t.to)) {
|
||||
return false;
|
||||
|
||||
if (a.isSingleton()) {
|
||||
// Easy case: automaton accepts only 1 string
|
||||
results.add(Util.toUTF32(a.singleton, new IntsRef()));
|
||||
} else {
|
||||
|
||||
if (a.initial.accept) {
|
||||
// Special case the empty string, as usual:
|
||||
results.add(new IntsRef());
|
||||
}
|
||||
for (int n = t.min; n <= t.max; n++) {
|
||||
path.grow(path.length+1);
|
||||
path.ints[path.length] = n;
|
||||
path.length++;
|
||||
if (t.to.accept) {
|
||||
strings.add(IntsRef.deepCopyOf(path));
|
||||
if (limit >= 0 && strings.size() > limit) {
|
||||
return false;
|
||||
|
||||
if (a.initial.numTransitions() > 0 && (limit == -1 || results.size() < limit)) {
|
||||
|
||||
// TODO: we could use state numbers here and just
|
||||
// alloc array, but asking for states array can be
|
||||
// costly (it's lazily computed):
|
||||
|
||||
// Tracks which states are in the current path, for
|
||||
// cycle detection:
|
||||
Set<State> pathStates = Collections.newSetFromMap(new IdentityHashMap<State,Boolean>());
|
||||
|
||||
// Stack to hold our current state in the
|
||||
// recursion/iteration:
|
||||
PathNode[] nodes = new PathNode[4];
|
||||
|
||||
pathStates.add(a.initial);
|
||||
PathNode root = getNode(nodes, 0);
|
||||
root.resetState(a.initial);
|
||||
|
||||
IntsRef string = new IntsRef(1);
|
||||
string.length = 1;
|
||||
|
||||
while (string.length > 0) {
|
||||
|
||||
PathNode node = nodes[string.length-1];
|
||||
|
||||
// Get next label leaving the current node:
|
||||
int label = node.nextLabel();
|
||||
|
||||
if (label != -1) {
|
||||
string.ints[string.length-1] = label;
|
||||
|
||||
if (node.to.accept) {
|
||||
// This transition leads to an accept state,
|
||||
// so we save the current string:
|
||||
results.add(IntsRef.deepCopyOf(string));
|
||||
if (results.size() == limit) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (node.to.numTransitions() != 0) {
|
||||
// Now recurse: the destination of this transition has
|
||||
// outgoing transitions:
|
||||
if (pathStates.contains(node.to)) {
|
||||
throw new IllegalArgumentException("automaton has cycles");
|
||||
}
|
||||
pathStates.add(node.to);
|
||||
|
||||
// Push node onto stack:
|
||||
if (nodes.length == string.length) {
|
||||
PathNode[] newNodes = new PathNode[ArrayUtil.oversize(nodes.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
System.arraycopy(nodes, 0, newNodes, 0, nodes.length);
|
||||
nodes = newNodes;
|
||||
}
|
||||
getNode(nodes, string.length).resetState(node.to);
|
||||
string.length++;
|
||||
string.grow(string.length);
|
||||
}
|
||||
} else {
|
||||
// No more transitions leaving this state,
|
||||
// pop/return back to previous state:
|
||||
assert pathStates.contains(node.state);
|
||||
pathStates.remove(node.state);
|
||||
string.length--;
|
||||
}
|
||||
}
|
||||
if (!getFiniteStrings(t.to, pathstates, strings, path, limit)) {
|
||||
return false;
|
||||
}
|
||||
path.length--;
|
||||
}
|
||||
}
|
||||
pathstates.remove(s);
|
||||
return true;
|
||||
|
||||
return results;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -277,9 +277,4 @@ public class State implements Comparable<State> {
|
|||
public int compareTo(State s) {
|
||||
return s.id - id;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return id;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,12 +1,5 @@
|
|||
package org.apache.lucene.util.automaton;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -24,6 +17,18 @@ import org.apache.lucene.util.fst.Util;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
public class TestSpecialOperations extends LuceneTestCase {
|
||||
/**
|
||||
* tests against the original brics implementation.
|
||||
|
@ -36,14 +41,24 @@ public class TestSpecialOperations extends LuceneTestCase {
|
|||
assertEquals(AutomatonTestUtil.isFiniteSlow(a), SpecialOperations.isFinite(b));
|
||||
}
|
||||
}
|
||||
|
||||
/** Pass false for testRecursive if the expected strings
|
||||
* may be too long */
|
||||
private Set<IntsRef> getFiniteStrings(Automaton a, int limit, boolean testRecursive) {
|
||||
Set<IntsRef> result = SpecialOperations.getFiniteStrings(a, limit);
|
||||
if (testRecursive) {
|
||||
assertEquals(AutomatonTestUtil.getFiniteStringsRecursive(a, limit), result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Basic test for getFiniteStrings
|
||||
*/
|
||||
public void testFiniteStrings() {
|
||||
public void testFiniteStringsBasic() {
|
||||
Automaton a = BasicOperations.union(BasicAutomata.makeString("dog"), BasicAutomata.makeString("duck"));
|
||||
MinimizationOperations.minimize(a);
|
||||
Set<IntsRef> strings = SpecialOperations.getFiniteStrings(a, -1);
|
||||
Set<IntsRef> strings = getFiniteStrings(a, -1, true);
|
||||
assertEquals(2, strings.size());
|
||||
IntsRef dog = new IntsRef();
|
||||
Util.toIntsRef(new BytesRef("dog"), dog);
|
||||
|
@ -52,4 +67,156 @@ public class TestSpecialOperations extends LuceneTestCase {
|
|||
Util.toIntsRef(new BytesRef("duck"), duck);
|
||||
assertTrue(strings.contains(duck));
|
||||
}
|
||||
|
||||
public void testFiniteStringsEatsStack() {
|
||||
char[] chars = new char[50000];
|
||||
TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
|
||||
String bigString1 = new String(chars);
|
||||
TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length);
|
||||
String bigString2 = new String(chars);
|
||||
Automaton a = BasicOperations.union(BasicAutomata.makeString(bigString1), BasicAutomata.makeString(bigString2));
|
||||
Set<IntsRef> strings = getFiniteStrings(a, -1, false);
|
||||
assertEquals(2, strings.size());
|
||||
IntsRef scratch = new IntsRef();
|
||||
Util.toUTF32(bigString1.toCharArray(), 0, bigString1.length(), scratch);
|
||||
assertTrue(strings.contains(scratch));
|
||||
Util.toUTF32(bigString2.toCharArray(), 0, bigString2.length(), scratch);
|
||||
assertTrue(strings.contains(scratch));
|
||||
}
|
||||
|
||||
public void testRandomFiniteStrings1() {
|
||||
|
||||
int numStrings = atLeast(500);
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: numStrings=" + numStrings);
|
||||
}
|
||||
|
||||
Set<IntsRef> strings = new HashSet<IntsRef>();
|
||||
List<Automaton> automata = new ArrayList<Automaton>();
|
||||
for(int i=0;i<numStrings;i++) {
|
||||
String s = TestUtil.randomSimpleString(random(), 1, 200);
|
||||
automata.add(BasicAutomata.makeString(s));
|
||||
IntsRef scratch = new IntsRef();
|
||||
Util.toUTF32(s.toCharArray(), 0, s.length(), scratch);
|
||||
strings.add(scratch);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" add string=" + s);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: we could sometimes use
|
||||
// DaciukMihovAutomatonBuilder here
|
||||
|
||||
// TODO: what other random things can we do here...
|
||||
Automaton a = BasicOperations.union(automata);
|
||||
if (random().nextBoolean()) {
|
||||
Automaton.minimize(a);
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: a.minimize numStates=" + a.getNumberOfStates());
|
||||
}
|
||||
} else if (random().nextBoolean()) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: a.determinize");
|
||||
}
|
||||
a.determinize();
|
||||
} else if (random().nextBoolean()) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: a.reduce");
|
||||
}
|
||||
a.reduce();
|
||||
} else if (random().nextBoolean()) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: a.getNumberedStates");
|
||||
}
|
||||
a.getNumberedStates();
|
||||
}
|
||||
|
||||
Set<IntsRef> actual = getFiniteStrings(a, -1, true);
|
||||
if (strings.equals(actual) == false) {
|
||||
System.out.println("strings.size()=" + strings.size() + " actual.size=" + actual.size());
|
||||
List<IntsRef> x = new ArrayList<>(strings);
|
||||
Collections.sort(x);
|
||||
List<IntsRef> y = new ArrayList<>(actual);
|
||||
Collections.sort(y);
|
||||
int end = Math.min(x.size(), y.size());
|
||||
for(int i=0;i<end;i++) {
|
||||
System.out.println(" i=" + i + " string=" + toString(x.get(i)) + " actual=" + toString(y.get(i)));
|
||||
}
|
||||
fail("wrong strings found");
|
||||
}
|
||||
}
|
||||
|
||||
// ascii only!
|
||||
private static String toString(IntsRef ints) {
|
||||
BytesRef br = new BytesRef(ints.length);
|
||||
for(int i=0;i<ints.length;i++) {
|
||||
br.bytes[i] = (byte) ints.ints[i];
|
||||
}
|
||||
br.length = ints.length;
|
||||
return br.utf8ToString();
|
||||
}
|
||||
|
||||
public void testWithCycle() throws Exception {
|
||||
try {
|
||||
SpecialOperations.getFiniteStrings(new RegExp("abc.*", RegExp.NONE).toAutomaton(), -1);
|
||||
fail("did not hit exception");
|
||||
} catch (IllegalArgumentException iae) {
|
||||
// expected
|
||||
}
|
||||
}
|
||||
|
||||
public void testRandomFiniteStrings2() {
|
||||
// Just makes sure we can run on any random finite
|
||||
// automaton:
|
||||
int iters = atLeast(100);
|
||||
for(int i=0;i<iters;i++) {
|
||||
Automaton a = AutomatonTestUtil.randomAutomaton(random());
|
||||
try {
|
||||
// Must pass a limit because the random automaton
|
||||
// can accept MANY strings:
|
||||
SpecialOperations.getFiniteStrings(a, TestUtil.nextInt(random(), 1, 1000));
|
||||
// NOTE: cannot do this, because the method is not
|
||||
// guaranteed to detect cycles when you have a limit
|
||||
//assertTrue(SpecialOperations.isFinite(a));
|
||||
} catch (IllegalArgumentException iae) {
|
||||
assertFalse(SpecialOperations.isFinite(a));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testInvalidLimit() {
|
||||
Automaton a = AutomatonTestUtil.randomAutomaton(random());
|
||||
try {
|
||||
SpecialOperations.getFiniteStrings(a, -7);
|
||||
fail("did not hit exception");
|
||||
} catch (IllegalArgumentException iae) {
|
||||
// expected
|
||||
}
|
||||
}
|
||||
|
||||
public void testInvalidLimit2() {
|
||||
Automaton a = AutomatonTestUtil.randomAutomaton(random());
|
||||
try {
|
||||
SpecialOperations.getFiniteStrings(a, 0);
|
||||
fail("did not hit exception");
|
||||
} catch (IllegalArgumentException iae) {
|
||||
// expected
|
||||
}
|
||||
}
|
||||
|
||||
public void testSingletonNoLimit() {
|
||||
Set<IntsRef> result = SpecialOperations.getFiniteStrings(BasicAutomata.makeString("foobar"), -1);
|
||||
assertEquals(1, result.size());
|
||||
IntsRef scratch = new IntsRef();
|
||||
Util.toUTF32("foobar".toCharArray(), 0, 6, scratch);
|
||||
assertTrue(result.contains(scratch));
|
||||
}
|
||||
|
||||
public void testSingletonLimit1() {
|
||||
Set<IntsRef> result = SpecialOperations.getFiniteStrings(BasicAutomata.makeString("foobar"), 1);
|
||||
assertEquals(1, result.size());
|
||||
IntsRef scratch = new IntsRef();
|
||||
Util.toUTF32("foobar".toCharArray(), 0, 6, scratch);
|
||||
assertTrue(result.contains(scratch));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,8 +28,10 @@ import java.util.Random;
|
|||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
/**
|
||||
* Utilities for testing automata.
|
||||
|
@ -387,6 +389,62 @@ public class AutomatonTestUtil {
|
|||
a.removeDeadTransitions();
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple, original implementation of getFiniteStrings.
|
||||
*
|
||||
* <p>Returns the set of accepted strings, assuming that at most
|
||||
* <code>limit</code> strings are accepted. If more than <code>limit</code>
|
||||
* strings are accepted, the first limit strings found are returned. If <code>limit</code><0, then
|
||||
* the limit is infinite.
|
||||
*
|
||||
* <p>This implementation is recursive: it uses one stack
|
||||
* frame for each digit in the returned strings (ie, max
|
||||
* is the max length returned string).
|
||||
*/
|
||||
public static Set<IntsRef> getFiniteStringsRecursive(Automaton a, int limit) {
|
||||
HashSet<IntsRef> strings = new HashSet<>();
|
||||
if (a.isSingleton()) {
|
||||
if (limit > 0) {
|
||||
strings.add(Util.toUTF32(a.singleton, new IntsRef()));
|
||||
}
|
||||
} else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit)) {
|
||||
return strings;
|
||||
}
|
||||
return strings;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the strings that can be produced from the given state, or
|
||||
* false if more than <code>limit</code> strings are found.
|
||||
* <code>limit</code><0 means "infinite".
|
||||
*/
|
||||
private static boolean getFiniteStrings(State s, HashSet<State> pathstates,
|
||||
HashSet<IntsRef> strings, IntsRef path, int limit) {
|
||||
pathstates.add(s);
|
||||
for (Transition t : s.getTransitions()) {
|
||||
if (pathstates.contains(t.to)) {
|
||||
return false;
|
||||
}
|
||||
for (int n = t.min; n <= t.max; n++) {
|
||||
path.grow(path.length+1);
|
||||
path.ints[path.length] = n;
|
||||
path.length++;
|
||||
if (t.to.accept) {
|
||||
strings.add(IntsRef.deepCopyOf(path));
|
||||
if (limit >= 0 && strings.size() > limit) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!getFiniteStrings(t.to, pathstates, strings, path, limit)) {
|
||||
return false;
|
||||
}
|
||||
path.length--;
|
||||
}
|
||||
}
|
||||
pathstates.remove(s);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the language of this automaton is finite.
|
||||
* <p>
|
||||
|
|
Loading…
Reference in New Issue