Remove Operations.isFinite (#11813)

This method is recursive: to avoid eating too much stack we apply a
small limit. This means it can't really be used on any largish automata
without hitting exception.

But the benefit of knowing finite vs infinite in AutomatonTermsEnum is
minor: let's not auto-compute this. FuzzyQuery still gets the finite
optimization because its finite by definition. PrefixQuery is always
infinite. Wildcard/Regex just assume infinite which is safe to do.

Remove the auto-computation and the "trillean" Boolean parameter. If you
dont know that your automaton is finite, pass false to
CompiledAutomaton, it is safe.

Move this method to AutomatonTestUtil so we can still use it in test
asserts.

Closes #11809
This commit is contained in:
Robert Muir 2022-09-24 10:51:04 -04:00 committed by GitHub
parent 54fba99cb1
commit 15f3743f02
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 90 additions and 120 deletions

View File

@ -31,6 +31,9 @@ API Changes
* LUCENE-10603: Remove SortedSetDocValues#NO_MORE_ORDS definition. (Greg Miller)
* GITHUB#11813: Remove Operations.isFinite: the recursive implementation could be problematic
for large automatons (WildcardQuery, PrefixQuery, RegExpQuery, etc). (taroplus, Robert Muir)
New Features
---------------------

View File

@ -28,7 +28,6 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
/** Converts an Automaton into a TokenStream. */
@ -42,14 +41,10 @@ public class AutomatonToTokenStream {
* position nodes for the TokenStream. The resulting TokenStream releases edges from the automaton
* as tokens in order from the position nodes. This requires the automaton be a finite DAG.
*
* @param automaton automaton to convert. Must be a finite DAG.
* @param automaton automaton to convert. Must be a finite DAG to avoid infinite loops!
* @return TokenStream representation of automaton.
*/
public static TokenStream toTokenStream(Automaton automaton) {
if (Operations.isFinite(automaton) == false) {
throw new IllegalArgumentException("Automaton must be finite");
}
List<List<Integer>> positionNodes = new ArrayList<>();
Transition[][] transitions = automaton.getSortedTransitions();

View File

@ -95,8 +95,7 @@ public class AutomatonQuery extends MultiTermQuery implements Accountable {
this.term = term;
this.automaton = automaton;
this.automatonIsBinary = isBinary;
// TODO: we could take isFinite too, to save a bit of CPU in CompiledAutomaton ctor?:
this.compiled = new CompiledAutomaton(automaton, null, true, isBinary);
this.compiled = new CompiledAutomaton(automaton, false, true, isBinary);
this.ramBytesUsed =
BASE_RAM_BYTES + term.ramBytesUsed() + automaton.ramBytesUsed() + compiled.ramBytesUsed();

View File

@ -96,20 +96,17 @@ public class CompiledAutomaton implements Accountable {
public final BytesRef commonSuffixRef;
/**
* Indicates if the automaton accepts a finite set of strings. Null if this was not computed. Only
* valid for {@link AUTOMATON_TYPE#NORMAL}.
* Indicates if the automaton accepts a finite set of strings. Only valid for {@link
* AUTOMATON_TYPE#NORMAL}.
*/
public final Boolean finite;
public final boolean finite;
/** Which state, if any, accepts all suffixes, else -1. */
public final int sinkState;
/**
* Create this, passing simplify=true and finite=null, so that we try to simplify the automaton
* and determine if it is finite.
*/
/** Create this, passing simplify=true, so that we try to simplify the automaton. */
public CompiledAutomaton(Automaton automaton) {
this(automaton, null, true);
this(automaton, false, true);
}
/** Returns sink state, if present, else -1. */
@ -139,21 +136,21 @@ public class CompiledAutomaton implements Accountable {
}
/**
* Create this. If finite is null, we use {@link Operations#isFinite} to determine whether it is
* finite. If simplify is true, we run possibly expensive operations to determine if the automaton
* is one the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}.
* Create this. If simplify is true, we run possibly expensive operations to determine if the
* automaton is one the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. Set finite to true if
* the automaton is finite, otherwise set to false if infinite or you don't know.
*/
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
public CompiledAutomaton(Automaton automaton, boolean finite, boolean simplify) {
this(automaton, finite, simplify, false);
}
/**
* Create this. If finite is null, we use {@link Operations#isFinite} to determine whether it is
* finite. If simplify is true, we run possibly expensive operations to determine if the automaton
* is one the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}.
* Create this. If simplify is true, we run possibly expensive operations to determine if the
* automaton is one the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. Set finite to true if
* the automaton is finite, otherwise set to false if infinite or you don't know.
*/
public CompiledAutomaton(
Automaton automaton, Boolean finite, boolean simplify, boolean isBinary) {
Automaton automaton, boolean finite, boolean simplify, boolean isBinary) {
if (automaton.getNumStates() == 0) {
automaton = new Automaton();
@ -174,7 +171,7 @@ public class CompiledAutomaton implements Accountable {
commonSuffixRef = null;
runAutomaton = null;
this.automaton = null;
this.finite = null;
this.finite = true;
sinkState = -1;
nfaRunAutomaton = null;
return;
@ -196,7 +193,7 @@ public class CompiledAutomaton implements Accountable {
commonSuffixRef = null;
runAutomaton = null;
this.automaton = null;
this.finite = null;
this.finite = false;
sinkState = -1;
nfaRunAutomaton = null;
return;
@ -210,7 +207,7 @@ public class CompiledAutomaton implements Accountable {
commonSuffixRef = null;
runAutomaton = null;
this.automaton = null;
this.finite = null;
this.finite = true;
if (isBinary) {
term = StringHelper.intsRefToBytesRef(singleton);
@ -228,11 +225,7 @@ public class CompiledAutomaton implements Accountable {
type = AUTOMATON_TYPE.NORMAL;
term = null;
if (finite == null) {
this.finite = Operations.isFinite(automaton);
} else {
this.finite = finite;
}
this.finite = finite;
Automaton binary;
if (isBinary) {

View File

@ -1037,44 +1037,6 @@ public final class Operations {
return result;
}
/**
* Returns true if the language of this automaton is finite. The automaton must not have any dead
* states.
*/
public static boolean isFinite(Automaton a) {
if (a.getNumStates() == 0) {
return true;
}
return isFinite(
new Transition(), a, 0, new BitSet(a.getNumStates()), new BitSet(a.getNumStates()), 0);
}
/**
* Checks whether there is a loop containing state. (This is sufficient since there are never
* transitions to dead states.)
*/
// TODO: not great that this is recursive... in theory a
// large automata could exceed java's stack so the maximum level of recursion is bounded to 1000
private static boolean isFinite(
Transition scratch, Automaton a, int state, BitSet path, BitSet visited, int level) {
if (level > MAX_RECURSION_LEVEL) {
throw new IllegalArgumentException("input automaton is too large: " + level);
}
path.set(state);
int numTransitions = a.initTransition(state, scratch);
for (int t = 0; t < numTransitions; t++) {
a.getTransition(state, t, scratch);
if (path.get(scratch.dest)
|| (!visited.get(scratch.dest)
&& !isFinite(scratch, a, scratch.dest, path, visited, level + 1))) {
return false;
}
}
path.clear(state);
visited.set(state);
return true;
}
/**
* Returns the longest string that is a prefix of all accepted strings and visits each state at
* most once. The automaton must not have dead states. If this automaton has already been

View File

@ -167,8 +167,7 @@ public class TestTermsEnum2 extends LuceneTestCase {
String reg = AutomatonTestUtil.randomRegexp(random());
Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
automaton = Operations.determinize(automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
CompiledAutomaton ca =
new CompiledAutomaton(automaton, Operations.isFinite(automaton), false);
CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
TermsEnum te = MultiTerms.getTerms(reader, "field").intersect(ca, null);
Automaton expected =
Operations.determinize(

View File

@ -389,4 +389,25 @@ public class TestWildcard extends LuceneTestCase {
reader.close();
dir.close();
}
/** Tests large Wildcard queries */
public void testLarge() throws IOException {
// big string from a user
String big =
"{group-bm-http-server-02083.node.dm.reg,group-bm-http-server-02082.node.dm.reg,group-bm-http-server-02081.node.dm.reg,group-bm-http-server-02080.node.dm.reg,group-bm-http-server-02079.node.dm.reg,group-bm-http-server-02078.node.dm.reg,group-bm-http-server-02077.node.dm.reg,group-bm-http-server-02076.node.dm.reg,group-bm-http-server-02073.node.dm.reg,group-bm-http-server-02070.node.dm.reg,group-bm-http-server-02067.node.dm.reg,group-bm-http-server-02064.node.dm.reg,group-bm-http-server-02029.node.dm.reg,group-bm-http-server-02028.node.dm.reg,group-bm-http-server-02027.node.dm.reg,group-bm-http-server-02026.node.dm.reg,group-bm-http-server-02025.node.dm.reg,group-bm-http-server-02023.node.dm.reg,group-bm-http-server-02022.node.dm.reg,group-bm-http-server-02021.node.dm.reg,group-bm-http-server-02020.node.dm.reg,group-bm-http-server-02019.node.dm.reg,group-bm-http-server-02018.node.dm.reg,group-bm-http-server-02016.node.dm.reg,group-bm-http-server-02015.node.dm.reg,group-bm-http-server-02014.node.dm.reg,group-bm-http-server-02009.node.dm.reg,group-bm-http-server-02007.node.dm.reg,group-bm-http-server-02004.node.dm.reg,group-bm-http-server-02003.node.dm.reg,group-bm-http-server-02002.node.dm.reg,group-bm-http-server-01311.node.dm.reg,group-bm-http-server-01309.node.dm.reg,group-bm-http-server-01307.node.dm.reg}";
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newStringField("body", big, Field.Store.YES));
writer.addDocument(doc);
writer.close();
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = newSearcher(reader);
Query query = new WildcardQuery(new Term("body", big + "*"));
assertMatches(searcher, query, 1);
reader.close();
dir.close();
}
}

View File

@ -224,7 +224,7 @@ public class TestAutomaton extends LuceneTestCase {
assertTrue(Operations.run(a, "mn"));
assertTrue(Operations.run(a, "mone"));
assertFalse(Operations.run(a, "m"));
assertFalse(Operations.isFinite(a));
assertFalse(AutomatonTestUtil.isFinite(a));
}
public void testUnion1() throws Exception {
@ -1237,7 +1237,7 @@ public class TestAutomaton extends LuceneTestCase {
private void assertSame(Collection<BytesRef> terms, Automaton a) {
try {
assertTrue(Operations.isFinite(a));
assertTrue(AutomatonTestUtil.isFinite(a));
assertFalse(Operations.isTotal(a));
Automaton detA = Operations.determinize(a, DEFAULT_DETERMINIZE_WORK_LIMIT);
@ -1354,7 +1354,7 @@ public class TestAutomaton extends LuceneTestCase {
byte[] zeros = new byte[3];
Automaton a =
makeBinaryInterval(newBytesRef(zeros, 0, 1), true, newBytesRef(zeros, 0, 2), true);
assertTrue(Operations.isFinite(a));
assertTrue(AutomatonTestUtil.isFinite(a));
assertFalse(accepts(a, newBytesRef()));
assertTrue(accepts(a, newBytesRef(zeros, 0, 1)));
assertTrue(accepts(a, newBytesRef(zeros, 0, 2)));
@ -1362,7 +1362,7 @@ public class TestAutomaton extends LuceneTestCase {
// '' (incl) - 00 (incl)
a = makeBinaryInterval(newBytesRef(), true, newBytesRef(zeros, 0, 2), true);
assertTrue(Operations.isFinite(a));
assertTrue(AutomatonTestUtil.isFinite(a));
assertTrue(accepts(a, newBytesRef()));
assertTrue(accepts(a, newBytesRef(zeros, 0, 1)));
assertTrue(accepts(a, newBytesRef(zeros, 0, 2)));
@ -1370,7 +1370,7 @@ public class TestAutomaton extends LuceneTestCase {
// '' (excl) - 00 (incl)
a = makeBinaryInterval(newBytesRef(), false, newBytesRef(zeros, 0, 2), true);
assertTrue(Operations.isFinite(a));
assertTrue(AutomatonTestUtil.isFinite(a));
assertFalse(accepts(a, newBytesRef()));
assertTrue(accepts(a, newBytesRef(zeros, 0, 1)));
assertTrue(accepts(a, newBytesRef(zeros, 0, 2)));
@ -1378,7 +1378,7 @@ public class TestAutomaton extends LuceneTestCase {
// 0 (excl) - 00 (incl)
a = makeBinaryInterval(newBytesRef(zeros, 0, 1), false, newBytesRef(zeros, 0, 2), true);
assertTrue(Operations.isFinite(a));
assertTrue(AutomatonTestUtil.isFinite(a));
assertFalse(accepts(a, newBytesRef()));
assertFalse(accepts(a, newBytesRef(zeros, 0, 1)));
assertTrue(accepts(a, newBytesRef(zeros, 0, 2)));
@ -1386,7 +1386,7 @@ public class TestAutomaton extends LuceneTestCase {
// 0 (excl) - 00 (excl)
a = makeBinaryInterval(newBytesRef(zeros, 0, 1), false, newBytesRef(zeros, 0, 2), false);
assertTrue(Operations.isFinite(a));
assertTrue(AutomatonTestUtil.isFinite(a));
assertFalse(accepts(a, newBytesRef()));
assertFalse(accepts(a, newBytesRef(zeros, 0, 1)));
assertFalse(accepts(a, newBytesRef(zeros, 0, 2)));
@ -1420,7 +1420,7 @@ public class TestAutomaton extends LuceneTestCase {
makeBinaryInterval(
minTerm, minInclusive,
maxTerm, maxInclusive);
assertTrue(Operations.isFinite(a));
assertTrue(AutomatonTestUtil.isFinite(a));
int expectedCount = maxTerm.length - minTerm.length + 1;
if (minInclusive == false) {
expectedCount--;
@ -1529,7 +1529,7 @@ public class TestAutomaton extends LuceneTestCase {
public void testMakeBinaryIntervalEqual() throws Exception {
Automaton a = Automata.makeBinaryInterval(newBytesRef("bar"), true, newBytesRef("bar"), true);
assertTrue(Operations.run(a, intsRef("bar")));
assertTrue(Operations.isFinite(a));
assertTrue(AutomatonTestUtil.isFinite(a));
assertEquals(1, TestOperations.getFiniteStrings(a).size());
}

View File

@ -130,7 +130,7 @@ public class TestCompiledAutomaton extends LuceneTestCase {
a.addTransition(state, state, 0, 0xff);
a.finishState();
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, true);
CompiledAutomaton ca = new CompiledAutomaton(a, false, true, true);
assertEquals(CompiledAutomaton.AUTOMATON_TYPE.ALL, ca.type);
}
@ -142,7 +142,7 @@ public class TestCompiledAutomaton extends LuceneTestCase {
a.addTransition(state, state, 0, Character.MAX_CODE_POINT);
a.finishState();
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, false);
CompiledAutomaton ca = new CompiledAutomaton(a, false, true, false);
assertEquals(CompiledAutomaton.AUTOMATON_TYPE.ALL, ca.type);
}
@ -150,14 +150,14 @@ public class TestCompiledAutomaton extends LuceneTestCase {
public void testBinarySingleton() throws Exception {
// This is just ascii so we can pretend it's binary:
Automaton a = Automata.makeString("foobar");
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, true);
CompiledAutomaton ca = new CompiledAutomaton(a, true, true, true);
assertEquals(CompiledAutomaton.AUTOMATON_TYPE.SINGLE, ca.type);
}
// LUCENE-6367
public void testUnicodeSingleton() throws Exception {
Automaton a = Automata.makeString(TestUtil.randomRealisticUnicodeString(random()));
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, false);
CompiledAutomaton ca = new CompiledAutomaton(a, true, true, false);
assertEquals(CompiledAutomaton.AUTOMATON_TYPE.SINGLE, ca.type);
}
}

View File

@ -22,6 +22,7 @@ import java.util.Collections;
import java.util.List;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
/**
* Not thorough, but tries to test determinism correctness somewhat randomly, by determinizing a
@ -49,7 +50,7 @@ public class TestDeterminizeLexicon extends LuceneTestCase {
Collections.shuffle(automata, random());
Automaton lex = Operations.union(automata);
lex = Operations.determinize(lex, 1000000);
assertTrue(Operations.isFinite(lex));
assertTrue(AutomatonTestUtil.isFinite(lex));
for (String s : terms) {
assertTrue(Operations.run(lex, s));
}

View File

@ -21,6 +21,7 @@ import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WO
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
public class TestLevenshteinAutomata extends LuceneTestCase {
@ -73,8 +74,8 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
assertNotNull(tautomata[n]);
assertTrue(automata[n].isDeterministic());
assertTrue(tautomata[n].isDeterministic());
assertTrue(Operations.isFinite(automata[n]));
assertTrue(Operations.isFinite(tautomata[n]));
assertTrue(AutomatonTestUtil.isFinite(automata[n]));
assertTrue(AutomatonTestUtil.isFinite(tautomata[n]));
assertFalse(Operations.hasDeadStatesFromInitial(automata[n]));
assertFalse(Operations.hasDeadStatesFromInitial(tautomata[n]));
// check that the dfa for n-1 accepts a subset of the dfa for n

View File

@ -40,11 +40,11 @@ public class TestLimitedFiniteStringsIterator extends LuceneTestCase {
getFiniteStrings(new LimitedFiniteStringsIterator(a, TestUtil.nextInt(random(), 1, 1000)));
// NOTE: cannot do this, because the method is not
// guaranteed to detect cycles when you have a limit
// assertTrue(Operations.isFinite(a));
// assertTrue(AutomatonTestUtil.isFinite(a));
} catch (
@SuppressWarnings("unused")
IllegalArgumentException iae) {
assertFalse(Operations.isFinite(a));
assertFalse(AutomatonTestUtil.isFinite(a));
}
}
}

View File

@ -115,14 +115,6 @@ public class TestOperations extends LuceneTestCase {
}
}
}
/** tests against the original brics implementation. */
public void testIsFinite() {
int num = atLeast(200);
for (int i = 0; i < num; i++) {
Automaton a = AutomatonTestUtil.randomAutomaton(random());
assertEquals(AutomatonTestUtil.isFiniteSlow(a), Operations.isFinite(a));
}
}
public void testIsFiniteEatsStack() {
char[] chars = new char[50000];
@ -133,7 +125,7 @@ public class TestOperations extends LuceneTestCase {
Automaton a =
Operations.union(Automata.makeString(bigString1), Automata.makeString(bigString2));
IllegalArgumentException exc =
expectThrows(IllegalArgumentException.class, () -> Operations.isFinite(a));
expectThrows(IllegalArgumentException.class, () -> AutomatonTestUtil.isFinite(a));
assertTrue(exc.getMessage().contains("input automaton is too large"));
}

View File

@ -169,7 +169,8 @@ public final class Intervals {
* @throws IllegalStateException if the prefix expands to more than {@code maxExpansions} terms
*/
public static IntervalsSource prefix(BytesRef prefix, int maxExpansions) {
CompiledAutomaton ca = new CompiledAutomaton(PrefixQuery.toAutomaton(prefix), null, true, true);
CompiledAutomaton ca =
new CompiledAutomaton(PrefixQuery.toAutomaton(prefix), false, true, true);
return new MultiTermIntervalsSource(ca, maxExpansions, prefix.utf8ToString() + "*");
}

View File

@ -877,9 +877,6 @@ public class AnalyzingSuggester extends Lookup {
automaton = replaceSep(automaton);
automaton = convertAutomaton(automaton);
// TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
// assert SpecialOperations.isFinite(automaton);
// Get all paths from the automaton (there can be
// more than one path, eg if the analyzer created a
// graph using SynFilter or WDF):

View File

@ -1605,7 +1605,7 @@ public class RandomPostingsTester {
while (true) {
Automaton a = AutomatonTestUtil.randomAutomaton(random);
a = Operations.determinize(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
CompiledAutomaton ca = new CompiledAutomaton(a, null, true, false);
CompiledAutomaton ca = new CompiledAutomaton(a, false, true, false);
if (ca.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
// Keep retrying until we get an A that will really "use" the PF's intersect code:
continue;

View File

@ -17,6 +17,7 @@
package org.apache.lucene.tests.util.automaton;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
@ -463,35 +464,40 @@ public class AutomatonTestUtil {
}
/**
* Returns true if the language of this automaton is finite.
*
* <p>WARNING: this method is slow, it will blow up if the automaton is large. this is only used
* to test the correctness of our faster implementation.
* Returns true if the language of this automaton is finite. The automaton must not have any dead
* states.
*/
public static boolean isFiniteSlow(Automaton a) {
public static boolean isFinite(Automaton a) {
if (a.getNumStates() == 0) {
return true;
}
return isFiniteSlow(a, 0, new HashSet<Integer>());
return isFinite(
new Transition(), a, 0, new BitSet(a.getNumStates()), new BitSet(a.getNumStates()), 0);
}
/**
* Checks whether there is a loop containing s. (This is sufficient since there are never
* Checks whether there is a loop containing state. (This is sufficient since there are never
* transitions to dead states.)
*/
// TODO: not great that this is recursive... in theory a
// large automata could exceed java's stack
private static boolean isFiniteSlow(Automaton a, int s, HashSet<Integer> path) {
path.add(s);
Transition t = new Transition();
int count = a.initTransition(s, t);
for (int i = 0; i < count; i++) {
a.getNextTransition(t);
if (path.contains(t.dest) || !isFiniteSlow(a, t.dest, path)) {
// large automata could exceed java's stack so the maximum level of recursion is bounded to 1000
private static boolean isFinite(
Transition scratch, Automaton a, int state, BitSet path, BitSet visited, int level) {
if (level > Operations.MAX_RECURSION_LEVEL) {
throw new IllegalArgumentException("input automaton is too large: " + level);
}
path.set(state);
int numTransitions = a.initTransition(state, scratch);
for (int t = 0; t < numTransitions; t++) {
a.getTransition(state, t, scratch);
if (path.get(scratch.dest)
|| (!visited.get(scratch.dest)
&& !isFinite(scratch, a, scratch.dest, path, visited, level + 1))) {
return false;
}
}
path.remove(s);
path.clear(state);
visited.set(state);
return true;
}