LUCENE-7914: Add a maximum recursion level to the recursive automaton functions (Operations.isFinite and Operations.topoSortStates) to prevent large automata from overflowing the stack.

This commit is contained in:
Jim Ferenczi 2017-08-04 12:02:30 +02:00
parent d620326b88
commit 7dde798473
6 changed files with 71 additions and 39 deletions

View File

@ -14,7 +14,6 @@ Changes in Runtime Behavior
======================= Lucene 7.1.0 =======================
(No Changes)
Optimizations
@ -22,6 +21,12 @@ Optimizations
SortedSetDocValuesFacetCounts and others) builds its map (Robert
Muir, Adrien Grand, Mike McCandless)
Bug Fixes
* LUCENE-7914: Add a maximum recursion level to the recursive automaton
functions (Operations.isFinite and Operations.topoSortStates) to prevent
large automata from overflowing the stack (Robert Muir, Adrien Grand, Jim Ferenczi)
======================= Lucene 7.0.0 =======================
New Features

View File

@ -58,6 +58,11 @@ final public class Operations {
*/
public static final int DEFAULT_MAX_DETERMINIZED_STATES = 10000;
/**
* Maximum level of recursion allowed in recursive operations.
*/
public static final int MAX_RECURSION_LEVEL = 1000;
private Operations() {}
/**
@ -1018,7 +1023,7 @@ final public class Operations {
if (a.getNumStates() == 0) {
return true;
}
return isFinite(new Transition(), a, 0, new BitSet(a.getNumStates()), new BitSet(a.getNumStates()));
return isFinite(new Transition(), a, 0, new BitSet(a.getNumStates()), new BitSet(a.getNumStates()), 0);
}
/**
@ -1026,13 +1031,16 @@ final public class Operations {
* there are never transitions to dead states.)
*/
// TODO: not great that this is recursive... in theory a
// large automata could exceed java's stack
private static boolean isFinite(Transition scratch, Automaton a, int state, BitSet path, BitSet visited) {
// large automaton could exceed Java's stack, so the recursion depth is bounded by MAX_RECURSION_LEVEL
private static boolean isFinite(Transition scratch, Automaton a, int state, BitSet path, BitSet visited, int level) {
if (level > MAX_RECURSION_LEVEL) {
throw new IllegalArgumentException("input automaton is too large: " + level);
}
path.set(state);
int numTransitions = a.initTransition(state, scratch);
for(int t=0;t<numTransitions;t++) {
a.getTransition(state, t, scratch);
if (path.get(scratch.dest) || (!visited.get(scratch.dest) && !isFinite(scratch, a, scratch.dest, path, visited))) {
if (path.get(scratch.dest) || (!visited.get(scratch.dest) && !isFinite(scratch, a, scratch.dest, path, visited, level+1))) {
return false;
}
}
@ -1264,7 +1272,7 @@ final public class Operations {
int numStates = a.getNumStates();
int[] states = new int[numStates];
final BitSet visited = new BitSet(numStates);
int upto = topoSortStatesRecurse(a, visited, states, 0, 0);
int upto = topoSortStatesRecurse(a, visited, states, 0, 0, 0);
if (upto < states.length) {
// There were dead states
@ -1283,14 +1291,19 @@ final public class Operations {
return states;
}
private static int topoSortStatesRecurse(Automaton a, BitSet visited, int[] states, int upto, int state) {
// TODO: not great that this is recursive... in theory a
// large automaton could exceed Java's stack, so the recursion depth is bounded by MAX_RECURSION_LEVEL
private static int topoSortStatesRecurse(Automaton a, BitSet visited, int[] states, int upto, int state, int level) {
if (level > MAX_RECURSION_LEVEL) {
throw new IllegalArgumentException("input automaton is too large: " + level);
}
Transition t = new Transition();
int count = a.initTransition(state, t);
for (int i=0;i<count;i++) {
a.getNextTransition(t);
if (!visited.get(t.dest)) {
visited.set(t.dest);
upto = topoSortStatesRecurse(a, visited, states, upto, t.dest);
upto = topoSortStatesRecurse(a, visited, states, upto, t.dest, level+1);
}
}
states[upto] = state;

View File

@ -542,19 +542,21 @@ public class RegExp {
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
break;
case REGEXP_REPEAT_MIN:
a = Operations.repeat(
exp1.toAutomatonInternal(automata, automaton_provider,
maxDeterminizedStates),
min);
a = exp1.toAutomatonInternal(automata, automaton_provider, maxDeterminizedStates);
int minNumStates = (a.getNumStates() - 1) * min;
if (minNumStates > maxDeterminizedStates) {
throw new TooComplexToDeterminizeException(a, minNumStates);
}
a = Operations.repeat(a, min);
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
break;
case REGEXP_REPEAT_MINMAX:
a = Operations.repeat(
exp1.toAutomatonInternal(automata, automaton_provider,
maxDeterminizedStates),
min,
max);
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
a = exp1.toAutomatonInternal(automata, automaton_provider, maxDeterminizedStates);
int minMaxNumStates = (a.getNumStates() - 1) * max;
if (minMaxNumStates > maxDeterminizedStates) {
throw new TooComplexToDeterminizeException(a, minMaxNumStates);
}
a = Operations.repeat(a, min, max);
break;
case REGEXP_COMPLEMENT:
a = Operations.complement(

View File

@ -52,8 +52,7 @@ public class TestOperations extends LuceneTestCase {
for (BytesRef bref : strings) {
eachIndividual[i++] = Automata.makeString(bref.utf8ToString());
}
return Operations.determinize(Operations.union(Arrays.asList(eachIndividual)),
DEFAULT_MAX_DETERMINIZED_STATES);
return Operations.determinize(Operations.union(Arrays.asList(eachIndividual)), DEFAULT_MAX_DETERMINIZED_STATES);
}
/** Test concatenation with empty language returns empty */
@ -61,6 +60,7 @@ public class TestOperations extends LuceneTestCase {
Automaton a = Automata.makeString("a");
Automaton concat = Operations.concatenate(a, Automata.makeEmpty());
assertTrue(Operations.isEmpty(concat));
}
/** Test optimization to concatenate() with empty String to an NFA */
@ -124,6 +124,28 @@ public class TestOperations extends LuceneTestCase {
}
}
/**
 * Checks that {@code Operations.isFinite} fails with an
 * {@link IllegalArgumentException} instead of recursing without bound when
 * given the union of two automata built from random 50000-char strings.
 */
public void testIsFiniteEatsStack() {
  final int length = 50000;
  char[] buffer = new char[length];

  // The same buffer is refilled for the second string; each String copies it.
  TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, length);
  String first = new String(buffer);
  TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, length);
  String second = new String(buffer);

  Automaton left = Automata.makeString(first);
  Automaton right = Automata.makeString(second);
  Automaton a = Operations.union(left, right);

  IllegalArgumentException thrown =
      expectThrows(IllegalArgumentException.class, () -> Operations.isFinite(a));
  String message = thrown.getMessage();
  assertTrue(message.contains("input automaton is too large"));
}
/**
 * Checks that {@code Operations.topoSortStates} fails with an
 * {@link IllegalArgumentException} instead of recursing without bound when
 * given the union of two automata built from random 50000-char strings.
 */
public void testTopoSortEatsStack() {
  final int length = 50000;
  char[] buffer = new char[length];

  // The same buffer is refilled for the second string; each String copies it.
  TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, length);
  String first = new String(buffer);
  TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, length);
  String second = new String(buffer);

  Automaton left = Automata.makeString(first);
  Automaton right = Automata.makeString(second);
  Automaton a = Operations.union(left, right);

  IllegalArgumentException thrown =
      expectThrows(IllegalArgumentException.class, () -> Operations.topoSortStates(a));
  String message = thrown.getMessage();
  assertTrue(message.contains("input automaton is too large"));
}
/**
* Returns the set of all accepted strings.
*

View File

@ -19,13 +19,6 @@ package org.apache.lucene.util.automaton;
import org.apache.lucene.util.LuceneTestCase;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
public class TestRegExp extends LuceneTestCase {
/**
@ -54,6 +47,14 @@ public class TestRegExp extends LuceneTestCase {
assertTrue(expected.getMessage().contains(source));
}
/**
 * Checks that converting the repeat expression {@code a{50001}} with a limit
 * of 50000 determinized states throws
 * {@link TooComplexToDeterminizeException}, and that the exception message
 * contains the offending expression.
 */
public void testSerializeTooManyStatesToRepeat() throws Exception {
  final String pattern = "a{50001}";
  TooComplexToDeterminizeException thrown = expectThrows(
      TooComplexToDeterminizeException.class,
      () -> new RegExp(pattern).toAutomaton(50000));
  String message = thrown.getMessage();
  assertTrue(message.contains(pattern));
}
// LUCENE-6713
public void testSerializeTooManyStatesToDeterminizeExc() throws Exception {
// LUCENE-6046
@ -62,16 +63,6 @@ public class TestRegExp extends LuceneTestCase {
new RegExp(source).toAutomaton();
});
assertTrue(expected.getMessage().contains(source));
ByteArrayOutputStream bos = new ByteArrayOutputStream();
ObjectOutput out = new ObjectOutputStream(bos);
out.writeObject(expected);
byte[] bytes = bos.toByteArray();
ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
ObjectInput in = new ObjectInputStream(bis);
TooComplexToDeterminizeException e2 = (TooComplexToDeterminizeException) in.readObject();
assertNotNull(e2.getMessage());
}
// LUCENE-6046

View File

@ -1252,10 +1252,9 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
suggester.build(new InputArrayIterator(new Input[] {
new Input(bigString, 7)}));
fail("did not hit expected exception");
} catch (StackOverflowError soe) {
// OK
} catch (IllegalArgumentException iae) {
// expected
assertTrue(iae.getMessage().contains("input automaton is too large"));
}
IOUtils.close(a, tempDir);
}