Relax Operations.isTotal() to work with a deterministic automaton (#13707)

Improve Operations.isTotal() to work with non-minimal automata.
This commit is contained in:
Robert Muir 2024-09-05 08:36:10 -04:00 committed by GitHub
parent 87bc8270ff
commit ea3a9b8927
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 71 additions and 7 deletions

View File

@@ -169,6 +169,8 @@ Improvements
* GITHUB#12172: Update Romanian stopwords list to include the modern unicode forms. (Trey Jones)
* GITHUB#13707: Improve Operations.isTotal() to work with non-minimal automata. (Dawid Weiss, Robert Muir)
Optimizations
---------------------

View File

@@ -778,22 +778,48 @@ public final class Operations {
return true;
}
/** Returns true if the given automaton accepts all strings. The automaton must be minimized. */
/**
 * Checks whether the given automaton accepts every string over the full range of Unicode code
 * points, from {@link Character#MIN_CODE_POINT} to {@link Character#MAX_CODE_POINT}.
 *
 * <p>The automaton must be deterministic, or this method may return false.
 *
 * <p>Complexity: linear in number of states and transitions.
 */
public static boolean isTotal(Automaton a) {
  final int lowestLabel = Character.MIN_CODE_POINT;
  final int highestLabel = Character.MAX_CODE_POINT;
  return isTotal(a, lowestLabel, highestLabel);
}
/**
* Returns true if the given automaton accepts all strings for the specified min/max range of the
* alphabet. The automaton must be minimized.
* alphabet.
*
* <p>The automaton must be deterministic, or this method may return false.
*
* <p>Complexity: linear in number of states and transitions.
*
* <p>NOTE(review): this block is rendered from a diff hunk without +/- markers, so the "must be
* minimized" sentence above, the first four statements of the body and the lone {@code return
* false;} after the inner loop appear to be the pre-change text -- confirm against the commit.
*
* @param a the automaton to test
* @param minAlphabet lowest label of the alphabet range, inclusive
* @param maxAlphabet highest label of the alphabet range, inclusive
*/
public static boolean isTotal(Automaton a, int minAlphabet, int maxAlphabet) {
// NOTE(review): looks like the pre-change check -- state 0 must be accepting and own exactly one
// transition, a self-loop spanning exactly minAlphabet..maxAlphabet. Confirm these four lines are
// the removed side of the hunk.
if (a.isAccept(0) && a.getNumTransitions(0) == 1) {
Transition t = new Transition();
a.getTransition(0, 0, t);
return t.dest == 0 && t.min == minAlphabet && t.max == maxAlphabet;
// From here on every state in the set returned by getLiveStates(a) is examined; `spare` is the
// single Transition object passed to each getTransition call and read back afterwards.
BitSet states = getLiveStates(a);
Transition spare = new Transition();
// number of states that passed both checks; lets the final return tell "no states examined" apart
int seenStates = 0;
for (int state = states.nextSetBit(0); state >= 0; state = states.nextSetBit(state + 1)) {
// all reachable states must be accept states
if (a.isAccept(state) == false) return false;
// all reachable states must contain transitions covering minAlphabet-maxAlphabet
// previousLabel is the highest label covered so far; starting one below minAlphabet puts the
// first transition under the same gap check as all later ones
int previousLabel = minAlphabet - 1;
for (int transition = 0; transition < a.getNumTransitions(state); transition++) {
a.getTransition(state, transition, spare);
// no gaps are allowed
// NOTE(review): assumes transitions are visited in ascending label order -- TODO confirm.
// NOTE(review): spare.dest is never inspected, so a transition into a state outside `states`
// still counts as coverage -- confirm callers never pass automata with such transitions.
if (spare.min > previousLabel + 1) return false;
previousLabel = spare.max;
}
// NOTE(review): likely the pre-change fall-through result (removed side of the hunk) -- confirm.
return false;
// coverage must also reach the upper bound of the range
if (previousLabel < maxAlphabet) return false;
if (state == Integer.MAX_VALUE) {
break; // or (state+1) would overflow
}
seenStates++;
}
// we've checked all the states, automaton is either total or empty
return seenStates > 0;
}
/**

View File

@@ -173,6 +173,42 @@ public class TestOperations extends LuceneTestCase {
assertTrue(exc.getMessage().contains("input automaton is too large"));
}
/**
 * Exercises {@code Operations.isTotal} on minimal automata, on deterministic but non-minimal
 * automata, and on near-total automata that each leave out a single code point.
 */
public void testIsTotal() {
  final int lowest = Character.MIN_CODE_POINT;
  final int highest = Character.MAX_CODE_POINT;

  // already-minimal inputs
  assertFalse(Operations.isTotal(Automata.makeEmpty()));
  assertFalse(Operations.isTotal(Automata.makeEmptyString()));
  assertTrue(Operations.isTotal(Automata.makeAnyString()));
  assertTrue(Operations.isTotal(Automata.makeAnyBinary(), 0, 255));
  assertFalse(Operations.isTotal(Automata.makeNonEmptyBinary(), 0, 255));

  // deterministic inputs that are not minimal
  Automaton anyCharRepeated = Operations.repeat(Automata.makeAnyChar());
  assertTrue(Operations.isTotal(anyCharRepeated));

  // two adjacent ranges that together span the whole alphabet
  Automaton splitButComplete = repeatedUnionOfRanges(lowest, 100, 101, highest);
  assertTrue(Operations.isTotal(splitButComplete));

  // almost total: the lowest code point is missing
  Automaton missingLowest = repeatedUnionOfRanges(lowest + 1, 100, 101, highest);
  assertFalse(Operations.isTotal(missingLowest));

  // almost total: code point 100 falls between the two ranges
  Automaton missingMiddle = repeatedUnionOfRanges(lowest, 99, 101, highest);
  assertFalse(Operations.isTotal(missingMiddle));

  // almost total: the highest code point is missing
  Automaton missingHighest = repeatedUnionOfRanges(lowest, 100, 101, highest - 1);
  assertFalse(Operations.isTotal(missingHighest));
}

/** Builds the repetition of the union of the char ranges [min1, max1] and [min2, max2]. */
private static Automaton repeatedUnionOfRanges(int min1, int max1, int min2, int max2) {
  Automaton firstRange = Automata.makeCharRange(min1, max1);
  Automaton secondRange = Automata.makeCharRange(min2, max2);
  return Operations.repeat(Operations.union(firstRange, secondRange));
}
/**
* Returns the set of all accepted strings.
*