From ea3a9b89278e2bbaebdd54c91e692f200f79e37b Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 5 Sep 2024 08:36:10 -0400 Subject: [PATCH] Relax Operations.isTotal() to work with a deterministic automaton (#13707) Improve Operations.isTotal() to work with non-minimal automata. --- lucene/CHANGES.txt | 2 + .../lucene/util/automaton/Operations.java | 40 +++++++++++++++---- .../lucene/util/automaton/TestOperations.java | 36 +++++++++++++++++ 3 files changed, 71 insertions(+), 7 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b41144b2d9c..42ab9c6fc63 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -169,6 +169,8 @@ Improvements * GITHUB#12172: Update Romanian stopwords list to include the modern unicode forms. (Trey Jones) +* GITHUB#13707: Improve Operations.isTotal() to work with non-minimal automata. (Dawid Weiss, Robert Muir) + Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 8fd43dbe1ff..9cd0b3934aa 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -778,22 +778,48 @@ public final class Operations { return true; } - /** Returns true if the given automaton accepts all strings. The automaton must be minimized. */ + /** + * Returns true if the given automaton accepts all strings. + * + *

<p>The automaton must be deterministic, or this method may return false. + * + *

<p>Complexity: linear in number of states and transitions. + */ public static boolean isTotal(Automaton a) { return isTotal(a, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); } /** * Returns true if the given automaton accepts all strings for the specified min/max range of the - * alphabet. The automaton must be minimized. + * alphabet. + * + *

<p>The automaton must be deterministic, or this method may return false. + * + *

<p>Complexity: linear in number of states and transitions. */ public static boolean isTotal(Automaton a, int minAlphabet, int maxAlphabet) { - if (a.isAccept(0) && a.getNumTransitions(0) == 1) { - Transition t = new Transition(); - a.getTransition(0, 0, t); - return t.dest == 0 && t.min == minAlphabet && t.max == maxAlphabet; + BitSet states = getLiveStates(a); + Transition spare = new Transition(); + int seenStates = 0; + for (int state = states.nextSetBit(0); state >= 0; state = states.nextSetBit(state + 1)) { + // all reachable states must be accept states + if (a.isAccept(state) == false) return false; + // all reachable states must contain transitions covering minAlphabet-maxAlphabet + int previousLabel = minAlphabet - 1; + for (int transition = 0; transition < a.getNumTransitions(state); transition++) { + a.getTransition(state, transition, spare); + // no gaps are allowed + if (spare.min > previousLabel + 1) return false; + previousLabel = spare.max; + } + if (previousLabel < maxAlphabet) return false; + if (state == Integer.MAX_VALUE) { + break; // or (state+1) would overflow + } + seenStates++; } - return false; + // we've checked all the states, automaton is either total or empty + return seenStates > 0; } /** diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java index c6ccf403fc8..849f615e726 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java @@ -173,6 +173,42 @@ public class TestOperations extends LuceneTestCase { assertTrue(exc.getMessage().contains("input automaton is too large")); } + public void testIsTotal() { + // minimal + assertFalse(Operations.isTotal(Automata.makeEmpty())); + assertFalse(Operations.isTotal(Automata.makeEmptyString())); + assertTrue(Operations.isTotal(Automata.makeAnyString())); + 
assertTrue(Operations.isTotal(Automata.makeAnyBinary(), 0, 255)); + assertFalse(Operations.isTotal(Automata.makeNonEmptyBinary(), 0, 255)); + // deterministic, but not minimal + assertTrue(Operations.isTotal(Operations.repeat(Automata.makeAnyChar()))); + Automaton tricky = + Operations.repeat( + Operations.union( + Automata.makeCharRange(Character.MIN_CODE_POINT, 100), + Automata.makeCharRange(101, Character.MAX_CODE_POINT))); + assertTrue(Operations.isTotal(tricky)); + // not total, but close + Automaton tricky2 = + Operations.repeat( + Operations.union( + Automata.makeCharRange(Character.MIN_CODE_POINT + 1, 100), + Automata.makeCharRange(101, Character.MAX_CODE_POINT))); + assertFalse(Operations.isTotal(tricky2)); + Automaton tricky3 = + Operations.repeat( + Operations.union( + Automata.makeCharRange(Character.MIN_CODE_POINT, 99), + Automata.makeCharRange(101, Character.MAX_CODE_POINT))); + assertFalse(Operations.isTotal(tricky3)); + Automaton tricky4 = + Operations.repeat( + Operations.union( + Automata.makeCharRange(Character.MIN_CODE_POINT, 100), + Automata.makeCharRange(101, Character.MAX_CODE_POINT - 1))); + assertFalse(Operations.isTotal(tricky4)); + } + /** * Returns the set of all accepted strings. *