From fa308d6ad61ec4367c32ba9e0f241a6c935ad80c Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 13 May 2011 19:20:05 +0000 Subject: [PATCH] LUCENE-3094: optimize lev automata construction, don't keep around detached states git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1102875 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/util/automaton/LevenshteinAutomata.java | 9 ++++++--- .../lucene/util/automaton/AutomatonTestUtil.java | 11 +++++++++++ .../util/automaton/TestLevenshteinAutomata.java | 6 ++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/lucene/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java b/lucene/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java index b7d8a34f9d8..1ab9f15ac8c 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java @@ -143,13 +143,16 @@ public class LevenshteinAutomata { if (dest >= 0) for (int r = 0; r < numRanges; r++) states[k].addTransition(new Transition(rangeLower[r], rangeUpper[r], states[dest])); - // reduce the state: this doesn't appear to help anything - //states[k].reduce(); } Automaton a = new Automaton(states[0]); a.setDeterministic(true); - a.setNumberedStates(states); + // we create some useless unconnected states, and its a net-win overall to remove these, + // as well as to combine any adjacent transitions (it makes later algorithms more efficient). + // so, while we could set our numberedStates here, its actually best not to, and instead to + // force a traversal in reduce, pruning the unconnected states while we combine adjacent transitions. + //a.setNumberedStates(states); + a.reduce(); // we need not trim transitions to dead states, as they are not created. //a.restoreInvariant(); return a; diff --git a/lucene/src/test-framework/org/apache/lucene/util/automaton/AutomatonTestUtil.java b/lucene/src/test-framework/org/apache/lucene/util/automaton/AutomatonTestUtil.java index ca943fff2ba..f41cb95a2bd 100644 --- a/lucene/src/test-framework/org/apache/lucene/util/automaton/AutomatonTestUtil.java +++ b/lucene/src/test-framework/org/apache/lucene/util/automaton/AutomatonTestUtil.java @@ -397,4 +397,15 @@ public class AutomatonTestUtil { path.remove(s); return true; } + + + /** + * Checks that an automaton has no detached states that are unreachable + * from the initial state. + */ + public static void assertNoDetachedStates(Automaton a) { + int numStates = a.getNumberOfStates(); + a.clearNumberedStates(); // force recomputation of cached numbered states + assert numStates == a.getNumberOfStates() : "automaton has " + (numStates - a.getNumberOfStates()) + " detached states"; + } } diff --git a/lucene/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java b/lucene/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java index 6953f5ddf90..222ac0e324b 100644 --- a/lucene/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java +++ b/lucene/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java @@ -39,6 +39,11 @@ public class TestLevenshteinAutomata extends LuceneTestCase { assertCharVectors(2); } + // LUCENE-3094 + public void testNoWastedStates() throws Exception { + AutomatonTestUtil.assertNoDetachedStates(new LevenshteinAutomata("abc").toAutomaton(1)); + } + /** * Tests all possible characteristic vectors for some n * This exhaustively tests the parametric transitions tables. @@ -66,6 +71,7 @@ public class TestLevenshteinAutomata extends LuceneTestCase { assertNotNull(automata[n]); assertTrue(automata[n].isDeterministic()); assertTrue(SpecialOperations.isFinite(automata[n])); + AutomatonTestUtil.assertNoDetachedStates(automata[n]); // check that the dfa for n-1 accepts a subset of the dfa for n if (n > 0) { assertTrue(automata[n-1].subsetOf(automata[n]));