LUCENE-3832: Port BasicAutomata.stringUnion from Brics to Lucene.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1355302 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2012-06-29 10:02:37 +00:00
parent f2afde8425
commit 11f2885a12
6 changed files with 118 additions and 101 deletions

View File

@ -9,6 +9,12 @@ http://s.apache.org/luceneversions
======================= Lucene 4.0.0-BETA =======================
New features
* LUCENE-3832: Added BasicAutomata.makeStringUnion method to efficiently
create automata from a fixed collection of UTF-8 encoded BytesRef
(Dawid Weiss, Robert Muir)
API Changes
* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.

View File

@ -29,8 +29,9 @@
package org.apache.lucene.util.automaton;
import java.util.ArrayList;
import java.util.Collection;
import java.util.*;
import org.apache.lucene.util.BytesRef;
/**
* Construction of basic automata.
@ -239,4 +240,25 @@ final public class BasicAutomata {
a.deterministic = true;
return a;
}
/**
* Returns a new (deterministic and minimal) automaton that accepts the union
* of the given collection of {@link BytesRef}s representing UTF-8 encoded
* strings.
*
* @param utf8Strings
* The input strings, UTF-8 encoded. The collection must be in sorted
* order.
*
* @return An {@link Automaton} accepting all input strings. The resulting
* automaton is codepoint based (full unicode codepoints on
* transitions).
*/
public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
if (utf8Strings.isEmpty()) {
return makeEmpty();
} else {
return DaciukMihovAutomatonBuilder.build(utf8Strings);
}
}
}

View File

@ -24,15 +24,18 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
/**
* Builds a minimal deterministic automaton that accepts a set of strings. The
* algorithm requires sorted input data, but is very fast (nearly linear with
* the input size).
* Builds a minimal, deterministic {@link Automaton} that accepts a set of
* strings. The algorithm requires sorted input data, but is very fast
* (nearly linear with the input size).
*
* @see #build(Collection)
* @see BasicAutomata#makeStringUnion(Collection)
*/
public final class DaciukMihovAutomatonBuilder {
final class DaciukMihovAutomatonBuilder {
/**
* DFSA state with <code>char</code> labels on transitions.
*/
public final static class State {
private final static class State {
/** An empty set of labels. */
private final static int[] NO_LABELS = new int[0];
@ -63,28 +66,11 @@ public final class DaciukMihovAutomatonBuilder {
* with <code>label</code>. If no such transition exists, returns
* <code>null</code>.
*/
public State getState(int label) {
State getState(int label) {
final int index = Arrays.binarySearch(labels, label);
return index >= 0 ? states[index] : null;
}
/**
* Returns an array of outgoing transition labels. The array is sorted in
* lexicographic order and indexes correspond to states returned from
* {@link #getStates()}.
*/
public int[] getTransitionLabels() {
return this.labels;
}
/**
* Returns an array of outgoing transitions from this state. The returned
* array must not be changed.
*/
public State[] getStates() {
return this.states;
}
/**
* Two states are equal if:
* <ul>
@ -102,21 +88,6 @@ public final class DaciukMihovAutomatonBuilder {
&& referenceEquals(this.states, other.states);
}
/**
* Return <code>true</code> if this state has any children (outgoing
* transitions).
*/
public boolean hasChildren() {
return labels.length > 0;
}
/**
* Is this state a final state in the automaton?
*/
public boolean isFinal() {
return is_final;
}
/**
* Compute the hash code of the <i>current</i> status of this state.
*/
@ -141,6 +112,14 @@ public final class DaciukMihovAutomatonBuilder {
return hash;
}
/**
* Return <code>true</code> if this state has any children (outgoing
* transitions).
*/
boolean hasChildren() {
return labels.length > 0;
}
/**
* Create a new outgoing transition labeled <code>label</code> and return
* the newly created target state for this transition.
@ -149,9 +128,9 @@ public final class DaciukMihovAutomatonBuilder {
assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: "
+ label;
labels = copyOf(labels, labels.length + 1);
states = copyOf(states, states.length + 1);
labels = Arrays.copyOf(labels, labels.length + 1);
states = Arrays.copyOf(states, states.length + 1);
labels[labels.length - 1] = label;
return states[states.length - 1] = new State();
}
@ -187,43 +166,28 @@ public final class DaciukMihovAutomatonBuilder {
states[states.length - 1] = state;
}
/**
* JDK1.5-replacement of {@link Arrays#copyOf(int[], int)}
*/
private static int[] copyOf(int[] original, int newLength) {
int[] copy = new int[newLength];
System.arraycopy(original, 0, copy, 0,
Math.min(original.length, newLength));
return copy;
}
/**
* JDK1.5-replacement of {@link Arrays#copyOf(char[], int)}
*/
public static State[] copyOf(State[] original, int newLength) {
State[] copy = new State[newLength];
System.arraycopy(original, 0, copy, 0,
Math.min(original.length, newLength));
return copy;
}
/**
* Compare two lists of objects for reference-equality.
*/
private static boolean referenceEquals(Object[] a1, Object[] a2) {
if (a1.length != a2.length) return false;
for (int i = 0; i < a1.length; i++)
if (a1[i] != a2[i]) return false;
if (a1.length != a2.length) {
return false;
}
for (int i = 0; i < a1.length; i++) {
if (a1[i] != a2[i]) {
return false;
}
}
return true;
}
}
/**
* "register" for state interning.
* A "registry" for state interning.
*/
private HashMap<State,State> register = new HashMap<State,State>();
private HashMap<State,State> stateRegistry = new HashMap<State,State>();
/**
* Root automaton state.
@ -231,10 +195,14 @@ public final class DaciukMihovAutomatonBuilder {
private State root = new State();
/**
* Previous sequence added to the automaton in {@link #add(CharSequence)}.
* Previous sequence added to the automaton in {@link #add(CharsRef)}.
*/
private CharsRef previous;
/**
* A comparator used for enforcing sorted UTF8 order, used in assertions only.
*/
@SuppressWarnings("deprecation")
private static final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
/**
@ -243,12 +211,12 @@ public final class DaciukMihovAutomatonBuilder {
* to this automaton (the input must be sorted).
*/
public void add(CharsRef current) {
assert register != null : "Automaton already built.";
assert stateRegistry != null : "Automaton already built.";
assert previous == null
|| comparator.compare(previous, current) <= 0 : "Input must be sorted: "
|| comparator.compare(previous, current) <= 0 : "Input must be in sorted UTF-8 order: "
+ previous + " >= " + current;
assert setPrevious(current);
// Descend in the automaton (find matching prefix).
int pos = 0, max = current.length();
State next, state = root;
@ -270,11 +238,11 @@ public final class DaciukMihovAutomatonBuilder {
* @return Root automaton state.
*/
public State complete() {
if (this.register == null) throw new IllegalStateException();
if (this.stateRegistry == null) throw new IllegalStateException();
if (root.hasChildren()) replaceOrRegister(root);
register = null;
stateRegistry = null;
return root;
}
@ -293,15 +261,16 @@ public final class DaciukMihovAutomatonBuilder {
int i = 0;
int[] labels = s.labels;
for (DaciukMihovAutomatonBuilder.State target : s.states) {
converted.addTransition(new Transition(labels[i++], convert(target,
visited)));
converted.addTransition(
new Transition(labels[i++], convert(target, visited)));
}
return converted;
}
/**
* Build a minimal, deterministic automaton from a sorted list of strings.
* Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
* strings in UTF-8. These strings must be binary-sorted.
*/
public static Automaton build(Collection<BytesRef> input) {
final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
@ -313,7 +282,9 @@ public final class DaciukMihovAutomatonBuilder {
}
Automaton a = new Automaton();
a.initial = convert(builder.complete(), new IdentityHashMap<State,org.apache.lucene.util.automaton.State>());
a.initial = convert(
builder.complete(),
new IdentityHashMap<State,org.apache.lucene.util.automaton.State>());
a.deterministic = true;
return a;
}
@ -330,21 +301,21 @@ public final class DaciukMihovAutomatonBuilder {
/**
* Replace last child of <code>state</code> with an already registered state
* or register the last child state.
* or stateRegistry the last child state.
*/
private void replaceOrRegister(State state) {
final State child = state.lastChild();
if (child.hasChildren()) replaceOrRegister(child);
final State registered = register.get(child);
final State registered = stateRegistry.get(child);
if (registered != null) {
state.replaceLastChild(registered);
} else {
register.put(child, child);
stateRegistry.put(child, child);
}
}
/**
* Add a suffix of <code>current</code> starting at <code>fromIndex</code>
* (inclusive) to state <code>state</code>.

View File

@ -35,7 +35,6 @@ import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.RegExp;
@SuppressCodecs({ "SimpleText", "Memory" })
@ -257,7 +256,7 @@ public class TestTermsEnum extends LuceneTestCase {
acceptTerms.add(s2);
sortedAcceptTerms.add(new BytesRef(s2));
}
a = DaciukMihovAutomatonBuilder.build(sortedAcceptTerms);
a = BasicAutomata.makeStringUnion(sortedAcceptTerms);
}
if (random().nextBoolean()) {

View File

@ -35,13 +35,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.*;
public class TestTermsEnum2 extends LuceneTestCase {
private Directory dir;
@ -72,7 +66,7 @@ public class TestTermsEnum2 extends LuceneTestCase {
writer.addDocument(doc);
}
termsAutomaton = DaciukMihovAutomatonBuilder.build(terms);
termsAutomaton = BasicAutomata.makeStringUnion(terms);
reader = writer.getReader();
searcher = newSearcher(reader);
@ -97,7 +91,7 @@ public class TestTermsEnum2 extends LuceneTestCase {
}
}
Automaton alternate = DaciukMihovAutomatonBuilder.build(matchedTerms);
Automaton alternate = BasicAutomata.makeStringUnion(matchedTerms);
//System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + " states, sigma=" + alternate.getStartPoints().length);
//AutomatonTestUtil.minimizeSimple(alternate);
//System.out.println("minmize done");
@ -164,7 +158,7 @@ public class TestTermsEnum2 extends LuceneTestCase {
found.add(BytesRef.deepCopyOf(te.term()));
}
Automaton actual = DaciukMihovAutomatonBuilder.build(found);
Automaton actual = BasicAutomata.makeStringUnion(found);
assertTrue(BasicOperations.sameLanguage(expected, actual));
}
}

View File

@ -17,10 +17,35 @@ package org.apache.lucene.util.automaton;
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import java.util.*;
import org.apache.lucene.util.*;
import com.carrotsearch.randomizedtesting.generators.RandomInts;
public class TestBasicOperations extends LuceneTestCase {
/** Test string union. */
public void testStringUnion() {
List<BytesRef> strings = new ArrayList<BytesRef>();
for (int i = RandomInts.randomIntBetween(random(), 0, 1000); --i >= 0;) {
strings.add(new BytesRef(_TestUtil.randomUnicodeString(random())));
}
Collections.sort(strings);
Automaton union = BasicAutomata.makeStringUnion(strings);
assertTrue(union.isDeterministic());
assertTrue(BasicOperations.sameLanguage(union, naiveUnion(strings)));
}
private static Automaton naiveUnion(List<BytesRef> strings) {
Automaton [] eachIndividual = new Automaton [strings.size()];
int i = 0;
for (BytesRef bref : strings) {
eachIndividual[i++] = BasicAutomata.makeString(bref.utf8ToString());
}
return BasicOperations.union(Arrays.asList(eachIndividual));
}
public class TestBasicOperations extends LuceneTestCase {
/** Test optimization to concatenate() */
public void testSingletonConcatenate() {
Automaton singleton = BasicAutomata.makeString("prefix");