mirror of
https://github.com/apache/lucene.git
synced 2025-02-08 02:58:58 +00:00
LUCENE-3832: Port BasicAutomata.stringUnion from Brics to Lucene.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1355302 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f2afde8425
commit
11f2885a12
@ -9,6 +9,12 @@ http://s.apache.org/luceneversions
|
||||
|
||||
======================= Lucene 4.0.0-BETA =======================
|
||||
|
||||
New features
|
||||
|
||||
* LUCENE-3832: Added BasicAutomata.makeStringUnion method to efficiently
|
||||
create automata from a fixed collection of UTF-8 encoded BytesRef
|
||||
(Dawid Weiss, Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.
|
||||
|
@ -29,8 +29,9 @@
|
||||
|
||||
package org.apache.lucene.util.automaton;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* Construction of basic automata.
|
||||
@ -239,4 +240,25 @@ final public class BasicAutomata {
|
||||
a.deterministic = true;
|
||||
return a;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a new (deterministic and minimal) automaton that accepts the union
|
||||
* of the given collection of {@link BytesRef}s representing UTF-8 encoded
|
||||
* strings.
|
||||
*
|
||||
* @param utf8Strings
|
||||
* The input strings, UTF-8 encoded. The collection must be in sorted
|
||||
* order.
|
||||
*
|
||||
* @return An {@link Automaton} accepting all input strings. The resulting
|
||||
* automaton is codepoint based (full unicode codepoints on
|
||||
* transitions).
|
||||
*/
|
||||
public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
|
||||
if (utf8Strings.isEmpty()) {
|
||||
return makeEmpty();
|
||||
} else {
|
||||
return DaciukMihovAutomatonBuilder.build(utf8Strings);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -24,15 +24,18 @@ import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
/**
|
||||
* Builds a minimal deterministic automaton that accepts a set of strings. The
|
||||
* algorithm requires sorted input data, but is very fast (nearly linear with
|
||||
* the input size).
|
||||
* Builds a minimal, deterministic {@link Automaton} that accepts a set of
|
||||
* strings. The algorithm requires sorted input data, but is very fast
|
||||
* (nearly linear with the input size).
|
||||
*
|
||||
* @see #build(Collection)
|
||||
* @see BasicAutomata#makeStringUnion(Collection)
|
||||
*/
|
||||
public final class DaciukMihovAutomatonBuilder {
|
||||
final class DaciukMihovAutomatonBuilder {
|
||||
/**
|
||||
* DFSA state with <code>char</code> labels on transitions.
|
||||
*/
|
||||
public final static class State {
|
||||
private final static class State {
|
||||
|
||||
/** An empty set of labels. */
|
||||
private final static int[] NO_LABELS = new int[0];
|
||||
@ -63,28 +66,11 @@ public final class DaciukMihovAutomatonBuilder {
|
||||
* with <code>label</code>. If no such transition exists, returns
|
||||
* <code>null</code>.
|
||||
*/
|
||||
public State getState(int label) {
|
||||
State getState(int label) {
|
||||
final int index = Arrays.binarySearch(labels, label);
|
||||
return index >= 0 ? states[index] : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of outgoing transition labels. The array is sorted in
|
||||
* lexicographic order and indexes correspond to states returned from
|
||||
* {@link #getStates()}.
|
||||
*/
|
||||
public int[] getTransitionLabels() {
|
||||
return this.labels;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of outgoing transitions from this state. The returned
|
||||
* array must not be changed.
|
||||
*/
|
||||
public State[] getStates() {
|
||||
return this.states;
|
||||
}
|
||||
|
||||
/**
|
||||
* Two states are equal if:
|
||||
* <ul>
|
||||
@ -102,21 +88,6 @@ public final class DaciukMihovAutomatonBuilder {
|
||||
&& referenceEquals(this.states, other.states);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return <code>true</code> if this state has any children (outgoing
|
||||
* transitions).
|
||||
*/
|
||||
public boolean hasChildren() {
|
||||
return labels.length > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this state a final state in the automaton?
|
||||
*/
|
||||
public boolean isFinal() {
|
||||
return is_final;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the hash code of the <i>current</i> status of this state.
|
||||
*/
|
||||
@ -141,6 +112,14 @@ public final class DaciukMihovAutomatonBuilder {
|
||||
return hash;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return <code>true</code> if this state has any children (outgoing
|
||||
* transitions).
|
||||
*/
|
||||
boolean hasChildren() {
|
||||
return labels.length > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new outgoing transition labeled <code>label</code> and return
|
||||
* the newly created target state for this transition.
|
||||
@ -149,9 +128,9 @@ public final class DaciukMihovAutomatonBuilder {
|
||||
assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: "
|
||||
+ label;
|
||||
|
||||
labels = copyOf(labels, labels.length + 1);
|
||||
states = copyOf(states, states.length + 1);
|
||||
|
||||
labels = Arrays.copyOf(labels, labels.length + 1);
|
||||
states = Arrays.copyOf(states, states.length + 1);
|
||||
|
||||
labels[labels.length - 1] = label;
|
||||
return states[states.length - 1] = new State();
|
||||
}
|
||||
@ -187,43 +166,28 @@ public final class DaciukMihovAutomatonBuilder {
|
||||
states[states.length - 1] = state;
|
||||
}
|
||||
|
||||
/**
|
||||
* JDK1.5-replacement of {@link Arrays#copyOf(int[], int)}
|
||||
*/
|
||||
private static int[] copyOf(int[] original, int newLength) {
|
||||
int[] copy = new int[newLength];
|
||||
System.arraycopy(original, 0, copy, 0,
|
||||
Math.min(original.length, newLength));
|
||||
return copy;
|
||||
}
|
||||
|
||||
/**
|
||||
* JDK1.5-replacement of {@link Arrays#copyOf(char[], int)}
|
||||
*/
|
||||
public static State[] copyOf(State[] original, int newLength) {
|
||||
State[] copy = new State[newLength];
|
||||
System.arraycopy(original, 0, copy, 0,
|
||||
Math.min(original.length, newLength));
|
||||
return copy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare two lists of objects for reference-equality.
|
||||
*/
|
||||
private static boolean referenceEquals(Object[] a1, Object[] a2) {
|
||||
if (a1.length != a2.length) return false;
|
||||
|
||||
for (int i = 0; i < a1.length; i++)
|
||||
if (a1[i] != a2[i]) return false;
|
||||
|
||||
if (a1.length != a2.length) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < a1.length; i++) {
|
||||
if (a1[i] != a2[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* "register" for state interning.
|
||||
* A "registry" for state interning.
|
||||
*/
|
||||
private HashMap<State,State> register = new HashMap<State,State>();
|
||||
private HashMap<State,State> stateRegistry = new HashMap<State,State>();
|
||||
|
||||
/**
|
||||
* Root automaton state.
|
||||
@ -231,10 +195,14 @@ public final class DaciukMihovAutomatonBuilder {
|
||||
private State root = new State();
|
||||
|
||||
/**
|
||||
* Previous sequence added to the automaton in {@link #add(CharSequence)}.
|
||||
* Previous sequence added to the automaton in {@link #add(CharsRef)}.
|
||||
*/
|
||||
private CharsRef previous;
|
||||
|
||||
|
||||
/**
|
||||
* A comparator used for enforcing sorted UTF8 order, used in assertions only.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
private static final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
|
||||
|
||||
/**
|
||||
@ -243,12 +211,12 @@ public final class DaciukMihovAutomatonBuilder {
|
||||
* to this automaton (the input must be sorted).
|
||||
*/
|
||||
public void add(CharsRef current) {
|
||||
assert register != null : "Automaton already built.";
|
||||
assert stateRegistry != null : "Automaton already built.";
|
||||
assert previous == null
|
||||
|| comparator.compare(previous, current) <= 0 : "Input must be sorted: "
|
||||
|| comparator.compare(previous, current) <= 0 : "Input must be in sorted UTF-8 order: "
|
||||
+ previous + " >= " + current;
|
||||
assert setPrevious(current);
|
||||
|
||||
|
||||
// Descend in the automaton (find matching prefix).
|
||||
int pos = 0, max = current.length();
|
||||
State next, state = root;
|
||||
@ -270,11 +238,11 @@ public final class DaciukMihovAutomatonBuilder {
|
||||
* @return Root automaton state.
|
||||
*/
|
||||
public State complete() {
|
||||
if (this.register == null) throw new IllegalStateException();
|
||||
if (this.stateRegistry == null) throw new IllegalStateException();
|
||||
|
||||
if (root.hasChildren()) replaceOrRegister(root);
|
||||
|
||||
register = null;
|
||||
stateRegistry = null;
|
||||
return root;
|
||||
}
|
||||
|
||||
@ -293,15 +261,16 @@ public final class DaciukMihovAutomatonBuilder {
|
||||
int i = 0;
|
||||
int[] labels = s.labels;
|
||||
for (DaciukMihovAutomatonBuilder.State target : s.states) {
|
||||
converted.addTransition(new Transition(labels[i++], convert(target,
|
||||
visited)));
|
||||
converted.addTransition(
|
||||
new Transition(labels[i++], convert(target, visited)));
|
||||
}
|
||||
|
||||
return converted;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Build a minimal, deterministic automaton from a sorted list of strings.
|
||||
* Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
|
||||
* strings in UTF-8. These strings must be binary-sorted.
|
||||
*/
|
||||
public static Automaton build(Collection<BytesRef> input) {
|
||||
final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
|
||||
@ -313,7 +282,9 @@ public final class DaciukMihovAutomatonBuilder {
|
||||
}
|
||||
|
||||
Automaton a = new Automaton();
|
||||
a.initial = convert(builder.complete(), new IdentityHashMap<State,org.apache.lucene.util.automaton.State>());
|
||||
a.initial = convert(
|
||||
builder.complete(),
|
||||
new IdentityHashMap<State,org.apache.lucene.util.automaton.State>());
|
||||
a.deterministic = true;
|
||||
return a;
|
||||
}
|
||||
@ -330,21 +301,21 @@ public final class DaciukMihovAutomatonBuilder {
|
||||
|
||||
/**
|
||||
* Replace last child of <code>state</code> with an already registered state
|
||||
* or register the last child state.
|
||||
* or stateRegistry the last child state.
|
||||
*/
|
||||
private void replaceOrRegister(State state) {
|
||||
final State child = state.lastChild();
|
||||
|
||||
if (child.hasChildren()) replaceOrRegister(child);
|
||||
|
||||
final State registered = register.get(child);
|
||||
final State registered = stateRegistry.get(child);
|
||||
if (registered != null) {
|
||||
state.replaceLastChild(registered);
|
||||
} else {
|
||||
register.put(child, child);
|
||||
stateRegistry.put(child, child);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Add a suffix of <code>current</code> starting at <code>fromIndex</code>
|
||||
* (inclusive) to state <code>state</code>.
|
@ -35,7 +35,6 @@ import org.apache.lucene.util._TestUtil;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.BasicAutomata;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
@SuppressCodecs({ "SimpleText", "Memory" })
|
||||
@ -257,7 +256,7 @@ public class TestTermsEnum extends LuceneTestCase {
|
||||
acceptTerms.add(s2);
|
||||
sortedAcceptTerms.add(new BytesRef(s2));
|
||||
}
|
||||
a = DaciukMihovAutomatonBuilder.build(sortedAcceptTerms);
|
||||
a = BasicAutomata.makeStringUnion(sortedAcceptTerms);
|
||||
}
|
||||
|
||||
if (random().nextBoolean()) {
|
||||
|
@ -35,13 +35,7 @@ import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.AutomatonTestUtil;
|
||||
import org.apache.lucene.util.automaton.BasicOperations;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
import org.apache.lucene.util.automaton.SpecialOperations;
|
||||
import org.apache.lucene.util.automaton.*;
|
||||
|
||||
public class TestTermsEnum2 extends LuceneTestCase {
|
||||
private Directory dir;
|
||||
@ -72,7 +66,7 @@ public class TestTermsEnum2 extends LuceneTestCase {
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
termsAutomaton = DaciukMihovAutomatonBuilder.build(terms);
|
||||
termsAutomaton = BasicAutomata.makeStringUnion(terms);
|
||||
|
||||
reader = writer.getReader();
|
||||
searcher = newSearcher(reader);
|
||||
@ -97,7 +91,7 @@ public class TestTermsEnum2 extends LuceneTestCase {
|
||||
}
|
||||
}
|
||||
|
||||
Automaton alternate = DaciukMihovAutomatonBuilder.build(matchedTerms);
|
||||
Automaton alternate = BasicAutomata.makeStringUnion(matchedTerms);
|
||||
//System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + " states, sigma=" + alternate.getStartPoints().length);
|
||||
//AutomatonTestUtil.minimizeSimple(alternate);
|
||||
//System.out.println("minmize done");
|
||||
@ -164,7 +158,7 @@ public class TestTermsEnum2 extends LuceneTestCase {
|
||||
found.add(BytesRef.deepCopyOf(te.term()));
|
||||
}
|
||||
|
||||
Automaton actual = DaciukMihovAutomatonBuilder.build(found);
|
||||
Automaton actual = BasicAutomata.makeStringUnion(found);
|
||||
assertTrue(BasicOperations.sameLanguage(expected, actual));
|
||||
}
|
||||
}
|
||||
|
@ -17,10 +17,35 @@ package org.apache.lucene.util.automaton;
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.lucene.util.*;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomInts;
|
||||
|
||||
public class TestBasicOperations extends LuceneTestCase {
|
||||
/** Test string union. */
|
||||
public void testStringUnion() {
|
||||
List<BytesRef> strings = new ArrayList<BytesRef>();
|
||||
for (int i = RandomInts.randomIntBetween(random(), 0, 1000); --i >= 0;) {
|
||||
strings.add(new BytesRef(_TestUtil.randomUnicodeString(random())));
|
||||
}
|
||||
|
||||
Collections.sort(strings);
|
||||
Automaton union = BasicAutomata.makeStringUnion(strings);
|
||||
assertTrue(union.isDeterministic());
|
||||
assertTrue(BasicOperations.sameLanguage(union, naiveUnion(strings)));
|
||||
}
|
||||
|
||||
private static Automaton naiveUnion(List<BytesRef> strings) {
|
||||
Automaton [] eachIndividual = new Automaton [strings.size()];
|
||||
int i = 0;
|
||||
for (BytesRef bref : strings) {
|
||||
eachIndividual[i++] = BasicAutomata.makeString(bref.utf8ToString());
|
||||
}
|
||||
return BasicOperations.union(Arrays.asList(eachIndividual));
|
||||
}
|
||||
|
||||
public class TestBasicOperations extends LuceneTestCase {
|
||||
/** Test optimization to concatenate() */
|
||||
public void testSingletonConcatenate() {
|
||||
Automaton singleton = BasicAutomata.makeString("prefix");
|
||||
|
Loading…
x
Reference in New Issue
Block a user