mirror of https://github.com/apache/lucene.git
Minor cleanup and improvements to DaciukMihovAutomatonBuilder (#12305)
This commit is contained in:
parent
2facb3ae0e
commit
3e4ca4042c
|
@ -123,7 +123,9 @@ New Features
|
||||||
Improvements
|
Improvements
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
GITHUB#12245: Add support for Score Mode to `ToParentBlockJoinQuery` explain. (Marcus Eagan via Mikhail Khludnev)
|
* GITHUB#12245: Add support for Score Mode to `ToParentBlockJoinQuery` explain. (Marcus Eagan via Mikhail Khludnev)
|
||||||
|
|
||||||
|
* GITHUB#12305: Minor cleanup and improvements to DaciukMihovAutomatonBuilder. (Greg Miller)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
|
|
|
@ -24,7 +24,7 @@ import java.util.IdentityHashMap;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.CharsRef;
|
import org.apache.lucene.util.CharsRef;
|
||||||
import org.apache.lucene.util.UnicodeUtil;
|
import org.apache.lucene.util.CharsRefBuilder;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm
|
* Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm
|
||||||
|
@ -179,10 +179,10 @@ public final class DaciukMihovAutomatonBuilder {
|
||||||
private HashMap<State, State> stateRegistry = new HashMap<>();
|
private HashMap<State, State> stateRegistry = new HashMap<>();
|
||||||
|
|
||||||
/** Root automaton state. */
|
/** Root automaton state. */
|
||||||
private State root = new State();
|
private final State root = new State();
|
||||||
|
|
||||||
/** Previous sequence added to the automaton in {@link #add(CharsRef)}. */
|
/** Previous sequence added to the automaton in {@link #add(CharsRef)}. */
|
||||||
private CharsRef previous;
|
private CharsRefBuilder previous;
|
||||||
|
|
||||||
/** A comparator used for enforcing sorted UTF8 order, used in assertions only. */
|
/** A comparator used for enforcing sorted UTF8 order, used in assertions only. */
|
||||||
@SuppressWarnings("deprecation")
|
@SuppressWarnings("deprecation")
|
||||||
|
@ -192,23 +192,33 @@ public final class DaciukMihovAutomatonBuilder {
|
||||||
* Add another character sequence to this automaton. The sequence must be lexicographically larger
|
* Add another character sequence to this automaton. The sequence must be lexicographically larger
|
||||||
* or equal compared to any previous sequences added to this automaton (the input must be sorted).
|
* or equal compared to any previous sequences added to this automaton (the input must be sorted).
|
||||||
*/
|
*/
|
||||||
public void add(CharsRef current) {
|
private void add(CharsRef current) {
|
||||||
if (current.length > MAX_TERM_LENGTH) {
|
if (current.length > MAX_TERM_LENGTH) {
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException(
|
||||||
"This builder doesn't allow terms that are larger than 1,000 characters, got " + current);
|
"This builder doesn't allow terms that are larger than 1,000 characters, got " + current);
|
||||||
}
|
}
|
||||||
assert stateRegistry != null : "Automaton already built.";
|
assert stateRegistry != null : "Automaton already built.";
|
||||||
assert previous == null || comparator.compare(previous, current) <= 0
|
assert previous == null || comparator.compare(previous.get(), current) <= 0
|
||||||
: "Input must be in sorted UTF-8 order: " + previous + " >= " + current;
|
: "Input must be in sorted UTF-8 order: " + previous + " >= " + current;
|
||||||
assert setPrevious(current);
|
assert setPrevious(current);
|
||||||
|
|
||||||
// Descend in the automaton (find matching prefix).
|
// Descend in the automaton (find matching prefix).
|
||||||
int pos = 0, max = current.length();
|
int pos = 0, max = current.length();
|
||||||
State next, state = root;
|
State state = root;
|
||||||
while (pos < max && (next = state.lastChild(Character.codePointAt(current, pos))) != null) {
|
for (; ; ) {
|
||||||
|
assert pos <= max;
|
||||||
|
if (pos == max) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int codePoint = Character.codePointAt(current, pos);
|
||||||
|
State next = state.lastChild(codePoint);
|
||||||
|
if (next == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
state = next;
|
state = next;
|
||||||
// todo, optimize me
|
pos += Character.charCount(codePoint);
|
||||||
pos += Character.charCount(Character.codePointAt(current, pos));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (state.hasChildren()) replaceOrRegister(state);
|
if (state.hasChildren()) replaceOrRegister(state);
|
||||||
|
@ -222,7 +232,7 @@ public final class DaciukMihovAutomatonBuilder {
|
||||||
*
|
*
|
||||||
* @return Root automaton state.
|
* @return Root automaton state.
|
||||||
*/
|
*/
|
||||||
public State complete() {
|
private State complete() {
|
||||||
if (this.stateRegistry == null) throw new IllegalStateException();
|
if (this.stateRegistry == null) throw new IllegalStateException();
|
||||||
|
|
||||||
if (root.hasChildren()) replaceOrRegister(root);
|
if (root.hasChildren()) replaceOrRegister(root);
|
||||||
|
@ -260,27 +270,24 @@ public final class DaciukMihovAutomatonBuilder {
|
||||||
public static Automaton build(Collection<BytesRef> input) {
|
public static Automaton build(Collection<BytesRef> input) {
|
||||||
final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
|
final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
|
||||||
|
|
||||||
char[] chars = new char[0];
|
CharsRefBuilder current = new CharsRefBuilder();
|
||||||
CharsRef ref = new CharsRef();
|
|
||||||
for (BytesRef b : input) {
|
for (BytesRef b : input) {
|
||||||
chars = ArrayUtil.grow(chars, b.length);
|
current.copyUTF8Bytes(b);
|
||||||
final int len = UnicodeUtil.UTF8toUTF16(b, chars);
|
builder.add(current.get());
|
||||||
ref.chars = chars;
|
|
||||||
ref.length = len;
|
|
||||||
builder.add(ref);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Automaton.Builder a = new Automaton.Builder();
|
Automaton.Builder a = new Automaton.Builder();
|
||||||
convert(a, builder.complete(), new IdentityHashMap<State, Integer>());
|
convert(a, builder.complete(), new IdentityHashMap<>());
|
||||||
|
|
||||||
return a.finish();
|
return a.finish();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Copy <code>current</code> into an internal buffer. */
|
/** Copy <code>current</code> into an internal buffer. */
|
||||||
private boolean setPrevious(CharsRef current) {
|
private boolean setPrevious(CharsRef current) {
|
||||||
// don't need to copy, once we fix https://issues.apache.org/jira/browse/LUCENE-3277
|
if (previous == null) {
|
||||||
// still, called only from assert
|
previous = new CharsRefBuilder();
|
||||||
previous = CharsRef.deepCopyOf(current);
|
}
|
||||||
|
previous.copyChars(current);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue