Minor cleanup and improvements to DaciukMihovAutomatonBuilder (#12305)

This commit is contained in:
Greg Miller 2023-05-18 07:01:19 -07:00 committed by GitHub
parent 2facb3ae0e
commit 3e4ca4042c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 22 deletions

View File

@ -123,7 +123,9 @@ New Features
Improvements Improvements
--------------------- ---------------------
GITHUB#12245: Add support for Score Mode to `ToParentBlockJoinQuery` explain. (Marcus Eagan via Mikhail Khludnev) * GITHUB#12245: Add support for Score Mode to `ToParentBlockJoinQuery` explain. (Marcus Eagan via Mikhail Khludnev)
* GITHUB#12305: Minor cleanup and improvements to DaciukMihovAutomatonBuilder. (Greg Miller)
Optimizations Optimizations
--------------------- ---------------------

View File

@ -24,7 +24,7 @@ import java.util.IdentityHashMap;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.CharsRefBuilder;
/** /**
* Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings. The algorithm
@ -179,10 +179,10 @@ public final class DaciukMihovAutomatonBuilder {
private HashMap<State, State> stateRegistry = new HashMap<>(); private HashMap<State, State> stateRegistry = new HashMap<>();
/** Root automaton state. */ /** Root automaton state. */
private State root = new State(); private final State root = new State();
/** Previous sequence added to the automaton in {@link #add(CharsRef)}. */ /** Previous sequence added to the automaton in {@link #add(CharsRef)}. */
private CharsRef previous; private CharsRefBuilder previous;
/** A comparator used for enforcing sorted UTF8 order, used in assertions only. */ /** A comparator used for enforcing sorted UTF8 order, used in assertions only. */
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
@ -192,23 +192,33 @@ public final class DaciukMihovAutomatonBuilder {
* Add another character sequence to this automaton. The sequence must be lexicographically larger * Add another character sequence to this automaton. The sequence must be lexicographically larger
* or equal compared to any previous sequences added to this automaton (the input must be sorted). * or equal compared to any previous sequences added to this automaton (the input must be sorted).
*/ */
public void add(CharsRef current) { private void add(CharsRef current) {
if (current.length > MAX_TERM_LENGTH) { if (current.length > MAX_TERM_LENGTH) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
"This builder doesn't allow terms that are larger than 1,000 characters, got " + current); "This builder doesn't allow terms that are larger than 1,000 characters, got " + current);
} }
assert stateRegistry != null : "Automaton already built."; assert stateRegistry != null : "Automaton already built.";
assert previous == null || comparator.compare(previous, current) <= 0 assert previous == null || comparator.compare(previous.get(), current) <= 0
: "Input must be in sorted UTF-8 order: " + previous + " >= " + current; : "Input must be in sorted UTF-8 order: " + previous + " >= " + current;
assert setPrevious(current); assert setPrevious(current);
// Descend in the automaton (find matching prefix). // Descend in the automaton (find matching prefix).
int pos = 0, max = current.length(); int pos = 0, max = current.length();
State next, state = root; State state = root;
while (pos < max && (next = state.lastChild(Character.codePointAt(current, pos))) != null) { for (; ; ) {
assert pos <= max;
if (pos == max) {
break;
}
int codePoint = Character.codePointAt(current, pos);
State next = state.lastChild(codePoint);
if (next == null) {
break;
}
state = next; state = next;
// todo, optimize me pos += Character.charCount(codePoint);
pos += Character.charCount(Character.codePointAt(current, pos));
} }
if (state.hasChildren()) replaceOrRegister(state); if (state.hasChildren()) replaceOrRegister(state);
@ -222,7 +232,7 @@ public final class DaciukMihovAutomatonBuilder {
* *
* @return Root automaton state. * @return Root automaton state.
*/ */
public State complete() { private State complete() {
if (this.stateRegistry == null) throw new IllegalStateException(); if (this.stateRegistry == null) throw new IllegalStateException();
if (root.hasChildren()) replaceOrRegister(root); if (root.hasChildren()) replaceOrRegister(root);
@ -260,27 +270,24 @@ public final class DaciukMihovAutomatonBuilder {
public static Automaton build(Collection<BytesRef> input) { public static Automaton build(Collection<BytesRef> input) {
final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder(); final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
char[] chars = new char[0]; CharsRefBuilder current = new CharsRefBuilder();
CharsRef ref = new CharsRef();
for (BytesRef b : input) { for (BytesRef b : input) {
chars = ArrayUtil.grow(chars, b.length); current.copyUTF8Bytes(b);
final int len = UnicodeUtil.UTF8toUTF16(b, chars); builder.add(current.get());
ref.chars = chars;
ref.length = len;
builder.add(ref);
} }
Automaton.Builder a = new Automaton.Builder(); Automaton.Builder a = new Automaton.Builder();
convert(a, builder.complete(), new IdentityHashMap<State, Integer>()); convert(a, builder.complete(), new IdentityHashMap<>());
return a.finish(); return a.finish();
} }
/** Copy <code>current</code> into an internal buffer. */ /** Copy <code>current</code> into an internal buffer. */
private boolean setPrevious(CharsRef current) { private boolean setPrevious(CharsRef current) {
// don't need to copy, once we fix https://issues.apache.org/jira/browse/LUCENE-3277 if (previous == null) {
// still, called only from assert previous = new CharsRefBuilder();
previous = CharsRef.deepCopyOf(current); }
previous.copyChars(current);
return true; return true;
} }