From eb84c04052eef7ab6ebe187c421a4d870f318c41 Mon Sep 17 00:00:00 2001 From: Bruno Roustant Date: Mon, 13 Jan 2020 11:11:35 +0100 Subject: [PATCH] LUCENE-9125: Optimize Automaton.step() with binary search and introduce Automaton.next(). --- lucene/CHANGES.txt | 2 + .../lucene/util/automaton/Automaton.java | 81 ++++++++++++++++--- .../automaton/MinimizationOperations.java | 5 +- .../lucene/util/automaton/RunAutomaton.java | 5 +- 4 files changed, 78 insertions(+), 15 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 449a8931137..92648c460f3 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -42,6 +42,8 @@ Optimizations * LUCENE-9113: Faster merging of SORTED/SORTED_SET doc values. (Adrien Grand) +* LUCENE-9125: Optimize Automaton.step() with binary search and introduce Automaton.next(). (Bruno Roustant) + Bug Fixes --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java index 30069402f4d..2758b162825 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java @@ -656,22 +656,77 @@ public class Automaton implements Accountable { * @return destination state, -1 if no matching outgoing transition */ public int step(int state, int label) { + return next(state, 0, label, null); + } + + /** + * Looks for the next transition that matches the provided label, assuming determinism. + *

<p> + * This method is similar to {@link #step(int, int)} but is used more efficiently + * when iterating over multiple transitions from the same source state. It keeps + * the latest reached transition index in {@code transition.transitionUpto} so + * the next call to this method can continue from there instead of restarting + * from the first transition. + * + * @param transition The transition to start the lookup from (inclusive, using its + * {@link Transition#source} and {@link Transition#transitionUpto}). + * It is updated with the matched transition; + * or with {@link Transition#dest} = -1 if no match. + * @param label The codepoint to look up. + * @return The destination state; or -1 if no matching outgoing transition. + */ + public int next(Transition transition, int label) { + return next(transition.source, transition.transitionUpto, label, transition); + } + + /** + * Looks for the next transition that matches the provided label, assuming determinism. + * + * @param state The source state. + * @param fromTransitionIndex The transition index to start the lookup from (inclusive); negative interpreted as 0. + * @param label The codepoint to look up. + * @param transition The output transition to update with the matching transition; or null for no update. + * @return The destination state; or -1 if no matching outgoing transition. 
+ */ + private int next(int state, int fromTransitionIndex, int label, Transition transition) { assert state >= 0; assert label >= 0; - int trans = states[2*state]; - int limit = trans + 3*states[2*state+1]; - // TODO: we could do bin search; transitions are sorted - while (trans < limit) { - int dest = transitions[trans]; - int min = transitions[trans+1]; - int max = transitions[trans+2]; - if (min <= label && label <= max) { - return dest; - } - trans += 3; - } + int stateIndex = 2 * state; + int firstTransitionIndex = states[stateIndex]; + int numTransitions = states[stateIndex + 1]; - return -1; + // Since transitions are sorted, + // binary search the transition for which label is within [minLabel, maxLabel]. + int low = Math.max(fromTransitionIndex, 0); + int high = numTransitions - 1; + while (low <= high) { + int mid = (low + high) >>> 1; + int transitionIndex = firstTransitionIndex + 3 * mid; + int minLabel = transitions[transitionIndex + 1]; + if (minLabel > label) { + high = mid - 1; + } else { + int maxLabel = transitions[transitionIndex + 2]; + if (maxLabel < label){ + low = mid + 1; + } else { + int destState = transitions[transitionIndex]; + if (transition != null) { + transition.dest = destState; + transition.min = minLabel; + transition.max = maxLabel; + transition.transitionUpto = mid; + } + return destState; + } + } + } + int destState = -1; + if (transition != null) { + transition.dest = destState; + transition.transitionUpto = low; + } + return destState; } /** Records new states and transitions and then {@link diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java index 979f7c52988..1962731b266 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java @@ -94,12 +94,15 @@ final public class 
MinimizationOperations { } } // find initial partition and reverse edges + Transition transition = new Transition(); for (int q = 0; q < statesLen; q++) { final int j = a.isAccept(q) ? 0 : 1; partition[j].add(q); block[q] = j; + transition.source = q; + transition.transitionUpto = -1; for (int x = 0; x < sigmaLen; x++) { - final ArrayList<Integer>[] r = reverse[a.step(q, sigma[x])]; + final ArrayList<Integer>[] r = reverse[a.next(transition, sigma[x])]; if (r[x] == null) { r[x] = new ArrayList<>(); } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java index a42588759f6..e105ac3aaeb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java @@ -78,10 +78,13 @@ public abstract class RunAutomaton implements Accountable { accept = new boolean[size]; transitions = new int[size * points.length]; Arrays.fill(transitions, -1); + Transition transition = new Transition(); for (int n=0;n