simplify PrefixQuery to avoid requiring Automaton

2023-03-16 10:11:58 -04:00 · 2023-03-16 10:11:58 -04:00 · 082a4c7436
parent e4d8a5c5cb
commit 082a4c7436
1 changed files with 151 additions and 2 deletions
--- a/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java
@ -16,9 +16,19 @@
 */
 package org.apache.lucene.search;

+import java.io.IOException;
+import org.apache.lucene.index.FilteredTermsEnum;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;

 /**
 * A Query that matches documents containing terms with a specified prefix. A PrefixQuery is built
@ -26,7 +36,11 @@ import org.apache.lucene.util.automaton.Automaton;
 *
 * <p>This query uses the {@link MultiTermQuery#CONSTANT_SCORE_BLENDED_REWRITE} rewrite method.
 */
-public class PrefixQuery extends AutomatonQuery {
+public class PrefixQuery extends MultiTermQuery {
+  private final Term term;
+  private final BytesRef prefix;
+  private final BytesRef limit;
+  private final ByteRunAutomaton[] runAutomaton = new ByteRunAutomaton[1];

  /** Constructs a query for terms starting with <code>prefix</code>. */
  public PrefixQuery(Term prefix) {
@ -37,7 +51,15 @@ public class PrefixQuery extends AutomatonQuery {
   * Constructs a query for terms starting with <code>prefix</code> using a defined RewriteMethod
   */
  public PrefixQuery(Term prefix, RewriteMethod rewriteMethod) {
-    super(prefix, toAutomaton(prefix.bytes()), true, rewriteMethod);
+    super(prefix.field(), rewriteMethod);
+    this.term = prefix;
+    BytesRef tmp = prefix.bytes();
+    byte[] backing = new byte[tmp.length + UnicodeUtil.BIG_TERM.length];
+    System.arraycopy(tmp.bytes, tmp.offset, backing, 0, tmp.length);
+    System.arraycopy(
+        UnicodeUtil.BIG_TERM.bytes, 0, backing, tmp.length, UnicodeUtil.BIG_TERM.length);
+    this.prefix = new BytesRef(backing, 0, tmp.length);
+    this.limit = new BytesRef(backing);
  }

  /** Build an automaton accepting all terms with the specified prefix. */
@ -57,6 +79,114 @@ public class PrefixQuery extends AutomatonQuery {
    return automaton;
  }

+  /**
+   * Returns an enum over the terms of {@code terms} that start with this query's prefix.
+   *
+   * <p>The underlying enum is first positioned on the smallest term at or after the prefix. It is
+   * then probed for the smallest term at or after {@code limit} (the prefix followed by {@link
+   * UnicodeUtil#BIG_TERM}). If such a boundary term exists, the returned enum stops when it
+   * reaches it; otherwise the returned enum simply runs to the end of the terms.
+   *
+   * @param terms the terms to enumerate
+   * @param atts not used by this implementation
+   * @return an enum starting at the first term with the prefix, or {@link TermsEnum#EMPTY} when
+   *     no term starts with the prefix
+   * @throws IOException if the underlying terms enum throws it
+   */
+  @Override
+  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
+    final TermsEnum te = terms.iterator();
+    final TermsEnum.SeekStatus status = te.seekCeil(prefix);
+    if (status == TermsEnum.SeekStatus.END) {
+      // no term sorts at or after the prefix
+      return TermsEnum.EMPTY;
+    }
+    final BytesRef start;
+    if (status == TermsEnum.SeekStatus.FOUND) {
+      // the prefix itself is a term
+      start = prefix;
+    } else {
+      final BytesRef ceiling = te.term();
+      if (StringHelper.startsWith(ceiling, prefix) == false) {
+        // the next term after the prefix does not share it, so nothing matches
+        return TermsEnum.EMPTY;
+      }
+      // copied, presumably because the enum may reuse this BytesRef once it is repositioned
+      start = BytesRef.deepCopyOf(ceiling);
+    }
+    // remember where the first match is so we can return to it after probing for the boundary
+    final TermState startState = te.termState();
+    final boolean noTermAtOrPastLimit = te.seekCeil(limit) == TermsEnum.SeekStatus.END;
+    if (noTermAtOrPastLimit) {
+      te.seekExact(start, startState);
+      return new DirectPrefixTailTermsEnum(te, start);
+    }
+    // read everything needed from the boundary term before moving the enum back to `start`
+    final BytesRef boundary = te.term();
+    final int tdi = getThresholdDeterminantIdx(prefix, boundary);
+    final int determinant = Byte.toUnsignedInt(boundary.bytes[boundary.offset + tdi]);
+    final int boundaryLength = boundary.length;
+    te.seekExact(start, startState);
+    return new DirectPrefixTermsEnum(te, start, boundaryLength, tdi, determinant);
+  }
+
+  /**
+   * Finds a byte position at which {@code prefix} and {@code limit} differ.
+   *
+   * <p>The search runs from the last position both refs have in common down to position 0 and
+   * returns the first difference it meets, so the result is the highest differing position within
+   * the common length. Which differing position is chosen is arbitrary for the caller: once one is
+   * known, checking that single byte suffices to tell the limit term apart from any term that
+   * starts with the prefix, regardless of how long the prefix or the limit term is.
+   *
+   * @param prefix the query prefix
+   * @param limit the boundary term; expected not to start with {@code prefix}
+   * @return a position, relative to each ref's own {@code offset}, where the two refs differ
+   * @throws IllegalStateException if the refs agree on every position of their common length
+   */
+  private static int getThresholdDeterminantIdx(BytesRef prefix, BytesRef limit) {
+    for (int i = Math.min(prefix.length, limit.length) - 1; i >= 0; i--) {
+      // index each ref relative to its own offset; a BytesRef need not begin at bytes[0]
+      if (prefix.bytes[prefix.offset + i] != limit.bytes[limit.offset + i]) {
+        return i;
+      }
+    }
+    throw new IllegalStateException("`limit` must not start with `prefix`");
+  }
+
+  /**
+   * Filtered enum that seeks once to {@code startTerm} and accepts every term until it meets one
+   * that looks like the boundary term. A term is treated as the boundary when its length equals
+   * the boundary length and its byte at the determinant index equals the boundary's byte there.
+   */
+  private static final class DirectPrefixTermsEnum extends FilteredTermsEnum {
+    private final BytesRef startTerm;
+    private final int thresholdLength;
+    private final int tdi;
+    private final int determinant;
+
+    /**
+     * @param tenum the enum to filter
+     * @param startTerm the term to seek to first
+     * @param limitLength length of the boundary term
+     * @param thresholdDeterminantIdx position (relative to a term's offset) of the byte to check
+     * @param determinant unsigned value of the boundary term's byte at that position
+     */
+    public DirectPrefixTermsEnum(
+        TermsEnum tenum,
+        BytesRef startTerm,
+        int limitLength,
+        int thresholdDeterminantIdx,
+        int determinant) {
+      super(tenum);
+      this.startTerm = startTerm;
+      this.thresholdLength = limitLength;
+      this.tdi = thresholdDeterminantIdx;
+      this.determinant = determinant;
+    }
+
+    /** Offers {@code startTerm} as the only seek target; any later request yields null. */
+    @Override
+    protected BytesRef nextSeekTerm(BytesRef currentTerm) {
+      return currentTerm == null ? startTerm : null;
+    }
+
+    /**
+     * Accepts {@code candidate} unless it matches the boundary term's length and determinant
+     * byte, in which case {@code NO_AND_SEEK} is returned. Since {@link #nextSeekTerm} has no
+     * further target by then, that presumably ends the enumeration.
+     */
+    @Override
+    protected AcceptStatus accept(BytesRef candidate) {
+      if (candidate.length != thresholdLength) {
+        return AcceptStatus.YES;
+      }
+      // only read the byte once the length matches, as the boundary length is known to cover tdi
+      final int probe = Byte.toUnsignedInt(candidate.bytes[candidate.offset + tdi]);
+      return probe == determinant ? AcceptStatus.NO_AND_SEEK : AcceptStatus.YES;
+    }
+  }
+
+  /**
+   * Filtered enum that seeks once to its start term and then accepts every remaining term
+   * unconditionally. {@code getTermsEnum} uses it when no term exists at or after the limit.
+   */
+  private static final class DirectPrefixTailTermsEnum extends FilteredTermsEnum {
+    private final BytesRef firstTerm;
+
+    /**
+     * @param tenum the enum to wrap
+     * @param startTerm the term to seek to first
+     */
+    public DirectPrefixTailTermsEnum(TermsEnum tenum, BytesRef startTerm) {
+      super(tenum);
+      firstTerm = startTerm;
+    }
+
+    /** Returns the start term; only expected to be asked before any term has been visited. */
+    @Override
+    protected BytesRef nextSeekTerm(BytesRef currentTerm) {
+      assert currentTerm == null;
+      return firstTerm;
+    }
+
+    /** Accepts every candidate. */
+    @Override
+    protected AcceptStatus accept(BytesRef candidate) {
+      return AcceptStatus.YES;
+    }
+  }
+
  /** Returns the prefix of this query. */
  public Term getPrefix() {
    return term;
@ -75,6 +205,25 @@ public class PrefixQuery extends AutomatonQuery {
    return buffer.toString();
  }

+  /**
+   * Offers this query to {@code visitor} as a terms-matching query on its field, provided the
+   * visitor accepts that field. The automaton is handed over through a supplier so that it is
+   * only compiled if the visitor actually asks for it.
+   *
+   * @param visitor the visitor to report to
+   */
+  @Override
+  public void visit(QueryVisitor visitor) {
+    if (visitor.acceptField(field) == false) {
+      return;
+    }
+    // Built lazily: there are many cases that do not actually use the automaton, so building it
+    // can often be avoided. Once built it is kept in runAutomaton[0] for later requests.
+    visitor.consumeTermsMatching(
+        this,
+        field,
+        () -> {
+          final ByteRunAutomaton cached = runAutomaton[0];
+          if (cached != null) {
+            return cached;
+          }
+          final ByteRunAutomaton built =
+              new CompiledAutomaton(toAutomaton(this.prefix), false, true, true).runAutomaton;
+          runAutomaton[0] = built;
+          return built;
+        });
+  }
+
  @Override
  public int hashCode() {
    final int prime = 31;