From 082a4c7436359a7a303ee6510e651cd1537f771e Mon Sep 17 00:00:00 2001
From: Michael Gibney
Date: Thu, 16 Mar 2023 10:11:58 -0400
Subject: [PATCH] simplify PrefixQuery to avoid requiring Automaton

---
 .../org/apache/lucene/search/PrefixQuery.java | 153 +++++++++++++++++-
 1 file changed, 151 insertions(+), 2 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java b/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java
index a8cba561ef0..b0122756997 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java
@@ -16,9 +16,19 @@
  */
 package org.apache.lucene.search;
 
+import java.io.IOException;
+import org.apache.lucene.index.FilteredTermsEnum;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
 
 /**
  * A Query that matches documents containing terms with a specified prefix. A PrefixQuery is built
@@ -26,7 +36,11 @@ import org.apache.lucene.util.automaton.Automaton;
  *
  * <p>This query uses the {@link MultiTermQuery#CONSTANT_SCORE_BLENDED_REWRITE} rewrite method.
  */
-public class PrefixQuery extends AutomatonQuery {
+public class PrefixQuery extends MultiTermQuery {
+  private final Term term;
+  private final BytesRef prefix;
+  private final BytesRef limit;
+  private final ByteRunAutomaton[] runAutomaton = new ByteRunAutomaton[1];
 
   /** Constructs a query for terms starting with <code>prefix</code>. */
   public PrefixQuery(Term prefix) {
@@ -37,7 +51,15 @@ public class PrefixQuery extends AutomatonQuery {
    * Constructs a query for terms starting with <code>prefix</code> using a defined RewriteMethod
    */
   public PrefixQuery(Term prefix, RewriteMethod rewriteMethod) {
-    super(prefix, toAutomaton(prefix.bytes()), true, rewriteMethod);
+    super(prefix.field(), rewriteMethod);
+    this.term = prefix;
+    BytesRef tmp = prefix.bytes();
+    byte[] backing = new byte[tmp.length + UnicodeUtil.BIG_TERM.length];
+    System.arraycopy(tmp.bytes, tmp.offset, backing, 0, tmp.length);
+    System.arraycopy(
+        UnicodeUtil.BIG_TERM.bytes, 0, backing, tmp.length, UnicodeUtil.BIG_TERM.length);
+    this.prefix = new BytesRef(backing, 0, tmp.length);
+    this.limit = new BytesRef(backing);
   }
 
   /** Build an automaton accepting all terms with the specified prefix. */
@@ -57,6 +79,114 @@ public class PrefixQuery extends AutomatonQuery {
     return automaton;
   }
 
+  @Override
+  @SuppressWarnings("fallthrough")
+  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
+    final TermsEnum te = terms.iterator();
+    final BytesRef start;
+    switch (te.seekCeil(PrefixQuery.this.prefix)) {
+      case FOUND:
+        start = PrefixQuery.this.prefix;
+        break;
+      case NOT_FOUND:
+        BytesRef term = te.term();
+        if (StringHelper.startsWith(term, PrefixQuery.this.prefix)) {
+          start = BytesRef.deepCopyOf(term);
+          break;
+        }
+        // fallthrough
+        // $CASES-OMITTED$
+      default:
+        return TermsEnum.EMPTY;
+    }
+    final TermState startState = te.termState();
+    if (te.seekCeil(PrefixQuery.this.limit) == TermsEnum.SeekStatus.END) {
+      te.seekExact(start, startState);
+      return new DirectPrefixTailTermsEnum(te, start);
+    } else {
+      BytesRef limit = te.term();
+      final int tdi = getThresholdDeterminantIdx(PrefixQuery.this.prefix, limit);
+      final int determinant = Byte.toUnsignedInt(limit.bytes[limit.offset + tdi]);
+      final int limitLength = limit.length;
+      te.seekExact(start, startState);
+      return new DirectPrefixTermsEnum(te, start, limitLength, tdi, determinant);
+    }
+  }
+
+  /**
+   * Find an index that differs between the prefix and limit. The particular index is arbitrary, but
+   * there is guaranteed to be at least one determinant index. Once we have this index, it will
+   * suffice to check this index only (regardless of how long the prefix or limit threshold term
+   * is).
+   */
+  private static int getThresholdDeterminantIdx(BytesRef prefix, BytesRef limit) {
+    for (int i = Math.min(prefix.length, limit.length) - 1; i >= 0; i--) {
+      if (prefix.bytes[prefix.offset + i] != limit.bytes[limit.offset + i]) {
+        return i;
+      }
+    }
+    throw new IllegalStateException("`limit` must not start with `prefix`");
+  }
+
+  private static final class DirectPrefixTermsEnum extends FilteredTermsEnum {
+    private final BytesRef startTerm;
+    private final int thresholdLength;
+    private final int tdi;
+    private final int determinant;
+
+    public DirectPrefixTermsEnum(
+        TermsEnum tenum,
+        BytesRef startTerm,
+        int limitLength,
+        int thresholdDeterminantIdx,
+        int determinant) {
+      super(tenum);
+      this.startTerm = startTerm;
+      this.tdi = thresholdDeterminantIdx;
+      this.thresholdLength = limitLength;
+      this.determinant = determinant;
+    }
+
+    @Override
+    protected BytesRef nextSeekTerm(BytesRef currentTerm) {
+      if (currentTerm == null) {
+        return startTerm;
+      } else {
+        return null;
+      }
+    }
+
+    @Override
+    protected AcceptStatus accept(BytesRef candidate) {
+      if (thresholdLength == candidate.length
+          && determinant == Byte.toUnsignedInt(candidate.bytes[candidate.offset + tdi])) {
+        return AcceptStatus.NO_AND_SEEK;
+      } else {
+        return AcceptStatus.YES;
+      }
+    }
+  }
+
+  private static final class DirectPrefixTailTermsEnum extends FilteredTermsEnum {
+    private final BytesRef startTerm;
+
+    public DirectPrefixTailTermsEnum(TermsEnum tenum, BytesRef startTerm) {
+      super(tenum);
+      this.startTerm = startTerm;
+    }
+
+    @Override
+    protected BytesRef nextSeekTerm(BytesRef currentTerm) {
+      assert currentTerm == null;
+      return startTerm;
+    }
+
+    @Override
+    protected AcceptStatus accept(BytesRef candidate) {
+      return AcceptStatus.YES;
+    }
+  }
+
   /** Returns the prefix of this query. */
   public Term getPrefix() {
     return term;
@@ -75,6 +205,25 @@ public class PrefixQuery extends AutomatonQuery {
     return buffer.toString();
   }
 
+  @Override
+  public void visit(QueryVisitor visitor) {
+    // build lazily. There are many cases that do not actually use automaton, so we can
+    // often avoid building it.
+    if (visitor.acceptField(field)) {
+      visitor.consumeTermsMatching(
+          this,
+          field,
+          () -> {
+            ByteRunAutomaton ret = runAutomaton[0];
+            if (ret == null) {
+              ret = new CompiledAutomaton(toAutomaton(this.prefix), false, true, true).runAutomaton;
+              runAutomaton[0] = ret;
+            }
+            return ret;
+          });
+    }
+  }
+
   @Override
   public int hashCode() {
     final int prime = 31;