mirror of https://github.com/apache/lucene.git
simplify PrefixQuery to avoid requiring Automaton
This commit is contained in:
parent
e4d8a5c5cb
commit
082a4c7436
|
@ -16,9 +16,19 @@
|
|||
*/
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.index.FilteredTermsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
|
||||
/**
|
||||
* A Query that matches documents containing terms with a specified prefix. A PrefixQuery is built
|
||||
|
@ -26,7 +36,11 @@ import org.apache.lucene.util.automaton.Automaton;
|
|||
*
|
||||
* <p>This query uses the {@link MultiTermQuery#CONSTANT_SCORE_BLENDED_REWRITE} rewrite method.
|
||||
*/
|
||||
public class PrefixQuery extends AutomatonQuery {
|
||||
public class PrefixQuery extends MultiTermQuery {
|
||||
private final Term term;
|
||||
private final BytesRef prefix;
|
||||
private final BytesRef limit;
|
||||
private final ByteRunAutomaton[] runAutomaton = new ByteRunAutomaton[1];
|
||||
|
||||
/** Constructs a query for terms starting with <code>prefix</code>. */
|
||||
public PrefixQuery(Term prefix) {
|
||||
|
@ -37,7 +51,15 @@ public class PrefixQuery extends AutomatonQuery {
|
|||
* Constructs a query for terms starting with <code>prefix</code> using a defined RewriteMethod
|
||||
*/
|
||||
public PrefixQuery(Term prefix, RewriteMethod rewriteMethod) {
|
||||
super(prefix, toAutomaton(prefix.bytes()), true, rewriteMethod);
|
||||
super(prefix.field(), rewriteMethod);
|
||||
this.term = prefix;
|
||||
BytesRef tmp = prefix.bytes();
|
||||
byte[] backing = new byte[tmp.length + UnicodeUtil.BIG_TERM.length];
|
||||
System.arraycopy(tmp.bytes, tmp.offset, backing, 0, tmp.length);
|
||||
System.arraycopy(
|
||||
UnicodeUtil.BIG_TERM.bytes, 0, backing, tmp.length, UnicodeUtil.BIG_TERM.length);
|
||||
this.prefix = new BytesRef(backing, 0, tmp.length);
|
||||
this.limit = new BytesRef(backing);
|
||||
}
|
||||
|
||||
/** Build an automaton accepting all terms with the specified prefix. */
|
||||
|
@ -57,6 +79,114 @@ public class PrefixQuery extends AutomatonQuery {
|
|||
return automaton;
|
||||
}
|
||||
|
||||
@Override
|
||||
@SuppressWarnings("fallthrough")
|
||||
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
|
||||
final TermsEnum te = terms.iterator();
|
||||
final BytesRef start;
|
||||
switch (te.seekCeil(PrefixQuery.this.prefix)) {
|
||||
case FOUND:
|
||||
start = PrefixQuery.this.prefix;
|
||||
break;
|
||||
case NOT_FOUND:
|
||||
BytesRef term = te.term();
|
||||
if (StringHelper.startsWith(term, PrefixQuery.this.prefix)) {
|
||||
start = BytesRef.deepCopyOf(term);
|
||||
break;
|
||||
}
|
||||
// fallthrough
|
||||
// $CASES-OMITTED$
|
||||
default:
|
||||
return TermsEnum.EMPTY;
|
||||
}
|
||||
final TermState startState = te.termState();
|
||||
if (te.seekCeil(PrefixQuery.this.limit) == TermsEnum.SeekStatus.END) {
|
||||
te.seekExact(start, startState);
|
||||
return new DirectPrefixTailTermsEnum(te, start);
|
||||
} else {
|
||||
BytesRef limit = te.term();
|
||||
final int tdi = getThresholdDeterminantIdx(PrefixQuery.this.prefix, limit);
|
||||
final int determinant = Byte.toUnsignedInt(limit.bytes[limit.offset + tdi]);
|
||||
final int limitLength = limit.length;
|
||||
te.seekExact(start, startState);
|
||||
return new DirectPrefixTermsEnum(te, start, limitLength, tdi, determinant);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Find an index that differs between the prefix and limit. The particular index is arbitrary, but
|
||||
* there is guaranteed to be at least one determinant index. Once we have this index, it will
|
||||
* suffice to check this index only (regardless of how long the prefix or limit threshold term
|
||||
* is).
|
||||
*/
|
||||
private static int getThresholdDeterminantIdx(BytesRef prefix, BytesRef limit) {
|
||||
for (int i = Math.min(prefix.length, limit.length) - 1; i >= 0; i--) {
|
||||
if (prefix.bytes[i] != limit.bytes[limit.offset + i]) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
throw new IllegalStateException("`limit` must not start with `prefix`");
|
||||
}
|
||||
|
||||
private static final class DirectPrefixTermsEnum extends FilteredTermsEnum {
|
||||
private final BytesRef startTerm;
|
||||
private final int thresholdLength;
|
||||
private final int tdi;
|
||||
private final int determinant;
|
||||
|
||||
public DirectPrefixTermsEnum(
|
||||
TermsEnum tenum,
|
||||
BytesRef startTerm,
|
||||
int limitLength,
|
||||
int thresholdDeterminantIdx,
|
||||
int determinant) {
|
||||
super(tenum);
|
||||
this.startTerm = startTerm;
|
||||
this.tdi = thresholdDeterminantIdx;
|
||||
this.thresholdLength = limitLength;
|
||||
this.determinant = determinant;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BytesRef nextSeekTerm(BytesRef currentTerm) {
|
||||
if (currentTerm == null) {
|
||||
return startTerm;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected AcceptStatus accept(BytesRef candidate) {
|
||||
if (thresholdLength == candidate.length
|
||||
&& determinant == Byte.toUnsignedInt(candidate.bytes[candidate.offset + tdi])) {
|
||||
return AcceptStatus.NO_AND_SEEK;
|
||||
} else {
|
||||
return AcceptStatus.YES;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static final class DirectPrefixTailTermsEnum extends FilteredTermsEnum {
|
||||
private final BytesRef startTerm;
|
||||
|
||||
public DirectPrefixTailTermsEnum(TermsEnum tenum, BytesRef startTerm) {
|
||||
super(tenum);
|
||||
this.startTerm = startTerm;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BytesRef nextSeekTerm(BytesRef currentTerm) {
|
||||
assert currentTerm == null;
|
||||
return startTerm;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected AcceptStatus accept(BytesRef candidate) {
|
||||
return AcceptStatus.YES;
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the prefix of this query. */
|
||||
public Term getPrefix() {
|
||||
return term;
|
||||
|
@ -75,6 +205,25 @@ public class PrefixQuery extends AutomatonQuery {
|
|||
return buffer.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visit(QueryVisitor visitor) {
|
||||
// build lazily. There are many cases that do not actually use automaton, so we can
|
||||
// often avoid building it.
|
||||
if (visitor.acceptField(field)) {
|
||||
visitor.consumeTermsMatching(
|
||||
this,
|
||||
field,
|
||||
() -> {
|
||||
ByteRunAutomaton ret = runAutomaton[0];
|
||||
if (ret == null) {
|
||||
ret = new CompiledAutomaton(toAutomaton(this.prefix), false, true, true).runAutomaton;
|
||||
runAutomaton[0] = ret;
|
||||
}
|
||||
return ret;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
|
|
Loading…
Reference in New Issue