Use Automaton for SurroundQuery prefix/pattern matching (#12999)

This commit is contained in:
Michael Gibney 2024-01-10 13:14:21 -05:00 committed by GitHub
parent d7a14257ce
commit 89a02fa4e3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 31 additions and 104 deletions

View File

@@ -189,6 +189,8 @@ Improvements
* GITHUB#12910: Refactor around NeighborArray to make it more self-contained. (Patrick Zhai)
* GITHUB#12999: Use Automaton for SurroundQuery prefix/pattern matching (Michael Gibney)
Optimizations
---------------------

View File

@@ -1,7 +1,7 @@
{
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/ParseException.java": "d8b3e605b4bfb01697df5ce246e84fa2b691fb4f",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.java": "e79256ffc3859ac60deca6957ce742c13c1e5649",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.jj": "21b38627431747c741e2ec24be1e7aef38dc70c9",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.java": "f07a1c6a54c544a01c1ba19dd468c2c0a86cb9d8",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.jj": "2a288b7c933ab757c781890c41bea5e5c4fa3b49",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParserConstants.java": "8feb77878890c27e874be457d839eba48192c40f",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParserTokenManager.java": "959523aec4e49f9665e39f16e1da335aab3632d1",
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/Token.java": "a5eea2a3043e0aa2781f4a43b9ab9c5d59add80e",

View File

@@ -151,7 +151,7 @@ import org.apache.lucene.queryparser.charstream.FastCharStream;
}
protected SrndQuery getTruncQuery(String truncated) {
return new SrndTruncQuery(truncated, TRUNCATOR, ANY_CHAR);
return new SrndTruncQuery(truncated);
}
final public SrndQuery TopSrndQuery() throws ParseException {SrndQuery q;

View File

@@ -179,7 +179,7 @@ public class QueryParser {
}
protected SrndQuery getTruncQuery(String truncated) {
return new SrndTruncQuery(truncated, TRUNCATOR, ANY_CHAR);
return new SrndTruncQuery(truncated);
}
}

View File

@@ -22,17 +22,19 @@ import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.CompiledAutomaton;
/** Query that matches String prefixes */
public class SrndPrefixQuery extends SimpleTerm {
private final BytesRef prefixRef;
private final CompiledAutomaton compiled;
public SrndPrefixQuery(String prefix, boolean quoted, char truncator) {
super(quoted);
this.prefix = prefix;
prefixRef = new BytesRef(prefix);
compiled =
new CompiledAutomaton(PrefixQuery.toAutomaton(new BytesRef(prefix)), true, true, true);
this.truncator = truncator;
}
@@ -48,10 +50,6 @@ public class SrndPrefixQuery extends SimpleTerm {
return truncator;
}
public Term getLucenePrefixTerm(String fieldName) {
return new Term(fieldName, getPrefix());
}
@Override
public String toStringUnquoted() {
return getPrefix();
@@ -65,35 +63,13 @@ public class SrndPrefixQuery extends SimpleTerm {
@Override
public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
throws IOException {
/* inspired by PrefixQuery.rewrite(): */
Terms terms = MultiTerms.getTerms(reader, fieldName);
if (terms != null) {
TermsEnum termsEnum = terms.iterator();
TermsEnum termsEnum = compiled.getTermsEnum(terms);
boolean skip = false;
TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getPrefix()));
if (status == TermsEnum.SeekStatus.FOUND) {
mtv.visitMatchingTerm(getLucenePrefixTerm(fieldName));
} else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
if (StringHelper.startsWith(termsEnum.term(), prefixRef)) {
mtv.visitMatchingTerm(new Term(fieldName, termsEnum.term().utf8ToString()));
} else {
skip = true;
}
} else {
// EOF
skip = true;
}
if (!skip) {
while (true) {
BytesRef text = termsEnum.next();
if (text != null && StringHelper.startsWith(text, prefixRef)) {
mtv.visitMatchingTerm(new Term(fieldName, text.utf8ToString()));
} else {
break;
}
}
BytesRef br;
while ((br = termsEnum.next()) != null) {
mtv.visitMatchingTerm(new Term(fieldName, BytesRef.deepCopyOf(br)));
}
}
}

View File

@@ -17,33 +17,32 @@
package org.apache.lucene.queryparser.surround.query;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Operations;
/** Query that matches wildcards */
public class SrndTruncQuery extends SimpleTerm {
public SrndTruncQuery(String truncated, char unlimited, char mask) {
public SrndTruncQuery(String truncated) {
super(false); /* not quoted */
this.truncated = truncated;
this.unlimited = unlimited;
this.mask = mask;
truncatedToPrefixAndPattern();
compiled =
new CompiledAutomaton(
WildcardQuery.toAutomaton(
new Term(null, truncated), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT),
false,
true,
true);
}
private final String truncated;
private final char unlimited;
private final char mask;
private String prefix;
private BytesRef prefixRef;
private Pattern pattern;
private final CompiledAutomaton compiled;
public String getTruncated() {
return truncated;
@@ -54,66 +53,16 @@ public class SrndTruncQuery extends SimpleTerm {
return getTruncated();
}
protected boolean matchingChar(char c) {
return (c != unlimited) && (c != mask);
}
protected void appendRegExpForChar(char c, StringBuilder re) {
if (c == unlimited) re.append(".*");
else if (c == mask) re.append(".");
else re.append(c);
}
protected void truncatedToPrefixAndPattern() {
int i = 0;
while ((i < truncated.length()) && matchingChar(truncated.charAt(i))) {
i++;
}
prefix = truncated.substring(0, i);
prefixRef = new BytesRef(prefix);
StringBuilder re = new StringBuilder();
while (i < truncated.length()) {
appendRegExpForChar(truncated.charAt(i), re);
i++;
}
pattern = Pattern.compile(re.toString());
}
@Override
public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
throws IOException {
int prefixLength = prefix.length();
Terms terms = MultiTerms.getTerms(reader, fieldName);
if (terms != null) {
Matcher matcher = pattern.matcher("");
try {
TermsEnum termsEnum = terms.iterator();
TermsEnum termsEnum = compiled.getTermsEnum(terms);
TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
BytesRef text;
if (status == TermsEnum.SeekStatus.FOUND) {
text = prefixRef;
} else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
text = termsEnum.term();
} else {
text = null;
}
while (text != null) {
if (StringHelper.startsWith(text, prefixRef)) {
String textString = text.utf8ToString();
matcher.reset(textString.substring(prefixLength));
if (matcher.matches()) {
mtv.visitMatchingTerm(new Term(fieldName, textString));
}
} else {
break;
}
text = termsEnum.next();
}
} finally {
matcher.reset();
BytesRef br;
while ((br = termsEnum.next()) != null) {
mtv.visitMatchingTerm(new Term(fieldName, BytesRef.deepCopyOf(br)));
}
}
}