mirror of
https://github.com/apache/lucene.git
synced 2025-03-07 00:39:21 +00:00
Use Automaton for SurroundQuery prefix/pattern matching (#12999)
This commit is contained in:
parent
d7a14257ce
commit
89a02fa4e3
@ -189,6 +189,8 @@ Improvements
|
||||
|
||||
* GITHUB#12910: Refactor around NeighborArray to make it more self-contained. (Patrick Zhai)
|
||||
|
||||
* GITHUB#12999: Use Automaton for SurroundQuery prefix/pattern matching (Michael Gibney)
|
||||
|
||||
Optimizations
|
||||
---------------------
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
{
|
||||
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/ParseException.java": "d8b3e605b4bfb01697df5ce246e84fa2b691fb4f",
|
||||
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.java": "e79256ffc3859ac60deca6957ce742c13c1e5649",
|
||||
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.jj": "21b38627431747c741e2ec24be1e7aef38dc70c9",
|
||||
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.java": "f07a1c6a54c544a01c1ba19dd468c2c0a86cb9d8",
|
||||
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParser.jj": "2a288b7c933ab757c781890c41bea5e5c4fa3b49",
|
||||
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParserConstants.java": "8feb77878890c27e874be457d839eba48192c40f",
|
||||
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/QueryParserTokenManager.java": "959523aec4e49f9665e39f16e1da335aab3632d1",
|
||||
"lucene/queryparser/src/java/org/apache/lucene/queryparser/surround/parser/Token.java": "a5eea2a3043e0aa2781f4a43b9ab9c5d59add80e",
|
||||
|
@ -151,7 +151,7 @@ import org.apache.lucene.queryparser.charstream.FastCharStream;
|
||||
}
|
||||
|
||||
protected SrndQuery getTruncQuery(String truncated) {
|
||||
return new SrndTruncQuery(truncated, TRUNCATOR, ANY_CHAR);
|
||||
return new SrndTruncQuery(truncated);
|
||||
}
|
||||
|
||||
final public SrndQuery TopSrndQuery() throws ParseException {SrndQuery q;
|
||||
|
@ -179,7 +179,7 @@ public class QueryParser {
|
||||
}
|
||||
|
||||
protected SrndQuery getTruncQuery(String truncated) {
|
||||
return new SrndTruncQuery(truncated, TRUNCATOR, ANY_CHAR);
|
||||
return new SrndTruncQuery(truncated);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -22,17 +22,19 @@ import org.apache.lucene.index.MultiTerms;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
|
||||
/** Query that matches String prefixes */
|
||||
public class SrndPrefixQuery extends SimpleTerm {
|
||||
private final BytesRef prefixRef;
|
||||
private final CompiledAutomaton compiled;
|
||||
|
||||
public SrndPrefixQuery(String prefix, boolean quoted, char truncator) {
|
||||
super(quoted);
|
||||
this.prefix = prefix;
|
||||
prefixRef = new BytesRef(prefix);
|
||||
compiled =
|
||||
new CompiledAutomaton(PrefixQuery.toAutomaton(new BytesRef(prefix)), true, true, true);
|
||||
this.truncator = truncator;
|
||||
}
|
||||
|
||||
@ -48,10 +50,6 @@ public class SrndPrefixQuery extends SimpleTerm {
|
||||
return truncator;
|
||||
}
|
||||
|
||||
public Term getLucenePrefixTerm(String fieldName) {
|
||||
return new Term(fieldName, getPrefix());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toStringUnquoted() {
|
||||
return getPrefix();
|
||||
@ -65,35 +63,13 @@ public class SrndPrefixQuery extends SimpleTerm {
|
||||
@Override
|
||||
public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
|
||||
throws IOException {
|
||||
/* inspired by PrefixQuery.rewrite(): */
|
||||
Terms terms = MultiTerms.getTerms(reader, fieldName);
|
||||
if (terms != null) {
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
TermsEnum termsEnum = compiled.getTermsEnum(terms);
|
||||
|
||||
boolean skip = false;
|
||||
TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getPrefix()));
|
||||
if (status == TermsEnum.SeekStatus.FOUND) {
|
||||
mtv.visitMatchingTerm(getLucenePrefixTerm(fieldName));
|
||||
} else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
|
||||
if (StringHelper.startsWith(termsEnum.term(), prefixRef)) {
|
||||
mtv.visitMatchingTerm(new Term(fieldName, termsEnum.term().utf8ToString()));
|
||||
} else {
|
||||
skip = true;
|
||||
}
|
||||
} else {
|
||||
// EOF
|
||||
skip = true;
|
||||
}
|
||||
|
||||
if (!skip) {
|
||||
while (true) {
|
||||
BytesRef text = termsEnum.next();
|
||||
if (text != null && StringHelper.startsWith(text, prefixRef)) {
|
||||
mtv.visitMatchingTerm(new Term(fieldName, text.utf8ToString()));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
BytesRef br;
|
||||
while ((br = termsEnum.next()) != null) {
|
||||
mtv.visitMatchingTerm(new Term(fieldName, BytesRef.deepCopyOf(br)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -17,33 +17,32 @@
|
||||
package org.apache.lucene.queryparser.surround.query;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiTerms;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
|
||||
/** Query that matches wildcards */
|
||||
public class SrndTruncQuery extends SimpleTerm {
|
||||
public SrndTruncQuery(String truncated, char unlimited, char mask) {
|
||||
public SrndTruncQuery(String truncated) {
|
||||
super(false); /* not quoted */
|
||||
this.truncated = truncated;
|
||||
this.unlimited = unlimited;
|
||||
this.mask = mask;
|
||||
truncatedToPrefixAndPattern();
|
||||
compiled =
|
||||
new CompiledAutomaton(
|
||||
WildcardQuery.toAutomaton(
|
||||
new Term(null, truncated), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT),
|
||||
false,
|
||||
true,
|
||||
true);
|
||||
}
|
||||
|
||||
private final String truncated;
|
||||
private final char unlimited;
|
||||
private final char mask;
|
||||
|
||||
private String prefix;
|
||||
private BytesRef prefixRef;
|
||||
private Pattern pattern;
|
||||
private final CompiledAutomaton compiled;
|
||||
|
||||
public String getTruncated() {
|
||||
return truncated;
|
||||
@ -54,66 +53,16 @@ public class SrndTruncQuery extends SimpleTerm {
|
||||
return getTruncated();
|
||||
}
|
||||
|
||||
protected boolean matchingChar(char c) {
|
||||
return (c != unlimited) && (c != mask);
|
||||
}
|
||||
|
||||
protected void appendRegExpForChar(char c, StringBuilder re) {
|
||||
if (c == unlimited) re.append(".*");
|
||||
else if (c == mask) re.append(".");
|
||||
else re.append(c);
|
||||
}
|
||||
|
||||
protected void truncatedToPrefixAndPattern() {
|
||||
int i = 0;
|
||||
while ((i < truncated.length()) && matchingChar(truncated.charAt(i))) {
|
||||
i++;
|
||||
}
|
||||
prefix = truncated.substring(0, i);
|
||||
prefixRef = new BytesRef(prefix);
|
||||
|
||||
StringBuilder re = new StringBuilder();
|
||||
while (i < truncated.length()) {
|
||||
appendRegExpForChar(truncated.charAt(i), re);
|
||||
i++;
|
||||
}
|
||||
pattern = Pattern.compile(re.toString());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
|
||||
throws IOException {
|
||||
int prefixLength = prefix.length();
|
||||
Terms terms = MultiTerms.getTerms(reader, fieldName);
|
||||
if (terms != null) {
|
||||
Matcher matcher = pattern.matcher("");
|
||||
try {
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
TermsEnum termsEnum = compiled.getTermsEnum(terms);
|
||||
|
||||
TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
|
||||
BytesRef text;
|
||||
if (status == TermsEnum.SeekStatus.FOUND) {
|
||||
text = prefixRef;
|
||||
} else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
|
||||
text = termsEnum.term();
|
||||
} else {
|
||||
text = null;
|
||||
}
|
||||
|
||||
while (text != null) {
|
||||
if (StringHelper.startsWith(text, prefixRef)) {
|
||||
String textString = text.utf8ToString();
|
||||
matcher.reset(textString.substring(prefixLength));
|
||||
if (matcher.matches()) {
|
||||
mtv.visitMatchingTerm(new Term(fieldName, textString));
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
text = termsEnum.next();
|
||||
}
|
||||
} finally {
|
||||
matcher.reset();
|
||||
BytesRef br;
|
||||
while ((br = termsEnum.next()) != null) {
|
||||
mtv.visitMatchingTerm(new Term(fieldName, BytesRef.deepCopyOf(br)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user