LUCENE-9062: QueryVisitor.consumeTermsMatching (#1037)

This commit adds a consumeTermsMatching() method to QueryVisitor, allowing
queries that match against a class of terms to report this back to the visitor. It also
changes highlighting code to use this new method, replacing the current implementation
via instanceof checks.
This commit is contained in:
Alan Woodward 2019-11-27 16:28:19 +00:00
parent 2144bc9b4e
commit e681f9dca4
17 changed files with 240 additions and 148 deletions

View File

@ -28,6 +28,10 @@ Improvements
* LUCENE-9036: ExitableDirectoryReader may interrupt scanning over DocValues (Mikhail Khludnev)
* LUCENE-9062: QueryVisitor now has a consumeTermsMatching() method, allowing queries
that match a class of terms to pass a ByteRunAutomaton matching that class
back to the visitor. (Alan Woodward, David Smiley)
Optimizations
* LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits

View File

@ -162,8 +162,8 @@ public class AutomatonQuery extends MultiTermQuery implements Accountable {
@Override
public void visit(QueryVisitor visitor) {
if (visitor.acceptField(getField())) {
visitor.visitLeaf(this);
if (visitor.acceptField(field)) {
compiled.visit(visitor, this, field);
}
}

View File

@ -25,7 +25,9 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
/** Implements the fuzzy search query. The similarity measurement
* is based on the Damerau-Levenshtein (optimal string alignment) algorithm,
@ -156,9 +158,14 @@ public class FuzzyQuery extends MultiTermQuery {
@Override
public void visit(QueryVisitor visitor) {
// TODO find some way of consuming Automata
if (visitor.acceptField(term.field())) {
visitor.visitLeaf(this);
if (visitor.acceptField(field)) {
if (maxEdits == 0 || prefixLength >= term.text().length()) {
visitor.consumeTerms(this, term);
} else {
// Note: we're rebuilding the automaton here, so this can be expensive
visitor.consumeTermsMatching(this, field,
new ByteRunAutomaton(toAutomaton(), false, Operations.DEFAULT_MAX_DETERMINIZED_STATES));
}
}
}

View File

@ -21,6 +21,7 @@ import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
/**
* Allows recursion through a query tree
@ -37,8 +38,18 @@ public abstract class QueryVisitor {
*/
public void consumeTerms(Query query, Term... terms) { }
// TODO it would be nice to have a way to consume 'classes' of Terms from
// things like AutomatonQuery
/**
* Called by leaf queries that match on a class of terms
*
* The default implementation does not look at {@code field} or {@code automaton}; it
* only forwards {@code query} to {@link #visitLeaf(Query)}. Visitors that want to
* inspect the matching terms should override this method.
*
* @param query the leaf query
* @param field the field queried against
* @param automaton an automaton defining which terms match
*
* @lucene.experimental
*/
public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
visitLeaf(query); // default impl for backward compatibility
}
/**
* Called by leaf queries that do not match on terms

View File

@ -16,7 +16,6 @@
*/
package org.apache.lucene.util.automaton;
/**
* Automaton representation for matching UTF-8 byte[].
*/

View File

@ -16,14 +16,17 @@
*/
package org.apache.lucene.util.automaton;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
@ -344,6 +347,27 @@ public class CompiledAutomaton implements Accountable {
}
}
/**
 * Tells a {@link QueryVisitor} which terms this compiled automaton matches.
 *
 * Nothing is reported when the visitor does not accept {@code field}. Otherwise the
 * callback depends on the automaton type:
 * <ul>
 *   <li>SINGLE: {@code consumeTerms} is called with one term built from {@code field} and {@code term}</li>
 *   <li>NORMAL: {@code consumeTermsMatching} is called with this instance's {@code runAutomaton}</li>
 *   <li>ALL: {@code consumeTermsMatching} is called with a run automaton built from
 *       {@code Automata.makeAnyString()}</li>
 *   <li>NONE: the visitor is not called</li>
 * </ul>
 *
 * @param visitor the visitor to report to
 * @param parent  the query passed back to the visitor as the source of the terms
 * @param field   the field the terms belong to
 */
public void visit(QueryVisitor visitor, Query parent, String field) {
  if (visitor.acceptField(field) == false) {
    return;
  }
  switch (type) {
    case SINGLE:
      visitor.consumeTerms(parent, new Term(field, term));
      return;
    case NORMAL:
      visitor.consumeTermsMatching(parent, field, runAutomaton);
      return;
    case ALL:
      ByteRunAutomaton matchEverything = new ByteRunAutomaton(Automata.makeAnyString());
      visitor.consumeTermsMatching(parent, field, matchEverything);
      return;
    case NONE:
      // matches no terms, so there is nothing to report
      return;
  }
}
/** Finds largest term accepted by this Automaton, that's
* <= the provided input term. The result is placed in
* output; it's fine for output and input to point to

View File

@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
 * Decides whether a run of characters is a match
 *
 * @lucene.internal
 */
public interface CharArrayMatcher {

  /**
   * Return {@code true} if the {@code length} characters of {@code s}
   * beginning at {@code offset} match
   */
  boolean match(char[] s, int offset, int length);

  /**
   * Return {@code true} if the passed-in CharsRef matches; delegates to
   * {@link #match(char[], int, int)} using the ref's array, offset and length
   */
  default boolean match(CharsRef chars) {
    final char[] buffer = chars.chars;
    final int start = chars.offset;
    final int count = chars.length;
    return match(buffer, start, count);
  }

  /**
   * Builds a matcher backed by a {@link CharacterRunAutomaton} over the string
   * union of the given terms.
   *
   * NOTE(review): Automata.makeStringUnion presumably requires the terms to be
   * in sorted order - confirm against its documentation before passing unsorted input.
   */
  static CharArrayMatcher fromTerms(List<BytesRef> terms) {
    final CharacterRunAutomaton union = new CharacterRunAutomaton(Automata.makeStringUnion(terms));
    return (s, offset, length) -> union.run(s, offset, length);
  }

}

View File

@ -31,7 +31,6 @@ import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* Ultimately returns an {@link OffsetsEnum} yielding potentially highlightable words in the text. Needs
@ -168,7 +167,7 @@ public abstract class FieldOffsetStrategy {
}
protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List<OffsetsEnum> results) throws IOException {
final CharacterRunAutomaton[] automata = components.getAutomata();
final LabelledCharArrayMatcher[] automata = components.getAutomata();
List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
for (int i = 0; i < automata.length; i++) {
automataPostings.add(new ArrayList<>());
@ -180,9 +179,9 @@ public abstract class FieldOffsetStrategy {
CharsRefBuilder refBuilder = new CharsRefBuilder();
while ((term = termsEnum.next()) != null) {
for (int i = 0; i < automata.length; i++) {
CharacterRunAutomaton automaton = automata[i];
CharArrayMatcher automaton = automata[i];
refBuilder.copyUTF8Bytes(term);
if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
if (automaton.match(refBuilder.get())) {
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
if (doc == postings.advance(doc)) {
automataPostings.get(i).add(postings);
@ -192,13 +191,13 @@ public abstract class FieldOffsetStrategy {
}
for (int i = 0; i < automata.length; i++) {
CharacterRunAutomaton automaton = automata[i];
LabelledCharArrayMatcher automaton = automata[i];
List<PostingsEnum> postingsEnums = automataPostings.get(i);
if (postingsEnums.isEmpty()) {
continue;
}
// Build one OffsetsEnum exposing the automata.toString as the term, and the sum of freq
BytesRef wildcardTerm = new BytesRef(automaton.toString());
// Build one OffsetsEnum exposing the automaton label as the term, and the sum of freq
BytesRef wildcardTerm = new BytesRef(automaton.getLabel());
int sumFreq = 0;
for (PostingsEnum postingsEnum : postingsEnums) {
sumFreq += postingsEnum.freq();

View File

@ -0,0 +1,88 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
/**
 * A {@link CharArrayMatcher} that carries a label, so that different sources
 * of terms can be told apart in highlighting
 *
 * @lucene.internal
 */
public interface LabelledCharArrayMatcher extends CharArrayMatcher {

  /**
   * @return the label for this matcher
   */
  String getLabel();

  /**
   * Attaches a label to an existing CharArrayMatcher; matching is delegated to {@code in}
   */
  static LabelledCharArrayMatcher wrap(String label, CharArrayMatcher in) {
    return new LabelledCharArrayMatcher() {
      @Override
      public boolean match(char[] s, int offset, int length) {
        return in.match(s, offset, length);
      }

      @Override
      public String getLabel() {
        return label;
      }
    };
  }

  /**
   * Adapts a byte-oriented run automaton so that it can be fed char[] input: the
   * characters are converted to UTF-8 on the fly and the automaton is stepped
   * one byte at a time
   */
  static LabelledCharArrayMatcher wrap(String label, ByteRunAutomaton runAutomaton) {
    final CharArrayMatcher utf8Stepper = (chars, offset, length) -> {
      final int end = offset + length;
      int state = 0;
      int pos = offset;
      while (pos < end) {
        final int code = chars[pos];
        if (code >= 0x800) {
          // not a one- or two-byte encoding: convert everything that is left
          // with UnicodeUtil, feed it through the automaton, and stop scanning
          final int remaining = end - pos;
          final byte[] utf8Bytes = new byte[4 * remaining];
          final int utf8Len = UnicodeUtil.UTF16toUTF8(chars, pos, remaining, utf8Bytes);
          for (int k = 0; k < utf8Len; k++) {
            state = runAutomaton.step(state, utf8Bytes[k] & 0xFF);
            if (state == -1) {
              return false;
            }
          }
          break;
        }
        if (code < 0x80) {
          // single byte
          state = runAutomaton.step(state, code);
          if (state == -1) {
            return false;
          }
        } else {
          // two bytes: lead byte followed by one continuation byte
          state = runAutomaton.step(state, 0xC0 | (code >> 6));
          if (state == -1) {
            return false;
          }
          state = runAutomaton.step(state, 0x80 | (code & 0x3F));
          if (state == -1) {
            return false;
          }
        }
        pos++;
      }
      return runAutomaton.isAccept(state);
    };
    return wrap(label, utf8Stepper);
  }

}

View File

@ -29,8 +29,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
@ -42,7 +40,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
private final MemoryIndex memoryIndex;
private final LeafReader memIndexLeafReader;
private final CharacterRunAutomaton preMemIndexFilterAutomaton;
private final CharArrayMatcher preMemIndexFilterAutomaton;
public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer) {
super(components, analyzer);
@ -54,17 +52,17 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
}
/**
* Build one {@link CharacterRunAutomaton} matching any term the query might match.
* Build one {@link CharArrayMatcher} matching any term the query might match.
*/
private static CharacterRunAutomaton buildCombinedAutomaton(UHComponents components) {
private static CharArrayMatcher buildCombinedAutomaton(UHComponents components) {
// We don't know enough about the query to do this confidently
if (components.getTerms() == null || components.getAutomata() == null) {
return null;
}
List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
List<CharArrayMatcher> allAutomata = new ArrayList<>();
if (components.getTerms().length > 0) {
allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(components.getTerms()))));
allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms())));
}
Collections.addAll(allAutomata, components.getAutomata());
for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) {
@ -75,20 +73,18 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
if (allAutomata.size() == 1) {
return allAutomata.get(0);
}
//TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
// could union them all. But it's not exposed, and sometimes the automaton is byte (not char) oriented
// Return an aggregate CharacterRunAutomaton of others
return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
@Override
public boolean run(char[] chars, int offset, int length) {
for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
if (allAutomata.get(i).run(chars, offset, length)) {
return true;
}
// Return an aggregate CharArrayMatcher of others
return (chars, offset, length) -> {
for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
if (allAutomata.get(i).match(chars, offset, length)) {
return true;
}
return false;
}
return false;
};
}
@ -118,14 +114,14 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
}
private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
final CharacterRunAutomaton charRunAutomaton) {
final CharArrayMatcher matcher) {
// it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
return new FilteringTokenFilter(tokenStream) {
final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
@Override
protected boolean accept() throws IOException {
return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
return matcher.match(charAtt.buffer(), 0, charAtt.length());
}
};
}

View File

@ -26,12 +26,7 @@ import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
/**
* Support for highlighting multi-term queries.
@ -46,11 +41,10 @@ final class MultiTermHighlighting {
* Extracts MultiTermQueries that match the provided field predicate.
* Returns equivalent automata that will match terms.
*/
static CharacterRunAutomaton[] extractAutomata(Query query, Predicate<String> fieldMatcher, boolean lookInSpan) {
static LabelledCharArrayMatcher[] extractAutomata(Query query, Predicate<String> fieldMatcher, boolean lookInSpan) {
AutomataCollector collector = new AutomataCollector(lookInSpan, fieldMatcher);
query.visit(collector);
return collector.runAutomata.toArray(new CharacterRunAutomaton[0]);
return collector.runAutomata.toArray(new LabelledCharArrayMatcher[0]);
}
/**
@ -63,7 +57,7 @@ final class MultiTermHighlighting {
private static class AutomataCollector extends QueryVisitor {
List<CharacterRunAutomaton> runAutomata = new ArrayList<>();
List<LabelledCharArrayMatcher> runAutomata = new ArrayList<>();
final boolean lookInSpan;
final Predicate<String> fieldMatcher;
@ -86,85 +80,10 @@ final class MultiTermHighlighting {
}
@Override
public void visitLeaf(Query query) {
if (query instanceof AutomatonQuery) {
AutomatonQuery aq = (AutomatonQuery) query;
if (aq.isAutomatonBinary() == false) {
// WildcardQuery, RegexpQuery
runAutomata.add(new CharacterRunAutomaton(aq.getAutomaton()) {
@Override
public String toString() {
return query.toString();
}
});
}
else {
runAutomata.add(binaryToCharRunAutomaton(aq.getAutomaton(), query.toString()));
}
}
else if (query instanceof FuzzyQuery) {
FuzzyQuery fq = (FuzzyQuery) query;
if (fq.getMaxEdits() == 0 || fq.getPrefixLength() >= fq.getTerm().text().length()) {
consumeTerms(query, fq.getTerm());
}
else {
runAutomata.add(new CharacterRunAutomaton(fq.toAutomaton()){
@Override
public String toString() {
return query.toString();
}
});
}
}
public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton));
}
}
private static CharacterRunAutomaton binaryToCharRunAutomaton(Automaton binaryAutomaton, String description) {
return new CharacterRunAutomaton(Automata.makeEmpty()) { // empty here is bogus just to satisfy API
// TODO can we get access to the aq.compiledAutomaton.runAutomaton ?
ByteRunAutomaton byteRunAutomaton =
new ByteRunAutomaton(binaryAutomaton, true, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
@Override
public String toString() {
return description;
}
@Override
public boolean run(char[] chars, int offset, int length) {
int state = 0;
final int maxIdx = offset + length;
for (int i = offset; i < maxIdx; i++) {
final int code = chars[i];
int b;
// UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8 )
if (code < 0x80) {
state = byteRunAutomaton.step(state, code);
if (state == -1) return false;
} else if (code < 0x800) {
b = (0xC0 | (code >> 6));
state = byteRunAutomaton.step(state, b);
if (state == -1) return false;
b = (0x80 | (code & 0x3F));
state = byteRunAutomaton.step(state, b);
if (state == -1) return false;
} else {
// more complex
byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
state = byteRunAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
if (state == -1) return false;
}
break;
}
}
return byteRunAutomaton.isAccept(state);
}
};
}
}

View File

@ -22,7 +22,6 @@ import java.util.Collections;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* Never returns offsets. Used when the query would highlight nothing.
@ -34,7 +33,8 @@ public class NoOpOffsetStrategy extends FieldOffsetStrategy {
public static final NoOpOffsetStrategy INSTANCE = new NoOpOffsetStrategy();
private NoOpOffsetStrategy() {
super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(), new BytesRef[0], PhraseHelper.NONE, new CharacterRunAutomaton[0], false, Collections.emptySet()));
super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(),
new BytesRef[0], PhraseHelper.NONE, new LabelledCharArrayMatcher[0], false, Collections.emptySet()));
}
@Override

View File

@ -34,28 +34,24 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
*/
public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
private final CharacterRunAutomaton[] combinedAutomata;
private final CharArrayMatcher[] combinedAutomata;
public TokenStreamOffsetStrategy(UHComponents components, Analyzer indexAnalyzer) {
super(components, indexAnalyzer);
assert components.getPhraseHelper().hasPositionSensitivity() == false;
combinedAutomata = convertTermsToAutomata(components.getTerms(), components.getAutomata());
combinedAutomata = convertTermsToMatchers(components.getTerms(), components.getAutomata());
}
//TODO this is inefficient; instead build a union automata just for terms part.
private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
private static CharArrayMatcher[] convertTermsToMatchers(BytesRef[] terms, CharArrayMatcher[] matchers) {
CharArrayMatcher[] newAutomata = new CharArrayMatcher[terms.length + matchers.length];
for (int i = 0; i < terms.length; i++) {
String termString = terms[i].utf8ToString();
newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) {
@Override
public String toString() {
return termString;
}
};
CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeString(termString));
newAutomata[i] = LabelledCharArrayMatcher.wrap(termString, a::run);
}
// Append existing automata (that which is used for MTQs)
System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
System.arraycopy(matchers, 0, newAutomata, terms.length, matchers.length);
return newAutomata;
}
@ -66,7 +62,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
private static class TokenStreamOffsetsEnum extends OffsetsEnum {
TokenStream stream; // becomes null when closed
final CharacterRunAutomaton[] matchers;
final CharArrayMatcher[] matchers;
final CharTermAttribute charTermAtt;
final OffsetAttribute offsetAtt;
@ -74,7 +70,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
final BytesRef matchDescriptions[];
TokenStreamOffsetsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
TokenStreamOffsetsEnum(TokenStream ts, CharArrayMatcher[] matchers) throws IOException {
this.stream = ts;
this.matchers = matchers;
matchDescriptions = new BytesRef[matchers.length];
@ -88,7 +84,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
if (stream != null) {
while (stream.incrementToken()) {
for (int i = 0; i < matchers.length; i++) {
if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
if (matchers[i].match(charTermAtt.buffer(), 0, charTermAtt.length())) {
currentMatch = i;
return true;
}

View File

@ -22,7 +22,6 @@ import java.util.function.Predicate;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* A parameter object to hold the components a {@link FieldOffsetStrategy} needs.
@ -35,12 +34,12 @@ public class UHComponents {
private final Query query;
private final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive)
private final PhraseHelper phraseHelper; // Query: position-sensitive information
private final CharacterRunAutomaton[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive
private final LabelledCharArrayMatcher[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive
private final boolean hasUnrecognizedQueryPart; // Query: if part of the query (other than the extracted terms / automata) is a leaf we don't know
private final Set<UnifiedHighlighter.HighlightFlag> highlightFlags;
public UHComponents(String field, Predicate<String> fieldMatcher, Query query,
BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata,
BytesRef[] terms, PhraseHelper phraseHelper, LabelledCharArrayMatcher[] automata,
boolean hasUnrecognizedQueryPart, Set<UnifiedHighlighter.HighlightFlag> highlightFlags) {
this.field = field;
this.fieldMatcher = fieldMatcher;
@ -72,7 +71,7 @@ public class UHComponents {
return phraseHelper;
}
public CharacterRunAutomaton[] getAutomata() {
public LabelledCharArrayMatcher[] getAutomata() {
return automata;
}

View File

@ -63,7 +63,6 @@ import org.apache.lucene.search.Weight;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* A Highlighter that can get offsets from either
@ -111,7 +110,7 @@ public class UnifiedHighlighter {
}
}
protected static final CharacterRunAutomaton[] ZERO_LEN_AUTOMATA_ARRAY = new CharacterRunAutomaton[0];
protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0];
protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher
@ -770,7 +769,7 @@ public class UnifiedHighlighter {
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
boolean queryHasUnrecognizedPart = hasUnrecognizedQuery(fieldMatcher, query);
BytesRef[] terms = null;
CharacterRunAutomaton[] automata = null;
LabelledCharArrayMatcher[] automata = null;
if (!highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES) || !queryHasUnrecognizedPart) {
terms = filterExtractedTerms(fieldMatcher, allTerms);
automata = getAutomata(field, query, highlightFlags);
@ -840,7 +839,7 @@ public class UnifiedHighlighter {
: PhraseHelper.NONE;
}
protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
protected LabelledCharArrayMatcher[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
// do we "eagerly" look in span queries for automata here, or do we not and let PhraseHelper handle those?
// if don't highlight phrases strictly,
final boolean lookInSpan =

View File

@ -36,6 +36,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.uhighlight.FieldHighlighter;
import org.apache.lucene.search.uhighlight.FieldOffsetStrategy;
import org.apache.lucene.search.uhighlight.LabelledCharArrayMatcher;
import org.apache.lucene.search.uhighlight.OffsetsEnum;
import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.search.uhighlight.PassageFormatter;
@ -46,7 +47,6 @@ import org.apache.lucene.search.uhighlight.UHComponents;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.junit.Test;
/**
@ -65,7 +65,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
(s) -> false,
new MatchAllDocsQuery(), new BytesRef[0],
PhraseHelper.NONE,
new CharacterRunAutomaton[0], false, Collections.emptySet())) {
new LabelledCharArrayMatcher[0], false, Collections.emptySet())) {
@Override
public UnifiedHighlighter.OffsetSource getOffsetSource() {
return offsetSource;
@ -180,7 +180,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
LabelledCharArrayMatcher[] automata = getAutomata(field, query, highlightFlags);
boolean queryHasUnrecognizedPart = false;
return new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, queryHasUnrecognizedPart, highlightFlags);
}

View File

@ -96,7 +96,7 @@ class MultiTermIntervalsSource extends IntervalsSource {
@Override
public void visit(String field, QueryVisitor visitor) {
visitor.visitLeaf(new IntervalQuery(field, this));
automaton.visit(visitor, new IntervalQuery(field, this), field);
}
@Override