LUCENE-9062: QueryVisitor.consumeTermsMatching (#1037)

This commit adds a consumeTermsMatching() method to QueryVisitor, allowing
queries that match against a class of terms to report this back to the visitor. It also
changes highlighting code to use this new method, replacing the current implementation
via instanceof checks.
This commit is contained in:
Alan Woodward 2019-11-27 16:28:19 +00:00 committed by GitHub
parent 47a908a0b9
commit bed694ec88
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 240 additions and 148 deletions

View File

@ -92,6 +92,10 @@ Improvements
* LUCENE-9036: ExitableDirectoryReader may interrupt scanning over DocValues (Mikhail Khludnev) * LUCENE-9036: ExitableDirectoryReader may interrupt scanning over DocValues (Mikhail Khludnev)
* LUCENE-9062: QueryVisitor now has a consumeTermsMatching() method, allowing queries
that match a class of terms to pass a ByteRunAutomaton matching that class
back to the visitor. (Alan Woodward, David Smiley)
Optimizations Optimizations
* LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits * LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits

View File

@ -162,8 +162,8 @@ public class AutomatonQuery extends MultiTermQuery implements Accountable {
@Override @Override
public void visit(QueryVisitor visitor) { public void visit(QueryVisitor visitor) {
if (visitor.acceptField(getField())) { if (visitor.acceptField(field)) {
visitor.visitLeaf(this); compiled.visit(visitor, this, field);
} }
} }

View File

@ -25,7 +25,9 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata; import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
/** Implements the fuzzy search query. The similarity measurement /** Implements the fuzzy search query. The similarity measurement
* is based on the Damerau-Levenshtein (optimal string alignment) algorithm, * is based on the Damerau-Levenshtein (optimal string alignment) algorithm,
@ -156,9 +158,14 @@ public class FuzzyQuery extends MultiTermQuery {
@Override @Override
public void visit(QueryVisitor visitor) { public void visit(QueryVisitor visitor) {
// TODO find some way of consuming Automata if (visitor.acceptField(field)) {
if (visitor.acceptField(term.field())) { if (maxEdits == 0 || prefixLength >= term.text().length()) {
visitor.visitLeaf(this); visitor.consumeTerms(this, term);
} else {
// Note: we're rebuilding the automaton here, so this can be expensive
visitor.consumeTermsMatching(this, field,
new ByteRunAutomaton(toAutomaton(), false, Operations.DEFAULT_MAX_DETERMINIZED_STATES));
}
} }
} }

View File

@ -21,6 +21,7 @@ import java.util.Arrays;
import java.util.Set; import java.util.Set;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
/** /**
* Allows recursion through a query tree * Allows recursion through a query tree
@ -37,8 +38,18 @@ public abstract class QueryVisitor {
*/ */
public void consumeTerms(Query query, Term... terms) { } public void consumeTerms(Query query, Term... terms) { }
// TODO it would be nice to have a way to consume 'classes' of Terms from /**
// things like AutomatonQuery * Called by leaf queries that match on a class of terms
*
* @param query the leaf query
* @param field the field queried against
* @param automaton an automaton defining which terms match
*
* @lucene.experimental
*/
public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
visitLeaf(query); // default impl for backward compatibility
}
/** /**
* Called by leaf queries that do not match on terms * Called by leaf queries that do not match on terms

View File

@ -16,7 +16,6 @@
*/ */
package org.apache.lucene.util.automaton; package org.apache.lucene.util.automaton;
/** /**
* Automaton representation for matching UTF-8 byte[]. * Automaton representation for matching UTF-8 byte[].
*/ */

View File

@ -22,8 +22,11 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.lucene.index.SingleTermsEnum; import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.BytesRefBuilder;
@ -344,6 +347,27 @@ public class CompiledAutomaton implements Accountable {
} }
} }
/**
 * Tells a {@link QueryVisitor} which terms this compiled automaton matches.
 *
 * <p>Nothing is reported unless the visitor accepts {@code field}. Otherwise the
 * callback depends on the automaton type: {@code SINGLE} reports the single term via
 * {@code consumeTerms}; {@code NORMAL} passes this instance's run automaton to
 * {@code consumeTermsMatching}; {@code ALL} passes a freshly built any-string
 * automaton to {@code consumeTermsMatching}; {@code NONE} reports nothing.
 *
 * @param visitor the visitor to report to
 * @param parent the query that is reported to the visitor as the source of the terms
 * @param field the field the terms are matched against
 */
public void visit(QueryVisitor visitor, Query parent, String field) {
  if (visitor.acceptField(field) == false) {
    return;
  }
  switch (type) {
    case SINGLE:
      visitor.consumeTerms(parent, new Term(field, term));
      return;
    case ALL:
      visitor.consumeTermsMatching(parent, field, new ByteRunAutomaton(Automata.makeAnyString()));
      return;
    case NORMAL:
      visitor.consumeTermsMatching(parent, field, runAutomaton);
      return;
    case NONE:
      // nothing matches, so there is nothing to report
      return;
  }
}
/** Finds largest term accepted by this Automaton, that's /** Finds largest term accepted by this Automaton, that's
* <= the provided input term. The result is placed in * <= the provided input term. The result is placed in
* output; it's fine for output and input to point to * output; it's fine for output and input to point to

View File

@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
 * A predicate over character arrays.
 *
 * @lucene.internal
 */
public interface CharArrayMatcher {

  /**
   * Tests a region of a character array.
   *
   * @param s the character array
   * @param offset where the region to test starts in {@code s}
   * @param length how many characters the region to test spans
   * @return {@code true} if the passed-in characters match
   */
  boolean match(char[] s, int offset, int length);

  /**
   * Tests a {@link CharsRef} by forwarding its {@code chars}, {@code offset} and
   * {@code length} to {@link #match(char[], int, int)}.
   *
   * @return {@code true} if the passed-in CharsRef matches
   */
  default boolean match(CharsRef chars) {
    final char[] buffer = chars.chars;
    return match(buffer, chars.offset, chars.length);
  }

  /**
   * Builds a matcher backed by a {@link CharacterRunAutomaton} over the string union of
   * the given terms.
   *
   * <p>NOTE(review): {@code Automata.makeStringUnion} presumably places requirements on
   * its input (e.g. ordering) - confirm callers satisfy them.
   *
   * @param terms the terms handed to {@code Automata.makeStringUnion}
   */
  static CharArrayMatcher fromTerms(List<BytesRef> terms) {
    final CharacterRunAutomaton union = new CharacterRunAutomaton(Automata.makeStringUnion(terms));
    return (s, offset, length) -> union.run(s, offset, length);
  }

}

View File

@ -31,7 +31,6 @@ import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/** /**
* Ultimately returns an {@link OffsetsEnum} yielding potentially highlightable words in the text. Needs * Ultimately returns an {@link OffsetsEnum} yielding potentially highlightable words in the text. Needs
@ -168,7 +167,7 @@ public abstract class FieldOffsetStrategy {
} }
protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List<OffsetsEnum> results) throws IOException { protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List<OffsetsEnum> results) throws IOException {
final CharacterRunAutomaton[] automata = components.getAutomata(); final LabelledCharArrayMatcher[] automata = components.getAutomata();
List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length); List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
for (int i = 0; i < automata.length; i++) { for (int i = 0; i < automata.length; i++) {
automataPostings.add(new ArrayList<>()); automataPostings.add(new ArrayList<>());
@ -180,9 +179,9 @@ public abstract class FieldOffsetStrategy {
CharsRefBuilder refBuilder = new CharsRefBuilder(); CharsRefBuilder refBuilder = new CharsRefBuilder();
while ((term = termsEnum.next()) != null) { while ((term = termsEnum.next()) != null) {
for (int i = 0; i < automata.length; i++) { for (int i = 0; i < automata.length; i++) {
CharacterRunAutomaton automaton = automata[i]; CharArrayMatcher automaton = automata[i];
refBuilder.copyUTF8Bytes(term); refBuilder.copyUTF8Bytes(term);
if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) { if (automaton.match(refBuilder.get())) {
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS); PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
if (doc == postings.advance(doc)) { if (doc == postings.advance(doc)) {
automataPostings.get(i).add(postings); automataPostings.get(i).add(postings);
@ -192,13 +191,13 @@ public abstract class FieldOffsetStrategy {
} }
for (int i = 0; i < automata.length; i++) { for (int i = 0; i < automata.length; i++) {
CharacterRunAutomaton automaton = automata[i]; LabelledCharArrayMatcher automaton = automata[i];
List<PostingsEnum> postingsEnums = automataPostings.get(i); List<PostingsEnum> postingsEnums = automataPostings.get(i);
if (postingsEnums.isEmpty()) { if (postingsEnums.isEmpty()) {
continue; continue;
} }
// Build one OffsetsEnum exposing the automata.toString as the term, and the sum of freq // Build one OffsetsEnum exposing the automaton label as the term, and the sum of freq
BytesRef wildcardTerm = new BytesRef(automaton.toString()); BytesRef wildcardTerm = new BytesRef(automaton.getLabel());
int sumFreq = 0; int sumFreq = 0;
for (PostingsEnum postingsEnum : postingsEnums) { for (PostingsEnum postingsEnum : postingsEnums) {
sumFreq += postingsEnum.freq(); sumFreq += postingsEnum.freq();

View File

@ -0,0 +1,88 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
/**
 * A {@link CharArrayMatcher} that also carries a label, so that different sources of
 * terms can be told apart in highlighting.
 *
 * @lucene.internal
 */
public interface LabelledCharArrayMatcher extends CharArrayMatcher {

  /**
   * @return the label for this matcher
   */
  String getLabel();

  /**
   * Attaches a label to an existing matcher. The returned matcher forwards every
   * {@code match} call to {@code in} and returns {@code label} from {@link #getLabel()}.
   */
  static LabelledCharArrayMatcher wrap(String label, CharArrayMatcher in) {
    return new LabelledCharArrayMatcher() {
      @Override
      public boolean match(char[] s, int offset, int length) {
        return in.match(s, offset, length);
      }

      @Override
      public String getLabel() {
        return label;
      }
    };
  }

  /**
   * Adapts a byte-oriented {@link ByteRunAutomaton} so that it can be run against
   * {@code char[]} input, and attaches a label to it.
   *
   * <p>The characters are converted to UTF-8 on the fly and fed to the automaton byte by
   * byte, starting from state 0. A step result of {@code -1} is treated as "no match".
   */
  static LabelledCharArrayMatcher wrap(String label, ByteRunAutomaton runAutomaton) {
    final CharArrayMatcher utf8Stepper = (chars, offset, length) -> {
      final int end = offset + length;
      int state = 0;
      for (int pos = offset; pos < end; pos++) {
        final int ch = chars[pos];
        if (ch >= 0x800) {
          // Anything at or above 0x800: stop encoding by hand and let UnicodeUtil convert
          // the whole remainder of the input in one go, then feed those bytes through.
          final int remaining = end - pos;
          final byte[] encoded = new byte[4 * remaining];
          final int encodedLen = UnicodeUtil.UTF16toUTF8(chars, pos, remaining, encoded);
          for (int k = 0; k < encodedLen; k++) {
            state = runAutomaton.step(state, encoded[k] & 0xFF);
            if (state == -1) {
              return false;
            }
          }
          // the rest of the input has been consumed above
          break;
        }
        if (ch >= 0x80) {
          // two-byte UTF-8 sequence (inlined logic from UnicodeUtil.UTF16toUTF8): lead byte first
          state = runAutomaton.step(state, 0xC0 | (ch >> 6));
          if (state == -1) {
            return false;
          }
          // ... then the continuation byte
          state = runAutomaton.step(state, 0x80 | (ch & 0x3F));
        } else {
          // below 0x80 the char is stepped as a single byte, unchanged
          state = runAutomaton.step(state, ch);
        }
        if (state == -1) {
          return false;
        }
      }
      return runAutomaton.isAccept(state);
    };
    return wrap(label, utf8Stepper);
  }

}

View File

@ -29,8 +29,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.memory.MemoryIndex; import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/** /**
@ -42,7 +40,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
private final MemoryIndex memoryIndex; private final MemoryIndex memoryIndex;
private final LeafReader memIndexLeafReader; private final LeafReader memIndexLeafReader;
private final CharacterRunAutomaton preMemIndexFilterAutomaton; private final CharArrayMatcher preMemIndexFilterAutomaton;
public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer) { public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer) {
super(components, analyzer); super(components, analyzer);
@ -54,17 +52,17 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
} }
/** /**
* Build one {@link CharacterRunAutomaton} matching any term the query might match. * Build one {@link CharArrayMatcher} matching any term the query might match.
*/ */
private static CharacterRunAutomaton buildCombinedAutomaton(UHComponents components) { private static CharArrayMatcher buildCombinedAutomaton(UHComponents components) {
// We don't know enough about the query to do this confidently // We don't know enough about the query to do this confidently
if (components.getTerms() == null || components.getAutomata() == null) { if (components.getTerms() == null || components.getAutomata() == null) {
return null; return null;
} }
List<CharacterRunAutomaton> allAutomata = new ArrayList<>(); List<CharArrayMatcher> allAutomata = new ArrayList<>();
if (components.getTerms().length > 0) { if (components.getTerms().length > 0) {
allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(components.getTerms())))); allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms())));
} }
Collections.addAll(allAutomata, components.getAutomata()); Collections.addAll(allAutomata, components.getAutomata());
for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) { for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) {
@ -75,20 +73,18 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
if (allAutomata.size() == 1) { if (allAutomata.size() == 1) {
return allAutomata.get(0); return allAutomata.get(0);
} }
//TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we //TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
// could union them all. But it's not exposed, and sometimes the automaton is byte (not char) oriented // could union them all. But it's not exposed, and sometimes the automaton is byte (not char) oriented
// Return an aggregate CharacterRunAutomaton of others // Return an aggregate CharArrayMatcher of others
return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used return (chars, offset, length) -> {
@Override for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
public boolean run(char[] chars, int offset, int length) { if (allAutomata.get(i).match(chars, offset, length)) {
for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation return true;
if (allAutomata.get(i).run(chars, offset, length)) {
return true;
}
} }
return false;
} }
return false;
}; };
} }
@ -118,14 +114,14 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
} }
private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream, private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
final CharacterRunAutomaton charRunAutomaton) { final CharArrayMatcher matcher) {
// it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface? // it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
return new FilteringTokenFilter(tokenStream) { return new FilteringTokenFilter(tokenStream) {
final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class); final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
@Override @Override
protected boolean accept() throws IOException { protected boolean accept() throws IOException {
return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length()); return matcher.match(charAtt.buffer(), 0, charAtt.length());
} }
}; };
} }

View File

@ -26,12 +26,7 @@ import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
/** /**
* Support for highlighting multi-term queries. * Support for highlighting multi-term queries.
@ -46,11 +41,10 @@ final class MultiTermHighlighting {
* Extracts MultiTermQueries that match the provided field predicate. * Extracts MultiTermQueries that match the provided field predicate.
* Returns equivalent automata that will match terms. * Returns equivalent automata that will match terms.
*/ */
static CharacterRunAutomaton[] extractAutomata(Query query, Predicate<String> fieldMatcher, boolean lookInSpan) { static LabelledCharArrayMatcher[] extractAutomata(Query query, Predicate<String> fieldMatcher, boolean lookInSpan) {
AutomataCollector collector = new AutomataCollector(lookInSpan, fieldMatcher); AutomataCollector collector = new AutomataCollector(lookInSpan, fieldMatcher);
query.visit(collector); query.visit(collector);
return collector.runAutomata.toArray(new CharacterRunAutomaton[0]); return collector.runAutomata.toArray(new LabelledCharArrayMatcher[0]);
} }
/** /**
@ -63,7 +57,7 @@ final class MultiTermHighlighting {
private static class AutomataCollector extends QueryVisitor { private static class AutomataCollector extends QueryVisitor {
List<CharacterRunAutomaton> runAutomata = new ArrayList<>(); List<LabelledCharArrayMatcher> runAutomata = new ArrayList<>();
final boolean lookInSpan; final boolean lookInSpan;
final Predicate<String> fieldMatcher; final Predicate<String> fieldMatcher;
@ -86,85 +80,10 @@ final class MultiTermHighlighting {
} }
@Override @Override
public void visitLeaf(Query query) { public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
if (query instanceof AutomatonQuery) { runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton));
AutomatonQuery aq = (AutomatonQuery) query;
if (aq.isAutomatonBinary() == false) {
// WildcardQuery, RegexpQuery
runAutomata.add(new CharacterRunAutomaton(aq.getAutomaton()) {
@Override
public String toString() {
return query.toString();
}
});
}
else {
runAutomata.add(binaryToCharRunAutomaton(aq.getAutomaton(), query.toString()));
}
}
else if (query instanceof FuzzyQuery) {
FuzzyQuery fq = (FuzzyQuery) query;
if (fq.getMaxEdits() == 0 || fq.getPrefixLength() >= fq.getTerm().text().length()) {
consumeTerms(query, fq.getTerm());
}
else {
runAutomata.add(new CharacterRunAutomaton(fq.toAutomaton()){
@Override
public String toString() {
return query.toString();
}
});
}
}
} }
} }
private static CharacterRunAutomaton binaryToCharRunAutomaton(Automaton binaryAutomaton, String description) {
return new CharacterRunAutomaton(Automata.makeEmpty()) { // empty here is bogus just to satisfy API
// TODO can we get access to the aq.compiledAutomaton.runAutomaton ?
ByteRunAutomaton byteRunAutomaton =
new ByteRunAutomaton(binaryAutomaton, true, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
@Override
public String toString() {
return description;
}
@Override
public boolean run(char[] chars, int offset, int length) {
int state = 0;
final int maxIdx = offset + length;
for (int i = offset; i < maxIdx; i++) {
final int code = chars[i];
int b;
// UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8 )
if (code < 0x80) {
state = byteRunAutomaton.step(state, code);
if (state == -1) return false;
} else if (code < 0x800) {
b = (0xC0 | (code >> 6));
state = byteRunAutomaton.step(state, b);
if (state == -1) return false;
b = (0x80 | (code & 0x3F));
state = byteRunAutomaton.step(state, b);
if (state == -1) return false;
} else {
// more complex
byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
state = byteRunAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
if (state == -1) return false;
}
break;
}
}
return byteRunAutomaton.isAccept(state);
}
};
}
} }

View File

@ -22,7 +22,6 @@ import java.util.Collections;
import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/** /**
* Never returns offsets. Used when the query would highlight nothing. * Never returns offsets. Used when the query would highlight nothing.
@ -34,7 +33,8 @@ public class NoOpOffsetStrategy extends FieldOffsetStrategy {
public static final NoOpOffsetStrategy INSTANCE = new NoOpOffsetStrategy(); public static final NoOpOffsetStrategy INSTANCE = new NoOpOffsetStrategy();
private NoOpOffsetStrategy() { private NoOpOffsetStrategy() {
super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(), new BytesRef[0], PhraseHelper.NONE, new CharacterRunAutomaton[0], false, Collections.emptySet())); super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(),
new BytesRef[0], PhraseHelper.NONE, new LabelledCharArrayMatcher[0], false, Collections.emptySet()));
} }
@Override @Override

View File

@ -34,28 +34,24 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
*/ */
public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy { public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
private final CharacterRunAutomaton[] combinedAutomata; private final CharArrayMatcher[] combinedAutomata;
public TokenStreamOffsetStrategy(UHComponents components, Analyzer indexAnalyzer) { public TokenStreamOffsetStrategy(UHComponents components, Analyzer indexAnalyzer) {
super(components, indexAnalyzer); super(components, indexAnalyzer);
assert components.getPhraseHelper().hasPositionSensitivity() == false; assert components.getPhraseHelper().hasPositionSensitivity() == false;
combinedAutomata = convertTermsToAutomata(components.getTerms(), components.getAutomata()); combinedAutomata = convertTermsToMatchers(components.getTerms(), components.getAutomata());
} }
//TODO this is inefficient; instead build a union automata just for terms part. //TODO this is inefficient; instead build a union automata just for terms part.
private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) { private static CharArrayMatcher[] convertTermsToMatchers(BytesRef[] terms, CharArrayMatcher[] matchers) {
CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length]; CharArrayMatcher[] newAutomata = new CharArrayMatcher[terms.length + matchers.length];
for (int i = 0; i < terms.length; i++) { for (int i = 0; i < terms.length; i++) {
String termString = terms[i].utf8ToString(); String termString = terms[i].utf8ToString();
newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) { CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeString(termString));
@Override newAutomata[i] = LabelledCharArrayMatcher.wrap(termString, a::run);
public String toString() {
return termString;
}
};
} }
// Append existing automata (that which is used for MTQs) // Append existing automata (that which is used for MTQs)
System.arraycopy(automata, 0, newAutomata, terms.length, automata.length); System.arraycopy(matchers, 0, newAutomata, terms.length, matchers.length);
return newAutomata; return newAutomata;
} }
@ -66,7 +62,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
private static class TokenStreamOffsetsEnum extends OffsetsEnum { private static class TokenStreamOffsetsEnum extends OffsetsEnum {
TokenStream stream; // becomes null when closed TokenStream stream; // becomes null when closed
final CharacterRunAutomaton[] matchers; final CharArrayMatcher[] matchers;
final CharTermAttribute charTermAtt; final CharTermAttribute charTermAtt;
final OffsetAttribute offsetAtt; final OffsetAttribute offsetAtt;
@ -74,7 +70,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
final BytesRef matchDescriptions[]; final BytesRef matchDescriptions[];
TokenStreamOffsetsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException { TokenStreamOffsetsEnum(TokenStream ts, CharArrayMatcher[] matchers) throws IOException {
this.stream = ts; this.stream = ts;
this.matchers = matchers; this.matchers = matchers;
matchDescriptions = new BytesRef[matchers.length]; matchDescriptions = new BytesRef[matchers.length];
@ -88,7 +84,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
if (stream != null) { if (stream != null) {
while (stream.incrementToken()) { while (stream.incrementToken()) {
for (int i = 0; i < matchers.length; i++) { for (int i = 0; i < matchers.length; i++) {
if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) { if (matchers[i].match(charTermAtt.buffer(), 0, charTermAtt.length())) {
currentMatch = i; currentMatch = i;
return true; return true;
} }

View File

@ -22,7 +22,6 @@ import java.util.function.Predicate;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/** /**
* A parameter object to hold the components a {@link FieldOffsetStrategy} needs. * A parameter object to hold the components a {@link FieldOffsetStrategy} needs.
@ -35,12 +34,12 @@ public class UHComponents {
private final Query query; private final Query query;
private final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive) private final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive)
private final PhraseHelper phraseHelper; // Query: position-sensitive information private final PhraseHelper phraseHelper; // Query: position-sensitive information
private final CharacterRunAutomaton[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive private final LabelledCharArrayMatcher[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive
private final boolean hasUnrecognizedQueryPart; // Query: if part of the query (other than the extracted terms / automata) is a leaf we don't know private final boolean hasUnrecognizedQueryPart; // Query: if part of the query (other than the extracted terms / automata) is a leaf we don't know
private final Set<UnifiedHighlighter.HighlightFlag> highlightFlags; private final Set<UnifiedHighlighter.HighlightFlag> highlightFlags;
public UHComponents(String field, Predicate<String> fieldMatcher, Query query, public UHComponents(String field, Predicate<String> fieldMatcher, Query query,
BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, BytesRef[] terms, PhraseHelper phraseHelper, LabelledCharArrayMatcher[] automata,
boolean hasUnrecognizedQueryPart, Set<UnifiedHighlighter.HighlightFlag> highlightFlags) { boolean hasUnrecognizedQueryPart, Set<UnifiedHighlighter.HighlightFlag> highlightFlags) {
this.field = field; this.field = field;
this.fieldMatcher = fieldMatcher; this.fieldMatcher = fieldMatcher;
@ -72,7 +71,7 @@ public class UHComponents {
return phraseHelper; return phraseHelper;
} }
public CharacterRunAutomaton[] getAutomata() { public LabelledCharArrayMatcher[] getAutomata() {
return automata; return automata;
} }

View File

@ -62,7 +62,6 @@ import org.apache.lucene.search.Weight;
import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter; import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/** /**
* A Highlighter that can get offsets from either * A Highlighter that can get offsets from either
@ -110,7 +109,7 @@ public class UnifiedHighlighter {
} }
} }
protected static final CharacterRunAutomaton[] ZERO_LEN_AUTOMATA_ARRAY = new CharacterRunAutomaton[0]; protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0];
protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher
@ -769,7 +768,7 @@ public class UnifiedHighlighter {
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags); PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
boolean queryHasUnrecognizedPart = hasUnrecognizedQuery(fieldMatcher, query); boolean queryHasUnrecognizedPart = hasUnrecognizedQuery(fieldMatcher, query);
BytesRef[] terms = null; BytesRef[] terms = null;
CharacterRunAutomaton[] automata = null; LabelledCharArrayMatcher[] automata = null;
if (!highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES) || !queryHasUnrecognizedPart) { if (!highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES) || !queryHasUnrecognizedPart) {
terms = filterExtractedTerms(fieldMatcher, allTerms); terms = filterExtractedTerms(fieldMatcher, allTerms);
automata = getAutomata(field, query, highlightFlags); automata = getAutomata(field, query, highlightFlags);
@ -839,7 +838,7 @@ public class UnifiedHighlighter {
: PhraseHelper.NONE; : PhraseHelper.NONE;
} }
protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) { protected LabelledCharArrayMatcher[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
// do we "eagerly" look in span queries for automata here, or do we not and let PhraseHelper handle those? // do we "eagerly" look in span queries for automata here, or do we not and let PhraseHelper handle those?
// if don't highlight phrases strictly, // if don't highlight phrases strictly,
final boolean lookInSpan = final boolean lookInSpan =

View File

@ -36,6 +36,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.uhighlight.FieldHighlighter; import org.apache.lucene.search.uhighlight.FieldHighlighter;
import org.apache.lucene.search.uhighlight.FieldOffsetStrategy; import org.apache.lucene.search.uhighlight.FieldOffsetStrategy;
import org.apache.lucene.search.uhighlight.LabelledCharArrayMatcher;
import org.apache.lucene.search.uhighlight.OffsetsEnum; import org.apache.lucene.search.uhighlight.OffsetsEnum;
import org.apache.lucene.search.uhighlight.Passage; import org.apache.lucene.search.uhighlight.Passage;
import org.apache.lucene.search.uhighlight.PassageFormatter; import org.apache.lucene.search.uhighlight.PassageFormatter;
@ -46,7 +47,6 @@ import org.apache.lucene.search.uhighlight.UHComponents;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter; import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.junit.Test; import org.junit.Test;
/** /**
@ -65,7 +65,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
(s) -> false, (s) -> false,
new MatchAllDocsQuery(), new BytesRef[0], new MatchAllDocsQuery(), new BytesRef[0],
PhraseHelper.NONE, PhraseHelper.NONE,
new CharacterRunAutomaton[0], false, Collections.emptySet())) { new LabelledCharArrayMatcher[0], false, Collections.emptySet())) {
@Override @Override
public UnifiedHighlighter.OffsetSource getOffsetSource() { public UnifiedHighlighter.OffsetSource getOffsetSource() {
return offsetSource; return offsetSource;
@ -180,7 +180,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms); BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
Set<HighlightFlag> highlightFlags = getFlags(field); Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags); PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags); LabelledCharArrayMatcher[] automata = getAutomata(field, query, highlightFlags);
boolean queryHasUnrecognizedPart = false; boolean queryHasUnrecognizedPart = false;
return new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, queryHasUnrecognizedPart, highlightFlags); return new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, queryHasUnrecognizedPart, highlightFlags);
} }

View File

@ -96,7 +96,7 @@ class MultiTermIntervalsSource extends IntervalsSource {
@Override @Override
public void visit(String field, QueryVisitor visitor) { public void visit(String field, QueryVisitor visitor) {
visitor.visitLeaf(new IntervalQuery(field, this)); automaton.visit(visitor, new IntervalQuery(field, this), field);
} }
@Override @Override