mirror of https://github.com/apache/lucene.git
LUCENE-9062: QueryVisitor.consumeTermsMatching (#1037)
This commit adds a consumeTermsMatching() method to QueryVisitor, allowing queries that match against a class of terms to report this back to the visitor. It also changes highlighting code to use this new method, replacing the current implementation via instanceof checks.
This commit is contained in:
parent
47a908a0b9
commit
bed694ec88
|
@ -92,6 +92,10 @@ Improvements
|
|||
|
||||
* LUCENE-9036: ExitableDirectoryReader may interrupt scanning over DocValues (Mikhail Khludnev)
|
||||
|
||||
* LUCENE-9062: QueryVisitor now has a consumeTermsMatching() method, allowing queries
|
||||
that match a class of terms to pass a ByteRunAutomaton matching those terms
|
||||
back to the visitor. (Alan Woodward, David Smiley)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-8928: When building a kd-tree for dimensions n > 2, compute exact bounds for an inner node every N splits
|
||||
|
|
|
@ -162,8 +162,8 @@ public class AutomatonQuery extends MultiTermQuery implements Accountable {
|
|||
|
||||
@Override
|
||||
public void visit(QueryVisitor visitor) {
|
||||
if (visitor.acceptField(getField())) {
|
||||
visitor.visitLeaf(this);
|
||||
if (visitor.acceptField(field)) {
|
||||
compiled.visit(visitor, this, field);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -25,7 +25,9 @@ import org.apache.lucene.index.Terms;
|
|||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
|
||||
/** Implements the fuzzy search query. The similarity measurement
|
||||
* is based on the Damerau-Levenshtein (optimal string alignment) algorithm,
|
||||
|
@ -156,9 +158,14 @@ public class FuzzyQuery extends MultiTermQuery {
|
|||
|
||||
@Override
|
||||
public void visit(QueryVisitor visitor) {
|
||||
// TODO find some way of consuming Automata
|
||||
if (visitor.acceptField(term.field())) {
|
||||
visitor.visitLeaf(this);
|
||||
if (visitor.acceptField(field)) {
|
||||
if (maxEdits == 0 || prefixLength >= term.text().length()) {
|
||||
visitor.consumeTerms(this, term);
|
||||
} else {
|
||||
// Note: we're rebuilding the automaton here, so this can be expensive
|
||||
visitor.consumeTermsMatching(this, field,
|
||||
new ByteRunAutomaton(toAutomaton(), false, Operations.DEFAULT_MAX_DETERMINIZED_STATES));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.util.Arrays;
|
|||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
|
||||
/**
|
||||
* Allows recursion through a query tree
|
||||
|
@ -37,8 +38,18 @@ public abstract class QueryVisitor {
|
|||
*/
|
||||
public void consumeTerms(Query query, Term... terms) { }
|
||||
|
||||
// TODO it would be nice to have a way to consume 'classes' of Terms from
|
||||
// things like AutomatonQuery
|
||||
/**
|
||||
* Called by leaf queries that match on a class of terms
|
||||
*
|
||||
* @param query the leaf query
|
||||
* @param field the field queried against
|
||||
* @param automaton an automaton defining which terms match
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
|
||||
visitLeaf(query); // default impl for backward compatibility
|
||||
}
|
||||
|
||||
/**
|
||||
* Called by leaf queries that do not match on terms
|
||||
|
|
|
@ -16,7 +16,6 @@
|
|||
*/
|
||||
package org.apache.lucene.util.automaton;
|
||||
|
||||
|
||||
/**
|
||||
* Automaton representation for matching UTF-8 byte[].
|
||||
*/
|
||||
|
|
|
@ -22,8 +22,11 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.SingleTermsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.QueryVisitor;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
|
@ -344,6 +347,27 @@ public class CompiledAutomaton implements Accountable {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Report back to a QueryVisitor how this automaton matches terms
|
||||
*/
|
||||
public void visit(QueryVisitor visitor, Query parent, String field) {
|
||||
if (visitor.acceptField(field)) {
|
||||
switch (type) {
|
||||
case NORMAL:
|
||||
visitor.consumeTermsMatching(parent, field, runAutomaton);
|
||||
break;
|
||||
case NONE:
|
||||
break;
|
||||
case ALL:
|
||||
visitor.consumeTermsMatching(parent, field, new ByteRunAutomaton(Automata.makeAnyString()));
|
||||
break;
|
||||
case SINGLE:
|
||||
visitor.consumeTerms(parent, new Term(field, term));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Finds largest term accepted by this Automaton, that's
|
||||
* <= the provided input term. The result is placed in
|
||||
* output; it's fine for output and input to point to
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
/**
|
||||
* Matches a character array
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
public interface CharArrayMatcher {
|
||||
|
||||
/**
|
||||
* Return {@code true} if the passed-in character array matches
|
||||
*/
|
||||
boolean match(char[] s, int offset, int length);
|
||||
|
||||
/**
|
||||
* Return {@code true} if the passed-in CharsRef matches
|
||||
*/
|
||||
default boolean match(CharsRef chars) {
|
||||
return match(chars.chars, chars.offset, chars.length);
|
||||
}
|
||||
|
||||
static CharArrayMatcher fromTerms(List<BytesRef> terms) {
|
||||
CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeStringUnion(terms));
|
||||
return a::run;
|
||||
}
|
||||
|
||||
}
|
|
@ -31,7 +31,6 @@ import org.apache.lucene.search.MatchesIterator;
|
|||
import org.apache.lucene.search.ScoreMode;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
/**
|
||||
* Ultimately returns an {@link OffsetsEnum} yielding potentially highlightable words in the text. Needs
|
||||
|
@ -168,7 +167,7 @@ public abstract class FieldOffsetStrategy {
|
|||
}
|
||||
|
||||
protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List<OffsetsEnum> results) throws IOException {
|
||||
final CharacterRunAutomaton[] automata = components.getAutomata();
|
||||
final LabelledCharArrayMatcher[] automata = components.getAutomata();
|
||||
List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
|
||||
for (int i = 0; i < automata.length; i++) {
|
||||
automataPostings.add(new ArrayList<>());
|
||||
|
@ -180,9 +179,9 @@ public abstract class FieldOffsetStrategy {
|
|||
CharsRefBuilder refBuilder = new CharsRefBuilder();
|
||||
while ((term = termsEnum.next()) != null) {
|
||||
for (int i = 0; i < automata.length; i++) {
|
||||
CharacterRunAutomaton automaton = automata[i];
|
||||
CharArrayMatcher automaton = automata[i];
|
||||
refBuilder.copyUTF8Bytes(term);
|
||||
if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
|
||||
if (automaton.match(refBuilder.get())) {
|
||||
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
|
||||
if (doc == postings.advance(doc)) {
|
||||
automataPostings.get(i).add(postings);
|
||||
|
@ -192,13 +191,13 @@ public abstract class FieldOffsetStrategy {
|
|||
}
|
||||
|
||||
for (int i = 0; i < automata.length; i++) {
|
||||
CharacterRunAutomaton automaton = automata[i];
|
||||
LabelledCharArrayMatcher automaton = automata[i];
|
||||
List<PostingsEnum> postingsEnums = automataPostings.get(i);
|
||||
if (postingsEnums.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
// Build one OffsetsEnum exposing the automata.toString as the term, and the sum of freq
|
||||
BytesRef wildcardTerm = new BytesRef(automaton.toString());
|
||||
// Build one OffsetsEnum exposing the automaton label as the term, and the sum of freq
|
||||
BytesRef wildcardTerm = new BytesRef(automaton.getLabel());
|
||||
int sumFreq = 0;
|
||||
for (PostingsEnum postingsEnum : postingsEnums) {
|
||||
sumFreq += postingsEnum.freq();
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
|
||||
/**
|
||||
* Associates a label with a CharArrayMatcher to distinguish different sources for terms in highlighting
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
public interface LabelledCharArrayMatcher extends CharArrayMatcher {
|
||||
|
||||
/**
|
||||
* @return the label for this matcher
|
||||
*/
|
||||
String getLabel();
|
||||
|
||||
/**
|
||||
* Associates a label with a CharArrayMatcher
|
||||
*/
|
||||
static LabelledCharArrayMatcher wrap(String label, CharArrayMatcher in) {
|
||||
return new LabelledCharArrayMatcher() {
|
||||
@Override
|
||||
public String getLabel() {
|
||||
return label;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean match(char[] s, int offset, int length) {
|
||||
return in.match(s, offset, length);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a representation of the automaton that matches char[] instead of byte[]
|
||||
*/
|
||||
static LabelledCharArrayMatcher wrap(String label, ByteRunAutomaton runAutomaton) {
|
||||
return wrap(label, (chars, offset, length) -> {
|
||||
int state = 0;
|
||||
final int maxIdx = offset + length;
|
||||
for (int i = offset; i < maxIdx; i++) {
|
||||
final int code = chars[i];
|
||||
int b;
|
||||
// UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8 )
|
||||
if (code < 0x80) {
|
||||
state = runAutomaton.step(state, code);
|
||||
if (state == -1) return false;
|
||||
} else if (code < 0x800) {
|
||||
b = (0xC0 | (code >> 6));
|
||||
state = runAutomaton.step(state, b);
|
||||
if (state == -1) return false;
|
||||
b = (0x80 | (code & 0x3F));
|
||||
state = runAutomaton.step(state, b);
|
||||
if (state == -1) return false;
|
||||
} else {
|
||||
// more complex
|
||||
byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
|
||||
int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
|
||||
for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
|
||||
state = runAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
|
||||
if (state == -1) return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return runAutomaton.isAccept(state);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
|
@ -29,8 +29,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.memory.MemoryIndex;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -42,7 +40,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
|
||||
private final MemoryIndex memoryIndex;
|
||||
private final LeafReader memIndexLeafReader;
|
||||
private final CharacterRunAutomaton preMemIndexFilterAutomaton;
|
||||
private final CharArrayMatcher preMemIndexFilterAutomaton;
|
||||
|
||||
public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer) {
|
||||
super(components, analyzer);
|
||||
|
@ -54,17 +52,17 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
}
|
||||
|
||||
/**
|
||||
* Build one {@link CharacterRunAutomaton} matching any term the query might match.
|
||||
* Build one {@link CharArrayMatcher} matching any term the query might match.
|
||||
*/
|
||||
private static CharacterRunAutomaton buildCombinedAutomaton(UHComponents components) {
|
||||
private static CharArrayMatcher buildCombinedAutomaton(UHComponents components) {
|
||||
// We don't know enough about the query to do this confidently
|
||||
if (components.getTerms() == null || components.getAutomata() == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
|
||||
List<CharArrayMatcher> allAutomata = new ArrayList<>();
|
||||
if (components.getTerms().length > 0) {
|
||||
allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(components.getTerms()))));
|
||||
allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms())));
|
||||
}
|
||||
Collections.addAll(allAutomata, components.getAutomata());
|
||||
for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) {
|
||||
|
@ -75,20 +73,18 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
if (allAutomata.size() == 1) {
|
||||
return allAutomata.get(0);
|
||||
}
|
||||
|
||||
//TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
|
||||
// could union them all. But it's not exposed, and sometimes the automaton is byte (not char) oriented
|
||||
|
||||
// Return an aggregate CharacterRunAutomaton of others
|
||||
return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
|
||||
@Override
|
||||
public boolean run(char[] chars, int offset, int length) {
|
||||
for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
|
||||
if (allAutomata.get(i).run(chars, offset, length)) {
|
||||
return true;
|
||||
}
|
||||
// Return an aggregate CharArrayMatcher of others
|
||||
return (chars, offset, length) -> {
|
||||
for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
|
||||
if (allAutomata.get(i).match(chars, offset, length)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -118,14 +114,14 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
}
|
||||
|
||||
private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
|
||||
final CharacterRunAutomaton charRunAutomaton) {
|
||||
final CharArrayMatcher matcher) {
|
||||
// it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
|
||||
return new FilteringTokenFilter(tokenStream) {
|
||||
final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
@Override
|
||||
protected boolean accept() throws IOException {
|
||||
return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
|
||||
return matcher.match(charAtt.buffer(), 0, charAtt.length());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -26,12 +26,7 @@ import org.apache.lucene.search.FuzzyQuery;
|
|||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.QueryVisitor;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
|
||||
/**
|
||||
* Support for highlighting multi-term queries.
|
||||
|
@ -46,11 +41,10 @@ final class MultiTermHighlighting {
|
|||
* Extracts MultiTermQueries that match the provided field predicate.
|
||||
* Returns equivalent automata that will match terms.
|
||||
*/
|
||||
static CharacterRunAutomaton[] extractAutomata(Query query, Predicate<String> fieldMatcher, boolean lookInSpan) {
|
||||
|
||||
static LabelledCharArrayMatcher[] extractAutomata(Query query, Predicate<String> fieldMatcher, boolean lookInSpan) {
|
||||
AutomataCollector collector = new AutomataCollector(lookInSpan, fieldMatcher);
|
||||
query.visit(collector);
|
||||
return collector.runAutomata.toArray(new CharacterRunAutomaton[0]);
|
||||
return collector.runAutomata.toArray(new LabelledCharArrayMatcher[0]);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -63,7 +57,7 @@ final class MultiTermHighlighting {
|
|||
|
||||
private static class AutomataCollector extends QueryVisitor {
|
||||
|
||||
List<CharacterRunAutomaton> runAutomata = new ArrayList<>();
|
||||
List<LabelledCharArrayMatcher> runAutomata = new ArrayList<>();
|
||||
final boolean lookInSpan;
|
||||
final Predicate<String> fieldMatcher;
|
||||
|
||||
|
@ -86,85 +80,10 @@ final class MultiTermHighlighting {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void visitLeaf(Query query) {
|
||||
if (query instanceof AutomatonQuery) {
|
||||
AutomatonQuery aq = (AutomatonQuery) query;
|
||||
if (aq.isAutomatonBinary() == false) {
|
||||
// WildcardQuery, RegexpQuery
|
||||
runAutomata.add(new CharacterRunAutomaton(aq.getAutomaton()) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return query.toString();
|
||||
}
|
||||
});
|
||||
}
|
||||
else {
|
||||
runAutomata.add(binaryToCharRunAutomaton(aq.getAutomaton(), query.toString()));
|
||||
}
|
||||
}
|
||||
else if (query instanceof FuzzyQuery) {
|
||||
FuzzyQuery fq = (FuzzyQuery) query;
|
||||
if (fq.getMaxEdits() == 0 || fq.getPrefixLength() >= fq.getTerm().text().length()) {
|
||||
consumeTerms(query, fq.getTerm());
|
||||
}
|
||||
else {
|
||||
runAutomata.add(new CharacterRunAutomaton(fq.toAutomaton()){
|
||||
@Override
|
||||
public String toString() {
|
||||
return query.toString();
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
|
||||
runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static CharacterRunAutomaton binaryToCharRunAutomaton(Automaton binaryAutomaton, String description) {
|
||||
return new CharacterRunAutomaton(Automata.makeEmpty()) { // empty here is bogus just to satisfy API
|
||||
// TODO can we get access to the aq.compiledAutomaton.runAutomaton ?
|
||||
ByteRunAutomaton byteRunAutomaton =
|
||||
new ByteRunAutomaton(binaryAutomaton, true, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return description;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean run(char[] chars, int offset, int length) {
|
||||
int state = 0;
|
||||
final int maxIdx = offset + length;
|
||||
for (int i = offset; i < maxIdx; i++) {
|
||||
final int code = chars[i];
|
||||
int b;
|
||||
// UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8 )
|
||||
if (code < 0x80) {
|
||||
state = byteRunAutomaton.step(state, code);
|
||||
if (state == -1) return false;
|
||||
} else if (code < 0x800) {
|
||||
b = (0xC0 | (code >> 6));
|
||||
state = byteRunAutomaton.step(state, b);
|
||||
if (state == -1) return false;
|
||||
b = (0x80 | (code & 0x3F));
|
||||
state = byteRunAutomaton.step(state, b);
|
||||
if (state == -1) return false;
|
||||
} else {
|
||||
// more complex
|
||||
byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
|
||||
int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
|
||||
for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
|
||||
state = byteRunAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
|
||||
if (state == -1) return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return byteRunAutomaton.isAccept(state);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -22,7 +22,6 @@ import java.util.Collections;
|
|||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.search.MatchNoDocsQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
/**
|
||||
* Never returns offsets. Used when the query would highlight nothing.
|
||||
|
@ -34,7 +33,8 @@ public class NoOpOffsetStrategy extends FieldOffsetStrategy {
|
|||
public static final NoOpOffsetStrategy INSTANCE = new NoOpOffsetStrategy();
|
||||
|
||||
private NoOpOffsetStrategy() {
|
||||
super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(), new BytesRef[0], PhraseHelper.NONE, new CharacterRunAutomaton[0], false, Collections.emptySet()));
|
||||
super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(),
|
||||
new BytesRef[0], PhraseHelper.NONE, new LabelledCharArrayMatcher[0], false, Collections.emptySet()));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -34,28 +34,24 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
|||
*/
|
||||
public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
|
||||
|
||||
private final CharacterRunAutomaton[] combinedAutomata;
|
||||
private final CharArrayMatcher[] combinedAutomata;
|
||||
|
||||
public TokenStreamOffsetStrategy(UHComponents components, Analyzer indexAnalyzer) {
|
||||
super(components, indexAnalyzer);
|
||||
assert components.getPhraseHelper().hasPositionSensitivity() == false;
|
||||
combinedAutomata = convertTermsToAutomata(components.getTerms(), components.getAutomata());
|
||||
combinedAutomata = convertTermsToMatchers(components.getTerms(), components.getAutomata());
|
||||
}
|
||||
|
||||
//TODO this is inefficient; instead build a union automata just for terms part.
|
||||
private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
|
||||
CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
|
||||
private static CharArrayMatcher[] convertTermsToMatchers(BytesRef[] terms, CharArrayMatcher[] matchers) {
|
||||
CharArrayMatcher[] newAutomata = new CharArrayMatcher[terms.length + matchers.length];
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
String termString = terms[i].utf8ToString();
|
||||
newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return termString;
|
||||
}
|
||||
};
|
||||
CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeString(termString));
|
||||
newAutomata[i] = LabelledCharArrayMatcher.wrap(termString, a::run);
|
||||
}
|
||||
// Append existing automata (that which is used for MTQs)
|
||||
System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
|
||||
System.arraycopy(matchers, 0, newAutomata, terms.length, matchers.length);
|
||||
return newAutomata;
|
||||
}
|
||||
|
||||
|
@ -66,7 +62,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
|
||||
private static class TokenStreamOffsetsEnum extends OffsetsEnum {
|
||||
TokenStream stream; // becomes null when closed
|
||||
final CharacterRunAutomaton[] matchers;
|
||||
final CharArrayMatcher[] matchers;
|
||||
final CharTermAttribute charTermAtt;
|
||||
final OffsetAttribute offsetAtt;
|
||||
|
||||
|
@ -74,7 +70,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
|
||||
final BytesRef matchDescriptions[];
|
||||
|
||||
TokenStreamOffsetsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
|
||||
TokenStreamOffsetsEnum(TokenStream ts, CharArrayMatcher[] matchers) throws IOException {
|
||||
this.stream = ts;
|
||||
this.matchers = matchers;
|
||||
matchDescriptions = new BytesRef[matchers.length];
|
||||
|
@ -88,7 +84,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
if (stream != null) {
|
||||
while (stream.incrementToken()) {
|
||||
for (int i = 0; i < matchers.length; i++) {
|
||||
if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
|
||||
if (matchers[i].match(charTermAtt.buffer(), 0, charTermAtt.length())) {
|
||||
currentMatch = i;
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -22,7 +22,6 @@ import java.util.function.Predicate;
|
|||
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
/**
|
||||
* A parameter object to hold the components a {@link FieldOffsetStrategy} needs.
|
||||
|
@ -35,12 +34,12 @@ public class UHComponents {
|
|||
private final Query query;
|
||||
private final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive)
|
||||
private final PhraseHelper phraseHelper; // Query: position-sensitive information
|
||||
private final CharacterRunAutomaton[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive
|
||||
private final LabelledCharArrayMatcher[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive
|
||||
private final boolean hasUnrecognizedQueryPart; // Query: if part of the query (other than the extracted terms / automata) is a leaf we don't know
|
||||
private final Set<UnifiedHighlighter.HighlightFlag> highlightFlags;
|
||||
|
||||
public UHComponents(String field, Predicate<String> fieldMatcher, Query query,
|
||||
BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata,
|
||||
BytesRef[] terms, PhraseHelper phraseHelper, LabelledCharArrayMatcher[] automata,
|
||||
boolean hasUnrecognizedQueryPart, Set<UnifiedHighlighter.HighlightFlag> highlightFlags) {
|
||||
this.field = field;
|
||||
this.fieldMatcher = fieldMatcher;
|
||||
|
@ -72,7 +71,7 @@ public class UHComponents {
|
|||
return phraseHelper;
|
||||
}
|
||||
|
||||
public CharacterRunAutomaton[] getAutomata() {
|
||||
public LabelledCharArrayMatcher[] getAutomata() {
|
||||
return automata;
|
||||
}
|
||||
|
||||
|
|
|
@ -62,7 +62,6 @@ import org.apache.lucene.search.Weight;
|
|||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.InPlaceMergeSorter;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
|
||||
/**
|
||||
* A Highlighter that can get offsets from either
|
||||
|
@ -110,7 +109,7 @@ public class UnifiedHighlighter {
|
|||
}
|
||||
}
|
||||
|
||||
protected static final CharacterRunAutomaton[] ZERO_LEN_AUTOMATA_ARRAY = new CharacterRunAutomaton[0];
|
||||
protected static final LabelledCharArrayMatcher[] ZERO_LEN_AUTOMATA_ARRAY = new LabelledCharArrayMatcher[0];
|
||||
|
||||
protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher
|
||||
|
||||
|
@ -769,7 +768,7 @@ public class UnifiedHighlighter {
|
|||
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
|
||||
boolean queryHasUnrecognizedPart = hasUnrecognizedQuery(fieldMatcher, query);
|
||||
BytesRef[] terms = null;
|
||||
CharacterRunAutomaton[] automata = null;
|
||||
LabelledCharArrayMatcher[] automata = null;
|
||||
if (!highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES) || !queryHasUnrecognizedPart) {
|
||||
terms = filterExtractedTerms(fieldMatcher, allTerms);
|
||||
automata = getAutomata(field, query, highlightFlags);
|
||||
|
@ -839,7 +838,7 @@ public class UnifiedHighlighter {
|
|||
: PhraseHelper.NONE;
|
||||
}
|
||||
|
||||
protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
|
||||
protected LabelledCharArrayMatcher[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
|
||||
// do we "eagerly" look in span queries for automata here, or do we not and let PhraseHelper handle those?
|
||||
// if don't highlight phrases strictly,
|
||||
final boolean lookInSpan =
|
||||
|
|
|
@ -36,6 +36,7 @@ import org.apache.lucene.search.Query;
|
|||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.uhighlight.FieldHighlighter;
|
||||
import org.apache.lucene.search.uhighlight.FieldOffsetStrategy;
|
||||
import org.apache.lucene.search.uhighlight.LabelledCharArrayMatcher;
|
||||
import org.apache.lucene.search.uhighlight.OffsetsEnum;
|
||||
import org.apache.lucene.search.uhighlight.Passage;
|
||||
import org.apache.lucene.search.uhighlight.PassageFormatter;
|
||||
|
@ -46,7 +47,6 @@ import org.apache.lucene.search.uhighlight.UHComponents;
|
|||
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
|
@ -65,7 +65,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
|
|||
(s) -> false,
|
||||
new MatchAllDocsQuery(), new BytesRef[0],
|
||||
PhraseHelper.NONE,
|
||||
new CharacterRunAutomaton[0], false, Collections.emptySet())) {
|
||||
new LabelledCharArrayMatcher[0], false, Collections.emptySet())) {
|
||||
@Override
|
||||
public UnifiedHighlighter.OffsetSource getOffsetSource() {
|
||||
return offsetSource;
|
||||
|
@ -180,7 +180,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
|
|||
BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
|
||||
Set<HighlightFlag> highlightFlags = getFlags(field);
|
||||
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
|
||||
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
|
||||
LabelledCharArrayMatcher[] automata = getAutomata(field, query, highlightFlags);
|
||||
boolean queryHasUnrecognizedPart = false;
|
||||
return new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, queryHasUnrecognizedPart, highlightFlags);
|
||||
}
|
||||
|
|
|
@ -96,7 +96,7 @@ class MultiTermIntervalsSource extends IntervalsSource {
|
|||
|
||||
@Override
|
||||
public void visit(String field, QueryVisitor visitor) {
|
||||
visitor.visitLeaf(new IntervalQuery(field, this));
|
||||
automaton.visit(visitor, new IntervalQuery(field, this), field);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
Loading…
Reference in New Issue