mirror of https://github.com/apache/lucene.git
LUCENE-7719: Generalize UnifiedHighlighter's support for AutomatonQuery
This commit is contained in:
parent
b49ce68935
commit
d0b9d3459f
|
@ -130,6 +130,9 @@ Other
|
|||
* LUCENE-7852: Correct copyright year(s) in lucene/LICENSE.txt file.
|
||||
(Christine Poerschke, Steve Rowe)
|
||||
|
||||
* LUCENE-7719: Generalized the UnifiedHighlighter's support for AutomatonQuery
|
||||
for character & binary automata. Added AutomatonQuery.isBinary. (David Smiley)
|
||||
|
||||
======================= Lucene 6.7.0 =======================
|
||||
|
||||
New Features
|
||||
|
|
|
@ -51,6 +51,7 @@ public class AutomatonQuery extends MultiTermQuery {
|
|||
protected final CompiledAutomaton compiled;
|
||||
/** term containing the field, and possibly some pattern structure */
|
||||
protected final Term term;
|
||||
protected final boolean automatonIsBinary;
|
||||
|
||||
/**
|
||||
* Create a new AutomatonQuery from an {@link Automaton}.
|
||||
|
@ -98,6 +99,7 @@ public class AutomatonQuery extends MultiTermQuery {
|
|||
super(term.field());
|
||||
this.term = term;
|
||||
this.automaton = automaton;
|
||||
this.automatonIsBinary = isBinary;
|
||||
// TODO: we could take isFinite too, to save a bit of CPU in CompiledAutomaton ctor?:
|
||||
this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates, isBinary);
|
||||
}
|
||||
|
@ -154,4 +156,9 @@ public class AutomatonQuery extends MultiTermQuery {
|
|||
public Automaton getAutomaton() {
|
||||
return automaton;
|
||||
}
|
||||
|
||||
/** Is this a binary (byte) oriented automaton. See the constructor. */
|
||||
public boolean isAutomatonBinary() {
|
||||
return automatonIsBinary;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -83,8 +83,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
|
|||
return allAutomata.get(0);
|
||||
}
|
||||
//TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
|
||||
// could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton
|
||||
// by MultiTermHighlighting.
|
||||
// could union them all. But it's not exposed, and sometimes the automaton is byte (not char) oriented
|
||||
|
||||
// Return an aggregate CharacterRunAutomaton of others
|
||||
return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
|
||||
|
|
|
@ -19,12 +19,10 @@ package org.apache.lucene.search.uhighlight;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.AutomatonQuery;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
|
@ -32,19 +30,17 @@ import org.apache.lucene.search.BoostQuery;
|
|||
import org.apache.lucene.search.ConstantScoreQuery;
|
||||
import org.apache.lucene.search.DisjunctionMaxQuery;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermRangeQuery;
|
||||
import org.apache.lucene.search.spans.SpanBoostQuery;
|
||||
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanNotQuery;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanPositionCheckQuery;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
|
@ -110,18 +106,6 @@ class MultiTermHighlighting {
|
|||
} else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) {
|
||||
list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(),
|
||||
fieldMatcher, lookInSpan, preRewriteFunc)));
|
||||
} else if (query instanceof PrefixQuery) {
|
||||
final PrefixQuery pq = (PrefixQuery) query;
|
||||
Term prefix = pq.getPrefix();
|
||||
if (fieldMatcher.test(prefix.field())) {
|
||||
list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()),
|
||||
Automata.makeAnyString())) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return pq.toString();
|
||||
}
|
||||
});
|
||||
}
|
||||
} else if (query instanceof FuzzyQuery) {
|
||||
final FuzzyQuery fq = (FuzzyQuery) query;
|
||||
if (fieldMatcher.test(fq.getField())) {
|
||||
|
@ -143,69 +127,63 @@ class MultiTermHighlighting {
|
|||
}
|
||||
});
|
||||
}
|
||||
} else if (query instanceof TermRangeQuery) {
|
||||
final TermRangeQuery tq = (TermRangeQuery) query;
|
||||
if (fieldMatcher.test(tq.getField())) {
|
||||
final CharsRef lowerBound;
|
||||
if (tq.getLowerTerm() == null) {
|
||||
lowerBound = null;
|
||||
} else {
|
||||
lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
|
||||
}
|
||||
|
||||
final CharsRef upperBound;
|
||||
if (tq.getUpperTerm() == null) {
|
||||
upperBound = null;
|
||||
} else {
|
||||
upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
|
||||
}
|
||||
|
||||
final boolean includeLower = tq.includesLower();
|
||||
final boolean includeUpper = tq.includesUpper();
|
||||
final CharsRef scratch = new CharsRef();
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
|
||||
|
||||
// this is *not* an automaton, but its very simple
|
||||
list.add(new CharacterRunAutomaton(Automata.makeEmpty()) {
|
||||
@Override
|
||||
public boolean run(char[] s, int offset, int length) {
|
||||
scratch.chars = s;
|
||||
scratch.offset = offset;
|
||||
scratch.length = length;
|
||||
|
||||
if (lowerBound != null) {
|
||||
int cmp = comparator.compare(scratch, lowerBound);
|
||||
if (cmp < 0 || (!includeLower && cmp == 0)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (upperBound != null) {
|
||||
int cmp = comparator.compare(scratch, upperBound);
|
||||
if (cmp > 0 || (!includeUpper && cmp == 0)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return tq.toString();
|
||||
}
|
||||
});
|
||||
}
|
||||
} else if (query instanceof AutomatonQuery) {
|
||||
final AutomatonQuery aq = (AutomatonQuery) query;
|
||||
if (fieldMatcher.test(aq.getField())) {
|
||||
list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return aq.toString();
|
||||
}
|
||||
});
|
||||
|
||||
if (aq.isAutomatonBinary() == false) { // note: is the case for WildcardQuery, RegexpQuery
|
||||
list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
|
||||
@Override
|
||||
public String toString() {
|
||||
return aq.toString();
|
||||
}
|
||||
});
|
||||
} else { // note: is the case for PrefixQuery, TermRangeQuery
|
||||
// byte oriented automaton:
|
||||
list.add(new CharacterRunAutomaton(Automata.makeEmpty()) { // empty here is bogus just to satisfy API
|
||||
// TODO can we get access to the aq.compiledAutomaton.runAutomaton ?
|
||||
ByteRunAutomaton byteRunAutomaton =
|
||||
new ByteRunAutomaton(aq.getAutomaton(), true, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
|
||||
@Override
|
||||
public boolean run(char[] chars, int offset, int length) {
|
||||
int state = 0;
|
||||
final int maxIdx = offset + length;
|
||||
for (int i = offset; i < maxIdx; i++) {
|
||||
final int code = chars[i];
|
||||
int b;
|
||||
// UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8 )
|
||||
if (code < 0x80) {
|
||||
state = byteRunAutomaton.step(state, code);
|
||||
if (state == -1) return false;
|
||||
} else if (code < 0x800) {
|
||||
b = (0xC0 | (code >> 6));
|
||||
state = byteRunAutomaton.step(state, b);
|
||||
if (state == -1) return false;
|
||||
b = (0x80 | (code & 0x3F));
|
||||
state = byteRunAutomaton.step(state, b);
|
||||
if (state == -1) return false;
|
||||
} else {
|
||||
// more complex
|
||||
byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
|
||||
int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
|
||||
for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
|
||||
state = byteRunAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
|
||||
if (state == -1) return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return byteRunAutomaton.isAccept(state);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return aq.toString();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return list.toArray(new CharacterRunAutomaton[list.size()]);
|
||||
|
|
|
@ -24,11 +24,13 @@ import java.util.List;
|
|||
import java.util.Objects;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.KeywordAnalyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
@ -63,16 +65,15 @@ import org.apache.lucene.search.spans.SpanQuery;
|
|||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.search.spans.SpanWeight;
|
||||
import org.apache.lucene.store.BaseDirectoryWrapper;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
|
||||
/**
|
||||
* Some tests that highlight wildcard, fuzzy, etc queries.
|
||||
*/
|
||||
@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom", "Lucene3x"})
|
||||
@LuceneTestCase.SuppressSysoutChecks(bugUrl = "")//Gradle interferes with this Lucene test rule
|
||||
public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
|
||||
|
||||
final FieldType fieldType;
|
||||
|
@ -1079,4 +1080,66 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
|
|||
assertEquals("[<b>я</b>]", Arrays.toString(snippets));
|
||||
ir.close();
|
||||
}
|
||||
|
||||
// LUCENE-7719
|
||||
public void testMultiByteMTQ() throws IOException {
|
||||
Analyzer analyzer = new KeywordAnalyzer();
|
||||
try (RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer)) {
|
||||
for (int attempt = 0; attempt < 20; attempt++) {
|
||||
iw.deleteAll();
|
||||
String field = "title";
|
||||
String value = RandomStrings.randomUnicodeOfLength(random(), 3);
|
||||
if (value.contains(UnifiedHighlighter.MULTIVAL_SEP_CHAR+"")) { // will throw things off
|
||||
continue;
|
||||
}
|
||||
int[] valuePoints = value.codePoints().toArray();
|
||||
|
||||
iw.addDocument(Collections.singleton(
|
||||
new Field(field, value, fieldType)));
|
||||
iw.commit();
|
||||
try (IndexReader ir = iw.getReader()) {
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
|
||||
highlighter.setBreakIterator(WholeBreakIterator::new);
|
||||
|
||||
// Test PrefixQuery
|
||||
Query query = new PrefixQuery(new Term(field,
|
||||
UnicodeUtil.newString(valuePoints, 0, 1)));
|
||||
highlightAndAssertMatch(searcher, highlighter, query, field, value);
|
||||
|
||||
// Test TermRangeQuery
|
||||
query = new TermRangeQuery(field,
|
||||
new BytesRef(value),
|
||||
new BytesRef(value),
|
||||
true, true );
|
||||
highlightAndAssertMatch(searcher, highlighter, query, field, value);
|
||||
|
||||
// Test FuzzyQuery
|
||||
query = new FuzzyQuery(new Term(field, value + "Z"), 1);
|
||||
highlightAndAssertMatch(searcher, highlighter, query, field, value);
|
||||
|
||||
if (valuePoints.length != 3) {
|
||||
continue; // even though we ask RandomStrings for a String with 3 code points, it seems sometimes it's less
|
||||
}
|
||||
|
||||
// Test WildcardQuery
|
||||
query = new WildcardQuery(new Term(field,
|
||||
new StringBuilder()
|
||||
.append(WildcardQuery.WILDCARD_ESCAPE).appendCodePoint(valuePoints[0])
|
||||
.append(WildcardQuery.WILDCARD_CHAR)
|
||||
.append(WildcardQuery.WILDCARD_ESCAPE).appendCodePoint(valuePoints[2]).toString()));
|
||||
highlightAndAssertMatch(searcher, highlighter, query, field, value);
|
||||
|
||||
//TODO hmmm; how to randomly generate RegexpQuery? Low priority; we've covered the others well.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void highlightAndAssertMatch(IndexSearcher searcher, UnifiedHighlighter highlighter, Query query, String field, String fieldVal) throws IOException {
|
||||
TopDocs topDocs = searcher.search(query, 1);
|
||||
assertEquals(1, topDocs.totalHits);
|
||||
String[] snippets = highlighter.highlight(field, query, topDocs);
|
||||
assertEquals("[<b>"+fieldVal+"</b>]", Arrays.toString(snippets));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue