LUCENE-7719: Generalize UnifiedHighlighter's support for AutomatonQuery

This commit is contained in:
David Smiley 2017-06-16 22:57:51 -04:00
parent b49ce68935
commit d0b9d3459f
5 changed files with 132 additions and 82 deletions

View File

@ -130,6 +130,9 @@ Other
* LUCENE-7852: Correct copyright year(s) in lucene/LICENSE.txt file.
(Christine Poerschke, Steve Rowe)
* LUCENE-7719: Generalized the UnifiedHighlighter's support for AutomatonQuery
for character & binary automata. Added AutomatonQuery.isAutomatonBinary. (David Smiley)
======================= Lucene 6.7.0 =======================
New Features

View File

@ -51,6 +51,7 @@ public class AutomatonQuery extends MultiTermQuery {
protected final CompiledAutomaton compiled;
/** term containing the field, and possibly some pattern structure */
protected final Term term;
protected final boolean automatonIsBinary;
/**
* Create a new AutomatonQuery from an {@link Automaton}.
@ -98,6 +99,7 @@ public class AutomatonQuery extends MultiTermQuery {
super(term.field());
this.term = term;
this.automaton = automaton;
this.automatonIsBinary = isBinary;
// TODO: we could take isFinite too, to save a bit of CPU in CompiledAutomaton ctor?:
this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates, isBinary);
}
@ -154,4 +156,9 @@ public class AutomatonQuery extends MultiTermQuery {
/**
 * Returns the {@link Automaton} this query was constructed from. Note that it may be either
 * character or byte (binary) oriented; see {@code isAutomatonBinary()}.
 */
public Automaton getAutomaton() {
return automaton;
}
/**
 * Whether the automaton accepts raw bytes (e.g. UTF-8) rather than Unicode code points.
 * This reflects the {@code isBinary} flag passed to the constructor.
 *
 * @return true if the automaton is byte (binary) oriented; false if character oriented
 */
public boolean isAutomatonBinary() {
return automatonIsBinary;
}
}

View File

@ -83,8 +83,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
return allAutomata.get(0);
}
//TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
// could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton
// by MultiTermHighlighting.
// could union them all. But it's not exposed, and sometimes the automaton is byte (not char) oriented
// Return an aggregate CharacterRunAutomaton of others
return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used

View File

@ -19,12 +19,10 @@ package org.apache.lucene.search.uhighlight;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@ -32,19 +30,17 @@ import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.spans.SpanBoostQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanPositionCheckQuery;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
@ -110,18 +106,6 @@ class MultiTermHighlighting {
} else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) {
list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(),
fieldMatcher, lookInSpan, preRewriteFunc)));
} else if (query instanceof PrefixQuery) {
final PrefixQuery pq = (PrefixQuery) query;
Term prefix = pq.getPrefix();
if (fieldMatcher.test(prefix.field())) {
list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()),
Automata.makeAnyString())) {
@Override
public String toString() {
return pq.toString();
}
});
}
} else if (query instanceof FuzzyQuery) {
final FuzzyQuery fq = (FuzzyQuery) query;
if (fieldMatcher.test(fq.getField())) {
@ -143,69 +127,63 @@ class MultiTermHighlighting {
}
});
}
} else if (query instanceof TermRangeQuery) {
final TermRangeQuery tq = (TermRangeQuery) query;
if (fieldMatcher.test(tq.getField())) {
final CharsRef lowerBound;
if (tq.getLowerTerm() == null) {
lowerBound = null;
} else {
lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
}
final CharsRef upperBound;
if (tq.getUpperTerm() == null) {
upperBound = null;
} else {
upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
}
final boolean includeLower = tq.includesLower();
final boolean includeUpper = tq.includesUpper();
final CharsRef scratch = new CharsRef();
@SuppressWarnings("deprecation")
final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
// this is *not* an automaton, but its very simple
list.add(new CharacterRunAutomaton(Automata.makeEmpty()) {
@Override
public boolean run(char[] s, int offset, int length) {
scratch.chars = s;
scratch.offset = offset;
scratch.length = length;
if (lowerBound != null) {
int cmp = comparator.compare(scratch, lowerBound);
if (cmp < 0 || (!includeLower && cmp == 0)) {
return false;
}
}
if (upperBound != null) {
int cmp = comparator.compare(scratch, upperBound);
if (cmp > 0 || (!includeUpper && cmp == 0)) {
return false;
}
}
return true;
}
@Override
public String toString() {
return tq.toString();
}
});
}
} else if (query instanceof AutomatonQuery) {
final AutomatonQuery aq = (AutomatonQuery) query;
if (fieldMatcher.test(aq.getField())) {
list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
@Override
public String toString() {
return aq.toString();
}
});
if (aq.isAutomatonBinary() == false) { // note: is the case for WildcardQuery, RegexpQuery
list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
@Override
public String toString() {
return aq.toString();
}
});
} else { // note: is the case for PrefixQuery, TermRangeQuery
// byte oriented automaton:
list.add(new CharacterRunAutomaton(Automata.makeEmpty()) { // empty here is bogus just to satisfy API
// TODO can we get access to the aq.compiledAutomaton.runAutomaton ?
ByteRunAutomaton byteRunAutomaton =
new ByteRunAutomaton(aq.getAutomaton(), true, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
@Override
public boolean run(char[] chars, int offset, int length) {
int state = 0;
final int maxIdx = offset + length;
for (int i = offset; i < maxIdx; i++) {
final int code = chars[i];
int b;
// UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8 )
if (code < 0x80) {
state = byteRunAutomaton.step(state, code);
if (state == -1) return false;
} else if (code < 0x800) {
b = (0xC0 | (code >> 6));
state = byteRunAutomaton.step(state, b);
if (state == -1) return false;
b = (0x80 | (code & 0x3F));
state = byteRunAutomaton.step(state, b);
if (state == -1) return false;
} else {
// more complex
byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
state = byteRunAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
if (state == -1) return false;
}
break;
}
}
return byteRunAutomaton.isAccept(state);
}
@Override
public String toString() {
return aq.toString();
}
});
}
}
}
return list.toArray(new CharacterRunAutomaton[list.size()]);

View File

@ -24,11 +24,13 @@ import java.util.List;
import java.util.Objects;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -63,16 +65,15 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.UnicodeUtil;
import org.junit.After;
import org.junit.Before;
/**
* Some tests that highlight wildcard, fuzzy, etc queries.
*/
@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom", "Lucene3x"})
@LuceneTestCase.SuppressSysoutChecks(bugUrl = "")//Gradle interferes with this Lucene test rule
public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
final FieldType fieldType;
@ -1079,4 +1080,66 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
assertEquals("[<b>я</b>]", Arrays.toString(snippets));
ir.close();
}
// LUCENE-7719
/**
 * Randomized test that multi-term queries over multi-byte (non-ASCII) terms highlight
 * correctly now that binary (byte-oriented) automata are supported. Indexes a short random
 * Unicode value with a KeywordAnalyzer, then checks PrefixQuery, TermRangeQuery, FuzzyQuery
 * and WildcardQuery each highlight the whole value.
 */
public void testMultiByteMTQ() throws IOException {
Analyzer analyzer = new KeywordAnalyzer();
try (RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer)) {
for (int attempt = 0; attempt < 20; attempt++) {
// fresh single-doc index per attempt so the query must match exactly this value
iw.deleteAll();
String field = "title";
String value = RandomStrings.randomUnicodeOfLength(random(), 3);
if (value.contains(UnifiedHighlighter.MULTIVAL_SEP_CHAR+"")) { // will throw things off
continue;
}
// code points (not chars) — surrogate pairs mean length-in-chars may exceed 3
int[] valuePoints = value.codePoints().toArray();
iw.addDocument(Collections.singleton(
new Field(field, value, fieldType)));
iw.commit();
try (IndexReader ir = iw.getReader()) {
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
// highlight the entire field value as one passage
highlighter.setBreakIterator(WholeBreakIterator::new);
// Test PrefixQuery
Query query = new PrefixQuery(new Term(field,
UnicodeUtil.newString(valuePoints, 0, 1)));
highlightAndAssertMatch(searcher, highlighter, query, field, value);
// Test TermRangeQuery
query = new TermRangeQuery(field,
new BytesRef(value),
new BytesRef(value),
true, true );
highlightAndAssertMatch(searcher, highlighter, query, field, value);
// Test FuzzyQuery
query = new FuzzyQuery(new Term(field, value + "Z"), 1);
highlightAndAssertMatch(searcher, highlighter, query, field, value);
if (valuePoints.length != 3) {
continue; // even though we ask RandomStrings for a String with 3 code points, it seems sometimes it's less
}
// Test WildcardQuery: escape(cp0) ? escape(cp2) — requires exactly 3 code points
query = new WildcardQuery(new Term(field,
new StringBuilder()
.append(WildcardQuery.WILDCARD_ESCAPE).appendCodePoint(valuePoints[0])
.append(WildcardQuery.WILDCARD_CHAR)
.append(WildcardQuery.WILDCARD_ESCAPE).appendCodePoint(valuePoints[2]).toString()));
highlightAndAssertMatch(searcher, highlighter, query, field, value);
//TODO hmmm; how to randomly generate RegexpQuery? Low priority; we've covered the others well.
}
}
}
}
/**
 * Runs {@code query} expecting exactly one hit, then asserts the highlighter wraps the whole
 * field value in bold tags for that hit.
 */
private void highlightAndAssertMatch(IndexSearcher searcher, UnifiedHighlighter highlighter, Query query, String field, String fieldVal) throws IOException {
TopDocs hits = searcher.search(query, 1);
assertEquals(1, hits.totalHits);
String[] fragments = highlighter.highlight(field, query, hits);
String expected = "[<b>" + fieldVal + "</b>]";
assertEquals(expected, Arrays.toString(fragments));
}
}