From d0b9d3459fd097dba677cdda170632f6fca5e042 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Fri, 16 Jun 2017 22:57:51 -0400 Subject: [PATCH] LUCENE-7719: Generalize UnifiedHighlighter's support for AutomatonQuery --- lucene/CHANGES.txt | 3 + .../apache/lucene/search/AutomatonQuery.java | 7 + .../uhighlight/MemoryIndexOffsetStrategy.java | 3 +- .../uhighlight/MultiTermHighlighting.java | 132 ++++++++---------- .../uhighlight/TestUnifiedHighlighterMTQ.java | 69 ++++++++- 5 files changed, 132 insertions(+), 82 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 595ed120f35..e708888739c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -130,6 +130,9 @@ Other * LUCENE-7852: Correct copyright year(s) in lucene/LICENSE.txt file. (Christine Poerschke, Steve Rowe) +* LUCENE-7719: Generalized the UnifiedHighlighter's support for AutomatonQuery + for character & binary automata. Added AutomatonQuery.isBinary. (David Smiley) + ======================= Lucene 6.7.0 ======================= New Features diff --git a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java index 3b113a28a31..7fb155d78e2 100644 --- a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java @@ -51,6 +51,7 @@ public class AutomatonQuery extends MultiTermQuery { protected final CompiledAutomaton compiled; /** term containing the field, and possibly some pattern structure */ protected final Term term; + protected final boolean automatonIsBinary; /** * Create a new AutomatonQuery from an {@link Automaton}. @@ -98,6 +99,7 @@ public class AutomatonQuery extends MultiTermQuery { super(term.field()); this.term = term; this.automaton = automaton; + this.automatonIsBinary = isBinary; // TODO: we could take isFinite too, to save a bit of CPU in CompiledAutomaton ctor?: this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates, isBinary); } @@ -154,4 +156,9 @@ public class AutomatonQuery extends MultiTermQuery { public Automaton getAutomaton() { return automaton; } + + /** Is this a binary (byte) oriented automaton. See the constructor. */ + public boolean isAutomatonBinary() { + return automatonIsBinary; + } } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java index 0001a801f8c..cf7a3c91f7e 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java @@ -83,8 +83,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy { return allAutomata.get(0); } //TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we - // could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton - // by MultiTermHighlighting. + // could union them all. But it's not exposed, and sometimes the automaton is byte (not char) oriented // Return an aggregate CharacterRunAutomaton of others return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java index 15f4bddf7e5..e745d78c534 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java @@ -19,12 +19,10 @@ package org.apache.lucene.search.uhighlight; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import java.util.Comparator; import java.util.List; import java.util.function.Function; import java.util.function.Predicate; -import org.apache.lucene.index.Term; import org.apache.lucene.search.AutomatonQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; @@ -32,19 +30,17 @@ import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.FuzzyQuery; -import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.spans.SpanBoostQuery; import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNotQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanPositionCheckQuery; -import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.LevenshteinAutomata; import org.apache.lucene.util.automaton.Operations; @@ -110,18 +106,6 @@ class MultiTermHighlighting { } else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) { list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper) query).getWrappedQuery(), fieldMatcher, lookInSpan, preRewriteFunc))); - } else if (query instanceof PrefixQuery) { - final PrefixQuery pq = (PrefixQuery) query; - Term prefix = pq.getPrefix(); - if (fieldMatcher.test(prefix.field())) { - list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()), - Automata.makeAnyString())) { - @Override - public String toString() { - return pq.toString(); - } - }); - } } else if (query instanceof FuzzyQuery) { final FuzzyQuery fq = (FuzzyQuery) query; if (fieldMatcher.test(fq.getField())) { @@ -143,69 +127,63 @@ class MultiTermHighlighting { } }); } - } else if (query instanceof TermRangeQuery) { - final TermRangeQuery tq = (TermRangeQuery) query; - if (fieldMatcher.test(tq.getField())) { - final CharsRef lowerBound; - if (tq.getLowerTerm() == null) { - lowerBound = null; - } else { - lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString()); - } - - final CharsRef upperBound; - if (tq.getUpperTerm() == null) { - upperBound = null; - } else { - upperBound = new CharsRef(tq.getUpperTerm().utf8ToString()); - } - - final boolean includeLower = tq.includesLower(); - final boolean includeUpper = tq.includesUpper(); - final CharsRef scratch = new CharsRef(); - - @SuppressWarnings("deprecation") - final Comparator comparator = CharsRef.getUTF16SortedAsUTF8Comparator(); - - // this is *not* an automaton, but its very simple - list.add(new CharacterRunAutomaton(Automata.makeEmpty()) { - @Override - public boolean run(char[] s, int offset, int length) { - scratch.chars = s; - scratch.offset = offset; - scratch.length = length; - - if (lowerBound != null) { - int cmp = comparator.compare(scratch, lowerBound); - if (cmp < 0 || (!includeLower && cmp == 0)) { - return false; - } - } - - if (upperBound != null) { - int cmp = comparator.compare(scratch, upperBound); - if (cmp > 0 || (!includeUpper && cmp == 0)) { - return false; - } - } - return true; - } - - @Override - public String toString() { - return tq.toString(); - } - }); - } } else if (query instanceof AutomatonQuery) { final AutomatonQuery aq = (AutomatonQuery) query; if (fieldMatcher.test(aq.getField())) { - list.add(new CharacterRunAutomaton(aq.getAutomaton()) { - @Override - public String toString() { - return aq.toString(); - } - }); + + if (aq.isAutomatonBinary() == false) { // note: is the case for WildcardQuery, RegexpQuery + list.add(new CharacterRunAutomaton(aq.getAutomaton()) { + @Override + public String toString() { + return aq.toString(); + } + }); + } else { // note: is the case for PrefixQuery, TermRangeQuery + // byte oriented automaton: + list.add(new CharacterRunAutomaton(Automata.makeEmpty()) { // empty here is bogus just to satisfy API + // TODO can we get access to the aq.compiledAutomaton.runAutomaton ? + ByteRunAutomaton byteRunAutomaton = + new ByteRunAutomaton(aq.getAutomaton(), true, Operations.DEFAULT_MAX_DETERMINIZED_STATES); + + @Override + public boolean run(char[] chars, int offset, int length) { + int state = 0; + final int maxIdx = offset + length; + for (int i = offset; i < maxIdx; i++) { + final int code = chars[i]; + int b; + // UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8 ) + if (code < 0x80) { + state = byteRunAutomaton.step(state, code); + if (state == -1) return false; + } else if (code < 0x800) { + b = (0xC0 | (code >> 6)); + state = byteRunAutomaton.step(state, b); + if (state == -1) return false; + b = (0x80 | (code & 0x3F)); + state = byteRunAutomaton.step(state, b); + if (state == -1) return false; + } else { + // more complex + byte[] utf8Bytes = new byte[4 * (maxIdx - i)]; + int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes); + for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) { + state = byteRunAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF); + if (state == -1) return false; + } + break; + } + } + return byteRunAutomaton.isAccept(state); + } + + @Override + public String toString() { + return aq.toString(); + } + }); + } + } } return list.toArray(new CharacterRunAutomaton[list.size()]); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java index 57f174f6da6..ac99c7926dc 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java @@ -24,11 +24,13 @@ import java.util.List; import java.util.Objects; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; +import com.carrotsearch.randomizedtesting.generators.RandomStrings; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -63,16 +65,15 @@ import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanWeight; import org.apache.lucene.store.BaseDirectoryWrapper; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; +import org.apache.lucene.util.UnicodeUtil; import org.junit.After; import org.junit.Before; /** * Some tests that highlight wildcard, fuzzy, etc queries. */ -@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom", "Lucene3x"}) -@LuceneTestCase.SuppressSysoutChecks(bugUrl = "")//Gradle interferes with this Lucene test rule public class TestUnifiedHighlighterMTQ extends LuceneTestCase { final FieldType fieldType; @@ -1079,4 +1080,66 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase { assertEquals("[я]", Arrays.toString(snippets)); ir.close(); } + + // LUCENE-7719 + public void testMultiByteMTQ() throws IOException { + Analyzer analyzer = new KeywordAnalyzer(); + try (RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer)) { + for (int attempt = 0; attempt < 20; attempt++) { + iw.deleteAll(); + String field = "title"; + String value = RandomStrings.randomUnicodeOfLength(random(), 3); + if (value.contains(UnifiedHighlighter.MULTIVAL_SEP_CHAR+"")) { // will throw things off + continue; + } + int[] valuePoints = value.codePoints().toArray(); + + iw.addDocument(Collections.singleton( + new Field(field, value, fieldType))); + iw.commit(); + try (IndexReader ir = iw.getReader()) { + IndexSearcher searcher = newSearcher(ir); + UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer); + highlighter.setBreakIterator(WholeBreakIterator::new); + + // Test PrefixQuery + Query query = new PrefixQuery(new Term(field, + UnicodeUtil.newString(valuePoints, 0, 1))); + highlightAndAssertMatch(searcher, highlighter, query, field, value); + + // Test TermRangeQuery + query = new TermRangeQuery(field, + new BytesRef(value), + new BytesRef(value), + true, true ); + highlightAndAssertMatch(searcher, highlighter, query, field, value); + + // Test FuzzyQuery + query = new FuzzyQuery(new Term(field, value + "Z"), 1); + highlightAndAssertMatch(searcher, highlighter, query, field, value); + + if (valuePoints.length != 3) { + continue; // even though we ask RandomStrings for a String with 3 code points, it seems sometimes it's less + } + + // Test WildcardQuery + query = new WildcardQuery(new Term(field, + new StringBuilder() + .append(WildcardQuery.WILDCARD_ESCAPE).appendCodePoint(valuePoints[0]) + .append(WildcardQuery.WILDCARD_CHAR) + .append(WildcardQuery.WILDCARD_ESCAPE).appendCodePoint(valuePoints[2]).toString())); + highlightAndAssertMatch(searcher, highlighter, query, field, value); + + //TODO hmmm; how to randomly generate RegexpQuery? Low priority; we've covered the others well. + } + } + } + } + + private void highlightAndAssertMatch(IndexSearcher searcher, UnifiedHighlighter highlighter, Query query, String field, String fieldVal) throws IOException { + TopDocs topDocs = searcher.search(query, 1); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighter.highlight(field, query, topDocs); + assertEquals("["+fieldVal+"]", Arrays.toString(snippets)); + } }