From da841be88774e7a00b4dd954399c7e997bf15d1a Mon Sep 17 00:00:00 2001 From: David Smiley Date: Tue, 8 Nov 2016 16:13:32 -0500 Subject: [PATCH] LUCENE-7544: UnifiedHighlighter: extension hooks for custom query handling --- lucene/CHANGES.txt | 3 + .../uhighlight/AnalysisOffsetStrategy.java | 15 +++- .../uhighlight/MultiTermHighlighting.java | 32 ++++--- .../search/uhighlight/PhraseHelper.java | 19 +++- .../search/uhighlight/UnifiedHighlighter.java | 35 +++++++- .../uhighlight/TestUnifiedHighlighterMTQ.java | 88 +++++++++++++++++++ .../TestUnifiedHighlighterStrictPhrases.java | 77 ++++++++++++++++ 7 files changed, 250 insertions(+), 19 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0ccb5ee8c59..f64e9e8986c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -67,6 +67,9 @@ Improvements and exclude arguments of SpanNotQuery via negative pre and/or post arguments. (Marc Morissette via David Smiley) +* LUCENE-7544: UnifiedHighlighter: add extension points for handling custom queries. + (Michael Braun, David Smiley) + ======================= Lucene 6.3.0 ======================= API Changes diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java index 553a636ed6a..6b4cc74a48e 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java @@ -19,8 +19,10 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.function.Function; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.FilteringTokenFilter; @@ -30,6 +32,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.Terms; import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.search.Query; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automata; @@ -50,7 +53,9 @@ public class AnalysisOffsetStrategy extends FieldOffsetStrategy { private final LeafReader leafReader; private final CharacterRunAutomaton preMemIndexFilterAutomaton; - public AnalysisOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer analyzer) { + public AnalysisOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper, + CharacterRunAutomaton[] automata, Analyzer analyzer, + Function> multiTermQueryRewrite) { super(field, extractedTerms, phraseHelper, automata); this.analyzer = analyzer; // Automata (Wildcards / MultiTermQuery): @@ -68,7 +73,8 @@ public class AnalysisOffsetStrategy extends FieldOffsetStrategy { memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // preFilter for MemoryIndex - preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, strictPhrases); + preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, strictPhrases, + multiTermQueryRewrite); } else { memoryIndex = null; leafReader = null; @@ -155,7 +161,8 @@ public class AnalysisOffsetStrategy extends FieldOffsetStrategy { */ private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms, CharacterRunAutomaton[] automata, - PhraseHelper strictPhrases) { + PhraseHelper strictPhrases, + Function> multiTermQueryRewrite) { List allAutomata = new ArrayList<>(); if (terms.length > 0) { allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms)))); @@ -163,7 +170,7 @@ public class AnalysisOffsetStrategy extends FieldOffsetStrategy { Collections.addAll(allAutomata, automata); for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) { Collections.addAll(allAutomata, - MultiTermHighlighting.extractAutomata(spanQuery, field, true));//true==lookInSpan + MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan } if (allAutomata.size() == 1) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java index 9498af584a8..e85fa3bffa9 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java @@ -20,8 +20,10 @@ import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Comparator; import java.util.List; +import java.util.function.Function; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -69,34 +71,44 @@ class MultiTermHighlighting { * Extracts all MultiTermQueries for {@code field}, and returns equivalent * automata that will match terms. */ - public static CharacterRunAutomaton[] extractAutomata(Query query, String field, boolean lookInSpan) { + public static CharacterRunAutomaton[] extractAutomata(Query query, String field, boolean lookInSpan, + Function> preRewriteFunc) { List list = new ArrayList<>(); - if (query instanceof BooleanQuery) { + Collection customSubQueries = preRewriteFunc.apply(query); + if (customSubQueries != null) { + for (Query sub : customSubQueries) { + list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc))); + } + } else if (query instanceof BooleanQuery) { for (BooleanClause clause : (BooleanQuery) query) { if (!clause.isProhibited()) { - list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field, lookInSpan, preRewriteFunc))); } } } else if (query instanceof ConstantScoreQuery) { - list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field, lookInSpan, + preRewriteFunc))); } else if (query instanceof DisjunctionMaxQuery) { for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) { - list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc))); } } else if (lookInSpan && query instanceof SpanOrQuery) { for (Query sub : ((SpanOrQuery) query).getClauses()) { - list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc))); } } else if (lookInSpan && query instanceof SpanNearQuery) { for (Query sub : ((SpanNearQuery) query).getClauses()) { - list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc))); } } else if (lookInSpan && query instanceof SpanNotQuery) { - list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field, lookInSpan, + preRewriteFunc))); } else if (lookInSpan && query instanceof SpanPositionCheckQuery) { - list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field, lookInSpan, + preRewriteFunc))); } else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) { - list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper) query).getWrappedQuery(), field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper) query).getWrappedQuery(), field, + lookInSpan, preRewriteFunc))); } else if (query instanceof AutomatonQuery) { final AutomatonQuery aq = (AutomatonQuery) query; if (aq.getField().equals(field)) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java index 5225041f9be..95d51c917da 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java @@ -40,7 +40,7 @@ import java.util.function.Function; public class PhraseHelper { public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_", - spanQuery -> null, true); + spanQuery -> null, query -> null, true); //TODO it seems this ought to be a general thing on Spans? private static final Comparator SPANS_COMPARATOR = (o1, o2) -> { @@ -69,11 +69,14 @@ public class PhraseHelper { * {@code rewriteQueryPred} is an extension hook to override the default choice of * {@link WeightedSpanTermExtractor#mustRewriteQuery(SpanQuery)}. By default unknown query types are rewritten, * so use this to return {@link Boolean#FALSE} if you know the query doesn't need to be rewritten. + * Similarly, {@code preExtractRewriteFunction} is also an extension hook for extract to allow different queries + * to be set before the {@link WeightedSpanTermExtractor}'s extraction is invoked. * {@code ignoreQueriesNeedingRewrite} effectively ignores any query clause that needs to be "rewritten", which is * usually limited to just a {@link SpanMultiTermQueryWrapper} but could be other custom ones. */ public PhraseHelper(Query query, String field, Function rewriteQueryPred, - boolean ignoreQueriesNeedingRewrite) { + Function> preExtractRewriteFunction, + boolean ignoreQueriesNeedingRewrite) { this.fieldName = field; // if null then don't require field match // filter terms to those we want positionInsensitiveTerms = field != null ? new FieldFilteringTermHashSet(field) : new HashSet<>(); @@ -98,6 +101,18 @@ public class PhraseHelper { } } + @Override + protected void extract(Query query, float boost, Map terms) throws IOException { + Collection newQueriesToExtract = preExtractRewriteFunction.apply(query); + if (newQueriesToExtract != null) { + for (Query newQuery : newQueriesToExtract) { + extract(newQuery, boost, terms); + } + } else { + super.extract(query, boost, terms); + } + } + @Override protected boolean isQueryUnsupported(Class clazz) { if (clazz.isAssignableFrom(MultiTermQuery.class)) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index 72be180c177..5f09d84f033 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -21,6 +21,7 @@ import java.nio.charset.StandardCharsets; import java.text.BreakIterator; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.EnumSet; import java.util.HashMap; import java.util.List; @@ -732,7 +733,8 @@ public class UnifiedHighlighter { OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata); switch (offsetSource) { case ANALYSIS: - return new AnalysisOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer()); + return new AnalysisOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(), + this::preMultiTermQueryRewrite); case NONE_NEEDED: return NoOpOffsetStrategy.INSTANCE; case TERM_VECTORS: @@ -776,13 +778,14 @@ public class UnifiedHighlighter { boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES); boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY); return highlightPhrasesStrictly ? - new PhraseHelper(query, field, this::requiresRewrite, !handleMultiTermQuery) : + new PhraseHelper(query, field, this::requiresRewrite, this::preSpanQueryRewrite, !handleMultiTermQuery) : PhraseHelper.NONE; } protected CharacterRunAutomaton[] getAutomata(String field, Query query, EnumSet highlightFlags) { return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY) - ? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES)) + ? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES), + this::preMultiTermQueryRewrite) : ZERO_LEN_AUTOMATA_ARRAY; } @@ -830,6 +833,32 @@ public class UnifiedHighlighter { return null; } + /** + * When highlighting phrases accurately, we may need to handle custom queries that aren't supported in the + * {@link org.apache.lucene.search.highlight.WeightedSpanTermExtractor} as called by the {@code PhraseHelper}. + * Should custom query types be needed, this method should be overriden to return a collection of queries if appropriate, + * or null if nothing to do. If the query is not custom, simply returning null will allow the default rules to apply. + * + * @param query Query to be highlighted + * @return A Collection of Query object(s) if needs to be rewritten, otherwise null. + */ + protected Collection preSpanQueryRewrite(Query query) { + return null; + } + + /** + * When dealing with multi term queries / span queries, we may need to handle custom queries that aren't supported + * by the default automata extraction in {@code MultiTermHighlighting}. This can be overridden to return a collection + * of queries if appropriate, or null if nothing to do. If query is not custom, simply returning null will allow the + * default rules to apply. + * + * @param query Query to be highlighted + * @return A Collection of Query object(s) if needst o be rewritten, otherwise null. + */ + protected Collection preMultiTermQueryRewrite(Query query) { + return null; + } + private DocIdSetIterator asDocIdSetIterator(int[] sortedDocIds) { return new DocIdSetIterator() { int idx = -1; diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java index 63f0bb1ca5b..ddc9507d62b 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java @@ -20,6 +20,8 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; import java.util.Arrays; import java.util.Collections; +import java.util.List; +import java.util.Objects; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.apache.lucene.analysis.Analyzer; @@ -56,6 +58,7 @@ import org.apache.lucene.search.spans.SpanNotQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.SpanWeight; import org.apache.lucene.store.BaseDirectoryWrapper; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; @@ -933,4 +936,89 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase { ir.close(); } + public void testCustomSpanQueryHighlighting() throws Exception { + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer); + Document doc = new Document(); + doc.add(new Field("body", "alpha bravo charlie delta echo foxtrot golf hotel india juliet", fieldType)); + doc.add(newTextField("id", "id", Field.Store.YES)); + + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected List preMultiTermQueryRewrite(Query query) { + if (query instanceof MyWrapperSpanQuery) { + return Collections.singletonList(((MyWrapperSpanQuery) query).originalQuery); + } + return null; + } + }; + + int docId = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc; + + WildcardQuery wildcardQuery = new WildcardQuery(new Term("body", "foxtr*")); + SpanMultiTermQueryWrapper wildcardQueryWrapper = new SpanMultiTermQueryWrapper<>(wildcardQuery); + + SpanQuery wrappedQuery = new MyWrapperSpanQuery(wildcardQueryWrapper); + + BooleanQuery query = new BooleanQuery.Builder() + .add(wrappedQuery, BooleanClause.Occur.SHOULD) + .build(); + + int[] docIds = new int[]{docId}; + + String snippets[] = highlighter.highlightFields(new String[]{"body"}, query, docIds, new int[]{2}).get("body"); + assertEquals(1, snippets.length); + assertEquals("alpha bravo charlie delta echo foxtrot golf hotel india juliet", snippets[0]); + ir.close(); + } + + private static class MyWrapperSpanQuery extends SpanQuery { + + private final SpanQuery originalQuery; + + private MyWrapperSpanQuery(SpanQuery originalQuery) { + this.originalQuery = Objects.requireNonNull(originalQuery); + } + + @Override + public String getField() { + return originalQuery.getField(); + } + + @Override + public String toString(String field) { + return "(Wrapper[" + originalQuery.toString(field)+"])"; + } + + @Override + public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { + return originalQuery.createWeight(searcher, needsScores, boost); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + Query newOriginalQuery = originalQuery.rewrite(reader); + if (newOriginalQuery != originalQuery) { + return new MyWrapperSpanQuery((SpanQuery)newOriginalQuery); + } + return this; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + return originalQuery.equals(((MyWrapperSpanQuery)o).originalQuery); + } + + @Override + public int hashCode() { + return originalQuery.hashCode(); + } + } + } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java index 5fecdc6d5bc..dafb6e23460 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java @@ -17,6 +17,8 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; +import java.util.Collection; +import java.util.Collections; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.apache.lucene.analysis.MockAnalyzer; @@ -29,14 +31,17 @@ import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.Weight; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; import org.apache.lucene.search.spans.SpanNearQuery; @@ -401,4 +406,76 @@ public class TestUnifiedHighlighterStrictPhrases extends LuceneTestCase { Object o = highlighter.highlightWithoutSearcher("body", new MatchNoDocsQuery(), content, 1); assertEquals(content, o); } + + public void testPreSpanQueryRewrite() throws IOException { + indexWriter.addDocument(newDoc("There is no accord and satisfaction with this - Consideration of the accord is arbitrary.")); + initReaderSearcherHighlighter(); + + highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected Collection preSpanQueryRewrite(Query query) { + if (query instanceof MyQuery) { + return Collections.singletonList(((MyQuery)query).wrapped); + } + return null; + } + }; + highlighter.setHighlightPhrasesStrictly(true); + + BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); + Query phraseQuery = new BoostQuery(new PhraseQuery("body", "accord", "and", "satisfaction"), 2.0f); + Query oredTerms = new BooleanQuery.Builder() + .setMinimumNumberShouldMatch(2) + .add(new TermQuery(new Term("body", "accord")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("body", "satisfaction")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("body", "consideration")), BooleanClause.Occur.SHOULD) + .build(); + Query proximityBoostingQuery = new MyQuery(oredTerms); + Query totalQuery = bqBuilder + .add(phraseQuery, BooleanClause.Occur.SHOULD) + .add(proximityBoostingQuery, BooleanClause.Occur.SHOULD) + .build(); + TopDocs topDocs = searcher.search(totalQuery, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighter.highlight("body", totalQuery, topDocs); + assertArrayEquals(new String[]{"There is no accord and satisfaction with this - Consideration of the accord is arbitrary."}, snippets); + } + + private static class MyQuery extends Query { + + private final Query wrapped; + + MyQuery(Query wrapped) { + this.wrapped = wrapped; + } + + @Override + public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { + return wrapped.createWeight(searcher, needsScores, boost); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + Query newWrapped = wrapped.rewrite(reader); + if (newWrapped != wrapped) { + return new MyQuery(newWrapped); + } + return this; + } + + @Override + public String toString(String field) { + return "[[["+wrapped.toString(field)+"]]]"; + } + + @Override + public boolean equals(Object obj) { + return obj != null && obj.getClass() == getClass() && wrapped.equals(((MyQuery)wrapped).wrapped); + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + } }