From 5a4fd86cce57586dd14fb8f11f9b170b121eebc4 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 18 Oct 2018 10:15:57 +0200 Subject: [PATCH] LUCENE-8535: Drop out of the box Block-Join highlight support Highlighter doesn't support ToParent and ToChildBlockJoinQuery out of the box anymore. In oder to highlight on Block-Join Queries a custom WeightedSpanTermExtractor should be used. --- lucene/CHANGES.txt | 4 ++ lucene/MIGRATE.txt | 5 ++ lucene/highlighter/build.xml | 2 - .../lucene/search/highlight/QueryScorer.java | 3 +- .../highlight/WeightedSpanTermExtractor.java | 36 ++++++++--- .../FastVectorHighlighter.java | 6 +- .../search/vectorhighlight/FieldQuery.java | 12 +--- .../search/highlight/HighlighterTest.java | 60 ------------------- .../vectorhighlight/FieldQueryTest.java | 15 ----- .../highlight/DefaultSolrHighlighter.java | 49 ++++++++++++++- 10 files changed, 90 insertions(+), 102 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 50b1cf397ec..a3af603257e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -123,6 +123,10 @@ Changes in Runtime Behavior * LUCENE-8505: IndexWriter#addIndices will now fail if the target index is sorted but the candidate is not. (Jim Ferenczi) +* LUCENE-8535: Highlighter and FVH doesn't support ToParent and ToChildBlockJoinQuery out of the + box anymore. In order to highlight on Block-Join Queries a custom WeightedSpanTermExtractor / FieldQuery + should be used. (Simon Willnauer, Jim Ferenczi, Julie Tibshiran) + New Features * LUCENE-8340: LongPoint#newDistanceQuery may be used to boost scores based on diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index 1b56b6465d6..0515ff8123c 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -140,3 +140,8 @@ a LowerCaseFilter CharTokenizer now only performs tokenization. To perform any type of filtering use a TokenFilter chain as you would with any other Tokenizer. + +## Highlighter and FastVectorHighlighter no longer support ToParent/ToChildBlockJoinQuery + +Both Highlighter and FastVectorHighlighter need a custom WeightedSpanTermExtractor or FieldQuery respectively +in order to support ToParent/ToChildBlockJoinQuery. diff --git a/lucene/highlighter/build.xml b/lucene/highlighter/build.xml index ed6cc74be8a..6ecd793c9ed 100644 --- a/lucene/highlighter/build.xml +++ b/lucene/highlighter/build.xml @@ -31,14 +31,12 @@ - - diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java index 25d9b3c7e5a..5452fc7e830 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java @@ -229,8 +229,7 @@ public class QueryScorer implements Scorer { } protected WeightedSpanTermExtractor newTermExtractor(String defaultField) { - return defaultField == null ? new WeightedSpanTermExtractor() - : new WeightedSpanTermExtractor(defaultField); + return new WeightedSpanTermExtractor(defaultField); } /* diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java index e0655c5ee58..bc026210c21 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java @@ -54,8 +54,6 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.join.ToChildBlockJoinQuery; -import org.apache.lucene.search.join.ToParentBlockJoinQuery; import org.apache.lucene.search.spans.FieldMaskingSpanQuery; import org.apache.lucene.search.spans.SpanFirstQuery; import org.apache.lucene.search.spans.SpanNearQuery; @@ -71,6 +69,31 @@ import org.apache.lucene.util.IOUtils; /** * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether * {@link Term}s from the {@link Query} are contained in a supplied {@link TokenStream}. + * + * In order to support additional, by default unsupported queries, subclasses can override + * {@link #extract(Query, float, Map)} for extracting wrapped or delegate queries and + * {@link #extractUnknownQuery(Query, Map)} to process custom leaf queries: + *
+ * 
+ *    WeightedSpanTermExtractor extractor = new WeightedSpanTermExtractor() {
+ *        protected void extract(Query query, float boost, Map<String, WeightedSpanTerm>terms) throws IOException {
+ *          if (query instanceof QueryWrapper) {
+ *            extract(((QueryWrapper)query).getQuery(), boost, terms);
+ *          } else {
+ *            super.extract(query, boost, terms);
+ *          }
+ *        }
+ *
+ *        protected void extractUnknownQuery(Query query, Map<String, WeightedSpanTerm> terms) throws IOException {
+ *          if (query instanceOf CustomTermQuery) {
+ *            Term term = ((CustomTermQuery) query).getTerm();
+ *            terms.put(term.field(), new WeightedSpanTerm(1, term.text()));
+ *          }
+ *        }
+ *    };
+ * }
+ * 
+ * 
*/ public class WeightedSpanTermExtractor { @@ -85,12 +108,11 @@ public class WeightedSpanTermExtractor { private LeafReader internalReader = null; public WeightedSpanTermExtractor() { + this(null); } public WeightedSpanTermExtractor(String defaultField) { - if (defaultField != null) { - this.defaultField = defaultField; - } + this.defaultField = defaultField; } /** @@ -154,10 +176,6 @@ public class WeightedSpanTermExtractor { for (Query clause : ((DisjunctionMaxQuery) query)) { extract(clause, boost, terms); } - } else if (query instanceof ToParentBlockJoinQuery) { - extract(((ToParentBlockJoinQuery) query).getChildQuery(), boost, terms); - } else if (query instanceof ToChildBlockJoinQuery) { - extract(((ToChildBlockJoinQuery) query).getParentQuery(), boost, terms); } else if (query instanceof MultiPhraseQuery) { final MultiPhraseQuery mpq = (MultiPhraseQuery) query; final Term[][] termArrays = mpq.getTermArrays(); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java index 4845de69e70..ac7e9d0a3c4 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java @@ -31,8 +31,8 @@ import org.apache.lucene.search.highlight.Encoder; public class FastVectorHighlighter { public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true; public static final boolean DEFAULT_FIELD_MATCH = true; - private final boolean phraseHighlight; - private final boolean fieldMatch; + protected final boolean phraseHighlight; + protected final boolean fieldMatch; private final FragListBuilder fragListBuilder; private final FragmentsBuilder fragmentsBuilder; private int phraseLimit = Integer.MAX_VALUE; @@ -80,7 +80,7 @@ public class FastVectorHighlighter { // TODO: should we deprecate this? // because if there is no reader, then we cannot rewrite MTQ. try { - return new FieldQuery( query, null, phraseHighlight, fieldMatch ); + return getFieldQuery(query, null); } catch (IOException e) { // should never be thrown when reader is null throw new RuntimeException (e); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java index cb502343c89..8584b404bbb 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java @@ -38,7 +38,6 @@ import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.join.ToParentBlockJoinQuery; import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; /** @@ -62,7 +61,7 @@ public class FieldQuery { // The maximum number of different matching terms accumulated from any one MultiTermQuery private static final int MAX_MTQ_TERMS = 1024; - FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException { + public FieldQuery(Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch) throws IOException { this.fieldMatch = fieldMatch; Set flatQueries = new LinkedHashSet<>(); flatten( query, reader, flatQueries, 1f ); @@ -95,7 +94,7 @@ public class FieldQuery { this (query, null, phraseHighlight, fieldMatch); } - void flatten( Query sourceQuery, IndexReader reader, Collection flatQueries, float boost ) throws IOException{ + protected void flatten( Query sourceQuery, IndexReader reader, Collection flatQueries, float boost ) throws IOException { while (sourceQuery instanceof BoostQuery) { BoostQuery bq = (BoostQuery) sourceQuery; sourceQuery = bq.getQuery(); @@ -141,15 +140,10 @@ public class FieldQuery { flatten( q, reader, flatQueries, boost); } } else if (sourceQuery instanceof FunctionScoreQuery) { - final Query q = ((FunctionScoreQuery)sourceQuery).getWrappedQuery(); + final Query q = ((FunctionScoreQuery) sourceQuery).getWrappedQuery(); if (q != null) { flatten(q, reader, flatQueries, boost); } - } else if (sourceQuery instanceof ToParentBlockJoinQuery) { - Query childQuery = ((ToParentBlockJoinQuery) sourceQuery).getChildQuery(); - if (childQuery != null) { - flatten(childQuery, reader, flatQueries, boost); - } } else if (reader != null) { Query query = sourceQuery; Query rewritten; diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java index 89c81db01dc..898846fa43c 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java @@ -81,11 +81,6 @@ import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner; -import org.apache.lucene.search.join.BitSetProducer; -import org.apache.lucene.search.join.QueryBitSetProducer; -import org.apache.lucene.search.join.ScoreMode; -import org.apache.lucene.search.join.ToChildBlockJoinQuery; -import org.apache.lucene.search.join.ToParentBlockJoinQuery; import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNotQuery; @@ -628,61 +623,6 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte } - public void testToParentBlockJoinQuery() throws Exception { - BitSetProducer parentFilter = new QueryBitSetProducer( - new TermQuery(new Term(FIELD_NAME, "parent"))); - - query = new ToParentBlockJoinQuery(new TermQuery(new Term(FIELD_NAME, "child")), - parentFilter, ScoreMode.None); - searcher = newSearcher(reader); - hits = searcher.search(query, 100); - int maxNumFragmentsRequired = 2; - - QueryScorer scorer = new QueryScorer(query, FIELD_NAME); - Highlighter highlighter = new Highlighter(this, scorer); - - for (int i = 0; i < hits.totalHits.value; i++) { - String text = "child document"; - TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text); - - highlighter.setTextFragmenter(new SimpleFragmenter(40)); - highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); - } - - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", - numHighlights == 1); - } - - public void testToChildBlockJoinQuery() throws Exception { - BitSetProducer parentFilter = new QueryBitSetProducer( - new TermQuery(new Term(FIELD_NAME, "parent"))); - - BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder(); - booleanQuery.add(new ToChildBlockJoinQuery(new TermQuery( - new Term(FIELD_NAME, "parent")), parentFilter), Occur.MUST); - booleanQuery.add(new TermQuery(new Term(FIELD_NAME, "child")), Occur.MUST); - query = booleanQuery.build(); - - searcher = newSearcher(reader); - hits = searcher.search(query, 100); - int maxNumFragmentsRequired = 2; - - QueryScorer scorer = new QueryScorer(query, FIELD_NAME); - Highlighter highlighter = new Highlighter(this, scorer); - - for (int i = 0; i < hits.totalHits.value; i++) { - String text = "parent document"; - final int docId = hits.scoreDocs[i].doc; - TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId); - - highlighter.setTextFragmenter(new SimpleFragmenter(40)); - highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); - } - - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", - numHighlights == 1); - } - public void testSimpleQueryScorerPhraseHighlighting2() throws Exception { PhraseQuery phraseQuery = new PhraseQuery(5, FIELD_NAME, "text", "piece", "long"); doSearching(phraseQuery); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java index 6b1e49d58bd..eb888f6e891 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java @@ -27,16 +27,12 @@ import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.ConstantScoreQuery; -import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.RegexpQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.WildcardQuery; -import org.apache.lucene.search.join.QueryBitSetProducer; -import org.apache.lucene.search.join.ScoreMode; -import org.apache.lucene.search.join.ToParentBlockJoinQuery; import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap; import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; import org.apache.lucene.util.BytesRef; @@ -954,15 +950,4 @@ public class FieldQueryTest extends AbstractTestCase { fq.flatten( query, reader, flatQueries, 1f ); assertCollectionQueries( flatQueries, tq( boost, "A" ) ); } - - public void testFlattenToParentBlockJoinQuery() throws Exception { - initBoost(); - Query childQuery = tq(boost, "a"); - Query query = new ToParentBlockJoinQuery(childQuery, new QueryBitSetProducer(new MatchAllDocsQuery()), ScoreMode.None); - FieldQuery fq = new FieldQuery(query, true, true ); - Set flatQueries = new HashSet<>(); - fq.flatten(query, reader, flatQueries, 1f); - assertCollectionQueries(flatQueries, tq(boost, "a")); - } - } diff --git a/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java index b8fbb255b58..088170dff5e 100644 --- a/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java +++ b/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java @@ -19,6 +19,7 @@ package org.apache.solr.highlight; import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -52,6 +53,10 @@ import org.apache.lucene.search.highlight.QueryTermScorer; import org.apache.lucene.search.highlight.Scorer; import org.apache.lucene.search.highlight.TextFragment; import org.apache.lucene.search.highlight.TokenSources; +import org.apache.lucene.search.highlight.WeightedSpanTerm; +import org.apache.lucene.search.highlight.WeightedSpanTermExtractor; +import org.apache.lucene.search.join.ToChildBlockJoinQuery; +import org.apache.lucene.search.join.ToParentBlockJoinQuery; import org.apache.lucene.search.vectorhighlight.BoundaryScanner; import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; import org.apache.lucene.search.vectorhighlight.FieldQuery; @@ -238,7 +243,12 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf */ protected QueryScorer getSpanQueryScorer(Query query, String fieldName, TokenStream tokenStream, SolrQueryRequest request) { QueryScorer scorer = new QueryScorer(query, - request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false) ? fieldName : null); + request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false) ? fieldName : null) { + @Override + protected WeightedSpanTermExtractor newTermExtractor(String defaultField) { + return new CustomSpanTermExtractor(defaultField); + } + }; scorer.setExpandMultiTermQuery(request.getParams().getBool(HighlightParams.HIGHLIGHT_MULTI_TERM, true)); boolean defaultPayloads = true;//overwritten below @@ -256,6 +266,24 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf return scorer; } + private static class CustomSpanTermExtractor extends WeightedSpanTermExtractor { + public CustomSpanTermExtractor(String defaultField) { + super(defaultField); + } + + @Override + protected void extract(Query query, float boost, Map terms) throws IOException { + // these queries are not supported in lucene highlighting out of the box since 8.0 + if (query instanceof ToParentBlockJoinQuery) { + extract(((ToParentBlockJoinQuery) query).getChildQuery(), boost, terms); + } else if (query instanceof ToChildBlockJoinQuery) { + extract(((ToChildBlockJoinQuery) query).getParentQuery(), boost, terms); + } else { + super.extract(query, boost, terms); + } + } + } + /** * Return a {@link org.apache.lucene.search.highlight.Scorer} suitable for this Query and field. * @param query The current query @@ -469,7 +497,24 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf // FVH cannot process hl.usePhraseHighlighter parameter per-field basis params.getBool(HighlightParams.USE_PHRASE_HIGHLIGHTER, true), // FVH cannot process hl.requireFieldMatch parameter per-field basis - params.getBool(HighlightParams.FIELD_MATCH, false)); + params.getBool(HighlightParams.FIELD_MATCH, false)) { + @Override + public FieldQuery getFieldQuery(Query query, IndexReader reader) throws IOException { + return new FieldQuery(query, reader, phraseHighlight, fieldMatch) { + @Override + protected void flatten(Query sourceQuery, IndexReader reader, Collection flatQueries, float boost) throws IOException { + if (sourceQuery instanceof ToParentBlockJoinQuery) { + Query childQuery = ((ToParentBlockJoinQuery) sourceQuery).getChildQuery(); + if (childQuery != null) { + flatten(childQuery, reader, flatQueries, boost); + } + } else { + super.flatten(sourceQuery, reader, flatQueries, boost); + } + } + }; + } + }; fvh.setPhraseLimit(params.getInt(HighlightParams.PHRASE_LIMIT, SolrHighlighter.DEFAULT_PHRASE_LIMIT)); fvhContainer.fvh = fvh; fvhContainer.fieldQuery = fvh.getFieldQuery(query, reader);