LUCENE-8535: Drop out of the box Block-Join highlight support

Highlighter doesn't support ToParent and ToChildBlockJoinQuery out of the
box anymore. In order to highlight on Block-Join Queries a custom WeightedSpanTermExtractor
should be used.
This commit is contained in:
Simon Willnauer 2018-10-18 10:15:57 +02:00
parent 167c65afad
commit 5a4fd86cce
10 changed files with 90 additions and 102 deletions

View File

@ -123,6 +123,10 @@ Changes in Runtime Behavior
* LUCENE-8505: IndexWriter#addIndices will now fail if the target index is sorted but
the candidate is not. (Jim Ferenczi)
* LUCENE-8535: Highlighter and FVH don't support ToParent and ToChildBlockJoinQuery out of the
box anymore. In order to highlight on Block-Join Queries a custom WeightedSpanTermExtractor / FieldQuery
should be used. (Simon Willnauer, Jim Ferenczi, Julie Tibshirani)
New Features
* LUCENE-8340: LongPoint#newDistanceQuery may be used to boost scores based on

View File

@ -140,3 +140,8 @@ a LowerCaseFilter
CharTokenizer now only performs tokenization. To perform any type of filtering
use a TokenFilter chain as you would with any other Tokenizer.
## Highlighter and FastVectorHighlighter no longer support ToParent/ToChildBlockJoinQuery
Both Highlighter and FastVectorHighlighter need a custom WeightedSpanTermExtractor or FieldQuery respectively
in order to support ToParent/ToChildBlockJoinQuery.

View File

@ -31,14 +31,12 @@
<path id="classpath">
<pathelement path="${memory.jar}"/>
<pathelement path="${queries.jar}"/>
<pathelement path="${join.jar}"/>
<path refid="base.classpath"/>
</path>
<path id="test.classpath">
<pathelement path="${memory.jar}"/>
<pathelement path="${queries.jar}"/>
<pathelement path="${join.jar}"/>
<pathelement path="${analyzers-common.jar}"/>
<path refid="test.base.classpath"/>
</path>

View File

@ -229,8 +229,7 @@ public class QueryScorer implements Scorer {
}
protected WeightedSpanTermExtractor newTermExtractor(String defaultField) {
return defaultField == null ? new WeightedSpanTermExtractor()
: new WeightedSpanTermExtractor(defaultField);
return new WeightedSpanTermExtractor(defaultField);
}
/*

View File

@ -54,8 +54,6 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.join.ToChildBlockJoinQuery;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
import org.apache.lucene.search.spans.SpanFirstQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
@ -71,6 +69,31 @@ import org.apache.lucene.util.IOUtils;
/**
* Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether
* {@link Term}s from the {@link Query} are contained in a supplied {@link TokenStream}.
*
* In order to support additional, by default unsupported queries, subclasses can override
* {@link #extract(Query, float, Map)} for extracting wrapped or delegate queries and
* {@link #extractUnknownQuery(Query, Map)} to process custom leaf queries:
* <pre>
* <code>
* WeightedSpanTermExtractor extractor = new WeightedSpanTermExtractor() {
* protected void extract(Query query, float boost, Map&lt;String, WeightedSpanTerm&gt; terms) throws IOException {
* if (query instanceof QueryWrapper) {
* extract(((QueryWrapper)query).getQuery(), boost, terms);
* } else {
* super.extract(query, boost, terms);
* }
* }
*
* protected void extractUnknownQuery(Query query, Map&lt;String, WeightedSpanTerm&gt; terms) throws IOException {
* if (query instanceof CustomTermQuery) {
* Term term = ((CustomTermQuery) query).getTerm();
* terms.put(term.field(), new WeightedSpanTerm(1, term.text()));
* }
* }
* };
* </code>
* </pre>
*/
public class WeightedSpanTermExtractor {
@ -85,12 +108,11 @@ public class WeightedSpanTermExtractor {
private LeafReader internalReader = null;
public WeightedSpanTermExtractor() {
this(null);
}
public WeightedSpanTermExtractor(String defaultField) {
if (defaultField != null) {
this.defaultField = defaultField;
}
this.defaultField = defaultField;
}
/**
@ -154,10 +176,6 @@ public class WeightedSpanTermExtractor {
for (Query clause : ((DisjunctionMaxQuery) query)) {
extract(clause, boost, terms);
}
} else if (query instanceof ToParentBlockJoinQuery) {
extract(((ToParentBlockJoinQuery) query).getChildQuery(), boost, terms);
} else if (query instanceof ToChildBlockJoinQuery) {
extract(((ToChildBlockJoinQuery) query).getParentQuery(), boost, terms);
} else if (query instanceof MultiPhraseQuery) {
final MultiPhraseQuery mpq = (MultiPhraseQuery) query;
final Term[][] termArrays = mpq.getTermArrays();

View File

@ -31,8 +31,8 @@ import org.apache.lucene.search.highlight.Encoder;
public class FastVectorHighlighter {
public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true;
public static final boolean DEFAULT_FIELD_MATCH = true;
private final boolean phraseHighlight;
private final boolean fieldMatch;
protected final boolean phraseHighlight;
protected final boolean fieldMatch;
private final FragListBuilder fragListBuilder;
private final FragmentsBuilder fragmentsBuilder;
private int phraseLimit = Integer.MAX_VALUE;
@ -80,7 +80,7 @@ public class FastVectorHighlighter {
// TODO: should we deprecate this?
// because if there is no reader, then we cannot rewrite MTQ.
try {
return new FieldQuery( query, null, phraseHighlight, fieldMatch );
return getFieldQuery(query, null);
} catch (IOException e) {
// should never be thrown when reader is null
throw new RuntimeException (e);

View File

@ -38,7 +38,6 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
/**
@ -62,7 +61,7 @@ public class FieldQuery {
// The maximum number of different matching terms accumulated from any one MultiTermQuery
private static final int MAX_MTQ_TERMS = 1024;
FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
public FieldQuery(Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch) throws IOException {
this.fieldMatch = fieldMatch;
Set<Query> flatQueries = new LinkedHashSet<>();
flatten( query, reader, flatQueries, 1f );
@ -95,7 +94,7 @@ public class FieldQuery {
this (query, null, phraseHighlight, fieldMatch);
}
void flatten( Query sourceQuery, IndexReader reader, Collection<Query> flatQueries, float boost ) throws IOException{
protected void flatten( Query sourceQuery, IndexReader reader, Collection<Query> flatQueries, float boost ) throws IOException {
while (sourceQuery instanceof BoostQuery) {
BoostQuery bq = (BoostQuery) sourceQuery;
sourceQuery = bq.getQuery();
@ -141,15 +140,10 @@ public class FieldQuery {
flatten( q, reader, flatQueries, boost);
}
} else if (sourceQuery instanceof FunctionScoreQuery) {
final Query q = ((FunctionScoreQuery)sourceQuery).getWrappedQuery();
final Query q = ((FunctionScoreQuery) sourceQuery).getWrappedQuery();
if (q != null) {
flatten(q, reader, flatQueries, boost);
}
} else if (sourceQuery instanceof ToParentBlockJoinQuery) {
Query childQuery = ((ToParentBlockJoinQuery) sourceQuery).getChildQuery();
if (childQuery != null) {
flatten(childQuery, reader, flatQueries, boost);
}
} else if (reader != null) {
Query query = sourceQuery;
Query rewritten;

View File

@ -81,11 +81,6 @@ import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
import org.apache.lucene.search.join.BitSetProducer;
import org.apache.lucene.search.join.QueryBitSetProducer;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.lucene.search.join.ToChildBlockJoinQuery;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
@ -628,61 +623,6 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
}
public void testToParentBlockJoinQuery() throws Exception {
BitSetProducer parentFilter = new QueryBitSetProducer(
new TermQuery(new Term(FIELD_NAME, "parent")));
query = new ToParentBlockJoinQuery(new TermQuery(new Term(FIELD_NAME, "child")),
parentFilter, ScoreMode.None);
searcher = newSearcher(reader);
hits = searcher.search(query, 100);
int maxNumFragmentsRequired = 2;
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits.value; i++) {
String text = "child document";
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
}
assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
numHighlights == 1);
}
public void testToChildBlockJoinQuery() throws Exception {
BitSetProducer parentFilter = new QueryBitSetProducer(
new TermQuery(new Term(FIELD_NAME, "parent")));
BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
booleanQuery.add(new ToChildBlockJoinQuery(new TermQuery(
new Term(FIELD_NAME, "parent")), parentFilter), Occur.MUST);
booleanQuery.add(new TermQuery(new Term(FIELD_NAME, "child")), Occur.MUST);
query = booleanQuery.build();
searcher = newSearcher(reader);
hits = searcher.search(query, 100);
int maxNumFragmentsRequired = 2;
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits.value; i++) {
String text = "parent document";
final int docId = hits.scoreDocs[i].doc;
TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
}
assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
numHighlights == 1);
}
public void testSimpleQueryScorerPhraseHighlighting2() throws Exception {
PhraseQuery phraseQuery = new PhraseQuery(5, FIELD_NAME, "text", "piece", "long");
doSearching(phraseQuery);

View File

@ -27,16 +27,12 @@ import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.join.QueryBitSetProducer;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap;
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
import org.apache.lucene.util.BytesRef;
@ -954,15 +950,4 @@ public class FieldQueryTest extends AbstractTestCase {
fq.flatten( query, reader, flatQueries, 1f );
assertCollectionQueries( flatQueries, tq( boost, "A" ) );
}
public void testFlattenToParentBlockJoinQuery() throws Exception {
initBoost();
Query childQuery = tq(boost, "a");
Query query = new ToParentBlockJoinQuery(childQuery, new QueryBitSetProducer(new MatchAllDocsQuery()), ScoreMode.None);
FieldQuery fq = new FieldQuery(query, true, true );
Set<Query> flatQueries = new HashSet<>();
fq.flatten(query, reader, flatQueries, 1f);
assertCollectionQueries(flatQueries, tq(boost, "a"));
}
}

View File

@ -19,6 +19,7 @@ package org.apache.solr.highlight;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
@ -52,6 +53,10 @@ import org.apache.lucene.search.highlight.QueryTermScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
import org.apache.lucene.search.join.ToChildBlockJoinQuery;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
@ -238,7 +243,12 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
*/
protected QueryScorer getSpanQueryScorer(Query query, String fieldName, TokenStream tokenStream, SolrQueryRequest request) {
QueryScorer scorer = new QueryScorer(query,
request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false) ? fieldName : null);
request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false) ? fieldName : null) {
@Override
protected WeightedSpanTermExtractor newTermExtractor(String defaultField) {
return new CustomSpanTermExtractor(defaultField);
}
};
scorer.setExpandMultiTermQuery(request.getParams().getBool(HighlightParams.HIGHLIGHT_MULTI_TERM, true));
boolean defaultPayloads = true;//overwritten below
@ -256,6 +266,24 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
return scorer;
}
/**
 * Span term extractor that additionally unwraps block-join queries, which the
 * stock {@link WeightedSpanTermExtractor} does not handle on its own.
 */
private static class CustomSpanTermExtractor extends WeightedSpanTermExtractor {

  /**
   * @param defaultField field name handed straight to the superclass constructor
   */
  public CustomSpanTermExtractor(String defaultField) {
    super(defaultField);
  }

  /**
   * Unwraps {@link ToParentBlockJoinQuery} to its child query and
   * {@link ToChildBlockJoinQuery} to its parent query, then extracts terms from
   * the unwrapped query; every other query type is delegated to the superclass.
   */
  @Override
  protected void extract(Query query, float boost, Map<String, WeightedSpanTerm> terms) throws IOException {
    // Block-join wrappers are handled here; recurse into the query they wrap.
    if (query instanceof ToParentBlockJoinQuery) {
      final ToParentBlockJoinQuery toParent = (ToParentBlockJoinQuery) query;
      extract(toParent.getChildQuery(), boost, terms);
      return;
    }
    if (query instanceof ToChildBlockJoinQuery) {
      final ToChildBlockJoinQuery toChild = (ToChildBlockJoinQuery) query;
      extract(toChild.getParentQuery(), boost, terms);
      return;
    }
    // Anything else goes through the default extraction logic.
    super.extract(query, boost, terms);
  }
}
/**
* Return a {@link org.apache.lucene.search.highlight.Scorer} suitable for this Query and field.
* @param query The current query
@ -469,7 +497,24 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf
// FVH cannot process the hl.usePhraseHighlighter parameter on a per-field basis
params.getBool(HighlightParams.USE_PHRASE_HIGHLIGHTER, true),
// FVH cannot process the hl.requireFieldMatch parameter on a per-field basis
params.getBool(HighlightParams.FIELD_MATCH, false));
params.getBool(HighlightParams.FIELD_MATCH, false)) {
@Override
public FieldQuery getFieldQuery(Query query, IndexReader reader) throws IOException {
return new FieldQuery(query, reader, phraseHighlight, fieldMatch) {
@Override
protected void flatten(Query sourceQuery, IndexReader reader, Collection<Query> flatQueries, float boost) throws IOException {
if (sourceQuery instanceof ToParentBlockJoinQuery) {
Query childQuery = ((ToParentBlockJoinQuery) sourceQuery).getChildQuery();
if (childQuery != null) {
flatten(childQuery, reader, flatQueries, boost);
}
} else {
super.flatten(sourceQuery, reader, flatQueries, boost);
}
}
};
}
};
fvh.setPhraseLimit(params.getInt(HighlightParams.PHRASE_LIMIT, SolrHighlighter.DEFAULT_PHRASE_LIMIT));
fvhContainer.fvh = fvh;
fvhContainer.fieldQuery = fvh.getFieldQuery(query, reader);