Fixed multi term queries support in postings highlighter for non top-level queries

In #4052 we added support for highlighting multi term queries using the postings highlighter. That worked only for top-level queries though, and not for multi term queries that are nested for instance within a bool query, or filtered query, or a constant score query.

The way we make this work is by walking the query structure and temporarily overriding the query rewrite method with a method that allows for multi terms extraction.

Closes #5102
This commit is contained in:
Luca Cavanna 2014-02-17 13:31:31 +01:00
parent e913b6626f
commit 4e6610a798
3 changed files with 136 additions and 23 deletions

View File

@ -57,12 +57,20 @@ highlighting using the postings highlighter on it:
} }
-------------------------------------------------- --------------------------------------------------
[NOTE]
Note that the postings highlighter is meant to perform simple query terms Note that the postings highlighter is meant to perform simple query terms
highlighting, regardless of their positions. That means that when used for highlighting, regardless of their positions. That means that when used for
instance in combination with a phrase query, it will highlight all the terms instance in combination with a phrase query, it will highlight all the terms
that the query is composed of, regardless of whether they are actually part of that the query is composed of, regardless of whether they are actually part of
a query match, effectively ignoring their positions. a query match, effectively ignoring their positions.
[WARNING]
The postings highlighter does support highlighting of multi term queries, like
prefix queries, wildcard queries and so on. On the other hand, this requires
the queries to be rewritten using a proper
<<query-dsl-multi-term-rewrite,rewrite method>> that supports multi term
extraction, which is a potentially expensive operation.
==== Fast vector highlighter ==== Fast vector highlighter

View File

@ -18,6 +18,7 @@
*/ */
package org.elasticsearch.search.highlight; package org.elasticsearch.search.highlight;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
@ -33,6 +34,8 @@ import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.UnicodeUtil;
import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.Strings; import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.lucene.search.XFilteredQuery;
import org.elasticsearch.common.text.StringText; import org.elasticsearch.common.text.StringText;
import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.fetch.FetchPhaseExecutionException; import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
@ -144,25 +147,17 @@ public class PostingsHighlighter implements Highlighter {
} }
private static Query rewrite(HighlighterContext highlighterContext, IndexReader reader) throws IOException { private static Query rewrite(HighlighterContext highlighterContext, IndexReader reader) throws IOException {
//rewrite is expensive: if the query was already rewritten we try not to rewrite
boolean mustRewrite = !highlighterContext.query.queryRewritten();
Query original = highlighterContext.query.originalQuery(); Query original = highlighterContext.query.originalQuery();
MultiTermQuery originalMultiTermQuery = null; //we walk the query tree and when we encounter multi term queries we need to make sure the rewrite method
MultiTermQuery.RewriteMethod originalRewriteMethod = null; //supports multi term extraction. If not we temporarily override it (and restore it after the rewrite).
if (original instanceof MultiTermQuery) { List<Tuple<MultiTermQuery, MultiTermQuery.RewriteMethod>> modifiedMultiTermQueries = Lists.newArrayList();
originalMultiTermQuery = (MultiTermQuery) original; overrideMultiTermRewriteMethod(original, modifiedMultiTermQueries);
if (!allowsForTermExtraction(originalMultiTermQuery.getRewriteMethod())) {
originalRewriteMethod = originalMultiTermQuery.getRewriteMethod();
originalMultiTermQuery.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(50));
//we need to rewrite anyway if it is a multi term query which was rewritten with the wrong rewrite method
mustRewrite = true;
}
}
if (!mustRewrite) { //rewrite is expensive: if the query was already rewritten we try not to rewrite it again
//return the rewritten query if (highlighterContext.query.queryRewritten() && modifiedMultiTermQueries.size() == 0) {
//return the already rewritten query
return highlighterContext.query.query(); return highlighterContext.query.query();
} }
@ -172,16 +167,46 @@ public class PostingsHighlighter implements Highlighter {
query = rewrittenQuery; query = rewrittenQuery;
} }
if (originalMultiTermQuery != null) { //set back the original rewrite method after the rewrite is done
if (originalRewriteMethod != null) { for (Tuple<MultiTermQuery, MultiTermQuery.RewriteMethod> modifiedMultiTermQuery : modifiedMultiTermQueries) {
//set back the original rewrite method after the rewrite is done modifiedMultiTermQuery.v1().setRewriteMethod(modifiedMultiTermQuery.v2());
originalMultiTermQuery.setRewriteMethod(originalRewriteMethod);
}
} }
return query; return query;
} }
private static void overrideMultiTermRewriteMethod(Query query, List<Tuple<MultiTermQuery, MultiTermQuery.RewriteMethod>> modifiedMultiTermQueries) {
if (query instanceof MultiTermQuery) {
MultiTermQuery originalMultiTermQuery = (MultiTermQuery) query;
if (!allowsForTermExtraction(originalMultiTermQuery.getRewriteMethod())) {
MultiTermQuery.RewriteMethod originalRewriteMethod = originalMultiTermQuery.getRewriteMethod();
originalMultiTermQuery.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(50));
//we need to rewrite anyway if it is a multi term query which was rewritten with the wrong rewrite method
modifiedMultiTermQueries.add(Tuple.tuple(originalMultiTermQuery, originalRewriteMethod));
}
}
if (query instanceof BooleanQuery) {
BooleanQuery booleanQuery = (BooleanQuery) query;
for (BooleanClause booleanClause : booleanQuery) {
overrideMultiTermRewriteMethod(booleanClause.getQuery(), modifiedMultiTermQueries);
}
}
if (query instanceof XFilteredQuery) {
overrideMultiTermRewriteMethod(((XFilteredQuery) query).getQuery(), modifiedMultiTermQueries);
}
if (query instanceof FilteredQuery) {
overrideMultiTermRewriteMethod(((FilteredQuery) query).getQuery(), modifiedMultiTermQueries);
}
if (query instanceof ConstantScoreQuery) {
overrideMultiTermRewriteMethod(((ConstantScoreQuery) query).getQuery(), modifiedMultiTermQueries);
}
}
private static boolean allowsForTermExtraction(MultiTermQuery.RewriteMethod rewriteMethod) { private static boolean allowsForTermExtraction(MultiTermQuery.RewriteMethod rewriteMethod) {
return rewriteMethod instanceof TopTermsRewrite || rewriteMethod instanceof ScoringRewrite; return rewriteMethod instanceof TopTermsRewrite || rewriteMethod instanceof ScoringRewrite;
} }

View File

@ -48,6 +48,7 @@ import java.util.Map;
import static org.elasticsearch.action.search.SearchType.QUERY_THEN_FETCH; import static org.elasticsearch.action.search.SearchType.QUERY_THEN_FETCH;
import static org.elasticsearch.client.Requests.searchRequest; import static org.elasticsearch.client.Requests.searchRequest;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.index.query.FilterBuilders.*;
import static org.elasticsearch.index.query.QueryBuilders.*; import static org.elasticsearch.index.query.QueryBuilders.*;
import static org.elasticsearch.search.builder.SearchSourceBuilder.highlight; import static org.elasticsearch.search.builder.SearchSourceBuilder.highlight;
import static org.elasticsearch.search.builder.SearchSourceBuilder.searchSource; import static org.elasticsearch.search.builder.SearchSourceBuilder.searchSource;
@ -1393,7 +1394,7 @@ public class HighlighterSearchTests extends ElasticsearchIntegrationTest {
.setSource("field4", "a quick fast blue car").get(); .setSource("field4", "a quick fast blue car").get();
refresh(); refresh();
source = searchSource().postFilter(FilterBuilders.typeFilter("type2")).query(matchPhrasePrefixQuery("field3", "fast bro")).from(0).size(60).explain(true) source = searchSource().postFilter(typeFilter("type2")).query(matchPhrasePrefixQuery("field3", "fast bro")).from(0).size(60).explain(true)
.highlight(highlight().field("field3").order("score").preTags("<x>").postTags("</x>")); .highlight(highlight().field("field3").order("score").preTags("<x>").postTags("</x>"));
searchResponse = client().search(searchRequest("test").source(source).searchType(QUERY_THEN_FETCH)).actionGet(); searchResponse = client().search(searchRequest("test").source(source).searchType(QUERY_THEN_FETCH)).actionGet();
@ -1401,7 +1402,7 @@ public class HighlighterSearchTests extends ElasticsearchIntegrationTest {
assertHighlight(searchResponse, 0, "field3", 0, 1, equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog")); assertHighlight(searchResponse, 0, "field3", 0, 1, equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
logger.info("--> highlighting and searching on field4"); logger.info("--> highlighting and searching on field4");
source = searchSource().postFilter(FilterBuilders.typeFilter("type2")).query(matchPhrasePrefixQuery("field4", "the fast bro")).from(0).size(60).explain(true) source = searchSource().postFilter(typeFilter("type2")).query(matchPhrasePrefixQuery("field4", "the fast bro")).from(0).size(60).explain(true)
.highlight(highlight().field("field4").order("score").preTags("<x>").postTags("</x>")); .highlight(highlight().field("field4").order("score").preTags("<x>").postTags("</x>"));
searchResponse = client().search(searchRequest("test").source(source).searchType(QUERY_THEN_FETCH)).actionGet(); searchResponse = client().search(searchRequest("test").source(source).searchType(QUERY_THEN_FETCH)).actionGet();
@ -1409,7 +1410,7 @@ public class HighlighterSearchTests extends ElasticsearchIntegrationTest {
assertHighlight(searchResponse, 1, "field4", 0, 1, equalTo("<x>The quick brown</x> fox jumps over the lazy dog")); assertHighlight(searchResponse, 1, "field4", 0, 1, equalTo("<x>The quick brown</x> fox jumps over the lazy dog"));
logger.info("--> highlighting and searching on field4"); logger.info("--> highlighting and searching on field4");
source = searchSource().postFilter(FilterBuilders.typeFilter("type2")).query(matchPhrasePrefixQuery("field4", "a fast quick blue ca")).from(0).size(60).explain(true) source = searchSource().postFilter(typeFilter("type2")).query(matchPhrasePrefixQuery("field4", "a fast quick blue ca")).from(0).size(60).explain(true)
.highlight(highlight().field("field4").order("score").preTags("<x>").postTags("</x>")); .highlight(highlight().field("field4").order("score").preTags("<x>").postTags("</x>"));
searchResponse = client().search(searchRequest("test").source(source).searchType(QUERY_THEN_FETCH)).actionGet(); searchResponse = client().search(searchRequest("test").source(source).searchType(QUERY_THEN_FETCH)).actionGet();
@ -2419,6 +2420,85 @@ public class HighlighterSearchTests extends ElasticsearchIntegrationTest {
} }
} }
@Test
public void testPostingsHighlighterRegexpQueryWithinConstantScoreQuery() throws Exception {
assertAcked(client().admin().indices().prepareCreate("test").addMapping("type1", type1PostingsffsetsMapping()));
ensureGreen();
client().prepareIndex("test", "type1").setSource("field1", "The photography word will get highlighted").get();
refresh();
logger.info("--> highlighting and searching on field1");
for (String rewriteMethod : REWRITE_METHODS) {
SearchSourceBuilder source = searchSource().query(constantScoreQuery(regexpQuery("field1", "pho[a-z]+").rewrite(rewriteMethod)))
.highlight(highlight().field("field1"));
SearchResponse searchResponse = client().search(searchRequest("test").source(source)
.searchType(randomBoolean() ? SearchType.DFS_QUERY_THEN_FETCH : SearchType.QUERY_THEN_FETCH)).get();
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("The <em>photography</em> word will get highlighted"));
}
}
@Test
public void testPostingsHighlighterMultiTermQueryMultipleLevels() throws Exception {
assertAcked(client().admin().indices().prepareCreate("test").addMapping("type1", type1PostingsffsetsMapping()));
ensureGreen();
client().prepareIndex("test", "type1").setSource("field1", "The photography word will get highlighted").get();
refresh();
logger.info("--> highlighting and searching on field1");
for (String rewriteMethod : REWRITE_METHODS) {
SearchSourceBuilder source = searchSource().query(boolQuery()
.should(constantScoreQuery(FilterBuilders.missingFilter("field1")))
.should(matchQuery("field1", "test"))
.should(filteredQuery(queryString("field1:photo*").rewrite(rewriteMethod), null)))
.highlight(highlight().field("field1"));
SearchResponse searchResponse = client().search(searchRequest("test").source(source)
.searchType(randomBoolean() ? SearchType.DFS_QUERY_THEN_FETCH : SearchType.QUERY_THEN_FETCH)).get();
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("The <em>photography</em> word will get highlighted"));
}
}
@Test
public void testPostingsHighlighterPrefixQueryWithinBooleanQuery() throws Exception {
assertAcked(client().admin().indices().prepareCreate("test").addMapping("type1", type1PostingsffsetsMapping()));
ensureGreen();
client().prepareIndex("test", "type1").setSource("field1", "The photography word will get highlighted").get();
refresh();
logger.info("--> highlighting and searching on field1");
for (String rewriteMethod : REWRITE_METHODS) {
SearchSourceBuilder source = searchSource().query(boolQuery().must(prefixQuery("field1", "photo").rewrite(rewriteMethod)).should(matchQuery("field1", "test").minimumShouldMatch("0")))
.highlight(highlight().field("field1"));
SearchResponse searchResponse = client().search(searchRequest("test").source(source)
.searchType(randomBoolean() ? SearchType.DFS_QUERY_THEN_FETCH : SearchType.QUERY_THEN_FETCH)).get();
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("The <em>photography</em> word will get highlighted"));
}
}
@Test
public void testPostingsHighlighterQueryStringWithinFilteredQuery() throws Exception {
assertAcked(client().admin().indices().prepareCreate("test").addMapping("type1", type1PostingsffsetsMapping()));
ensureGreen();
client().prepareIndex("test", "type1").setSource("field1", "The photography word will get highlighted").get();
refresh();
logger.info("--> highlighting and searching on field1");
for (String rewriteMethod : REWRITE_METHODS) {
SearchSourceBuilder source = searchSource().query(filteredQuery(queryString("field1:photo*").rewrite(rewriteMethod), missingFilter("field_null")))
.highlight(highlight().field("field1"));
SearchResponse searchResponse = client().search(searchRequest("test").source(source)
.searchType(randomBoolean() ? SearchType.DFS_QUERY_THEN_FETCH : SearchType.QUERY_THEN_FETCH)).get();
assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("The <em>photography</em> word will get highlighted"));
}
}
@Test @Test
@Slow @Slow
public void testPostingsHighlighterManyDocs() throws Exception { public void testPostingsHighlighterManyDocs() throws Exception {