percolator: Add support for MatchNoDocsQuery in query terms extract service

Previously, query extraction would be aborted for this query and the percolator query would be marked as unknown.
As a result, these queries always needed to be evaluated by the memory index at search time.
By adding support for this query many more percolator query candidate hits can skip the expensive memory index verification step. For example, the `match` query parser returns a MatchNoDocsQuery if the query terms are removed by text analysis (e.g. when the query text contained only stop words).
This commit is contained in:
Martijn van Groningen 2016-05-20 17:28:00 +02:00
parent 7cf758943e
commit c1a0929123
3 changed files with 52 additions and 11 deletions

View File

@ -37,19 +37,16 @@ import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
import org.apache.lucene.search.spans.SpanContainingQuery;
import org.apache.lucene.search.spans.SpanFirstQuery; import org.apache.lucene.search.spans.SpanFirstQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery; import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWithinQuery;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.common.logging.LoggerMessageFormat; import org.elasticsearch.common.logging.LoggerMessageFormat;
import org.elasticsearch.common.lucene.search.MatchNoDocsQuery;
import org.elasticsearch.index.mapper.ParseContext; import org.elasticsearch.index.mapper.ParseContext;
import java.io.IOException; import java.io.IOException;
@ -75,8 +72,8 @@ public final class ExtractQueryTermsService {
* @param query The query to extract terms from * @param query The query to extract terms from
* @param document The document to add the extracted terms to * @param document The document to add the extracted terms to
* @param queryTermsFieldField The field in the document holding the extracted terms * @param queryTermsFieldField The field in the document holding the extracted terms
* @param unknownQueryField The field used to mark a document that not all query terms could be extracted. For example * @param unknownQueryField The field used to mark a document that not all query terms could be extracted.
* the query contained an unsupported query (e.g. WildcardQuery). * For example the query contained an unsupported query (e.g. WildcardQuery).
* @param fieldType The field type for the query metadata field * @param fieldType The field type for the query metadata field
*/ */
public static void extractQueryTerms(Query query, ParseContext.Document document, String queryTermsFieldField, String unknownQueryField, FieldType fieldType) { public static void extractQueryTerms(Query query, ParseContext.Document document, String queryTermsFieldField, String unknownQueryField, FieldType fieldType) {
@ -106,7 +103,10 @@ public final class ExtractQueryTermsService {
* an UnsupportedQueryException is thrown. * an UnsupportedQueryException is thrown.
*/ */
static Set<Term> extractQueryTerms(Query query) { static Set<Term> extractQueryTerms(Query query) {
if (query instanceof TermQuery) { if (query instanceof MatchNoDocsQuery) {
// no terms to extract as this query matches no docs
return Collections.emptySet();
} else if (query instanceof TermQuery) {
return Collections.singleton(((TermQuery) query).getTerm()); return Collections.singleton(((TermQuery) query).getTerm());
} else if (query instanceof TermsQuery) { } else if (query instanceof TermsQuery) {
Set<Term> terms = new HashSet<>(); Set<Term> terms = new HashSet<>();

View File

@ -41,6 +41,7 @@ import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.lucene.search.MatchNoDocsQuery;
import org.elasticsearch.index.mapper.ParseContext; import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTestCase;
@ -296,6 +297,23 @@ public class ExtractQueryTermsServiceTests extends ESTestCase {
assertTermsEqual(terms, spanTermQuery1.getTerm()); assertTermsEqual(terms, spanTermQuery1.getTerm());
} }
/**
 * Checks the terms extracted from a MatchNoDocsQuery, both on its own and
 * combined with a term query inside a boolean query.
 */
public void testExtractQueryMetadata_matchNoDocsQuery() {
    final String reason = "sometimes there is no reason at all";
    final Term fieldTerm = new Term("field", "value");

    // Standalone MatchNoDocsQuery: expect an empty term set.
    Set<Term> extracted = ExtractQueryTermsService.extractQueryTerms(new MatchNoDocsQuery(reason));
    assertEquals(0, extracted.size());

    // Both clauses required: expect an empty term set.
    BooleanQuery.Builder requiredClauses = new BooleanQuery.Builder();
    requiredClauses.add(new TermQuery(fieldTerm), BooleanClause.Occur.MUST);
    requiredClauses.add(new MatchNoDocsQuery(reason), BooleanClause.Occur.MUST);
    extracted = ExtractQueryTermsService.extractQueryTerms(requiredClauses.build());
    assertEquals(0, extracted.size());

    // Both clauses optional: expect only the term query's term.
    BooleanQuery.Builder optionalClauses = new BooleanQuery.Builder();
    optionalClauses.add(new TermQuery(fieldTerm), BooleanClause.Occur.SHOULD);
    optionalClauses.add(new MatchNoDocsQuery(reason), BooleanClause.Occur.SHOULD);
    extracted = ExtractQueryTermsService.extractQueryTerms(optionalClauses.build());
    assertTermsEqual(extracted, new Term("field", "value"));
}
public void testExtractQueryMetadata_unsupportedQuery() { public void testExtractQueryMetadata_unsupportedQuery() {
TermRangeQuery termRangeQuery = new TermRangeQuery("_field", null, null, true, false); TermRangeQuery termRangeQuery = new TermRangeQuery("_field", null, null, true, false);

View File

@ -334,3 +334,26 @@ At search time, the document specified in the request gets parsed into a Lucene
temporary Lucene index. This in-memory index can just hold this one document and it is optimized for that. After this temporary Lucene index. This in-memory index can just hold this one document and it is optimized for that. After this
a special query is built based on the terms in the in-memory index that selects candidate percolator queries based on a special query is built based on the terms in the in-memory index that selects candidate percolator queries based on
their indexed query terms. These queries are then evaluated by the in-memory index if they actually match. their indexed query terms. These queries are then evaluated by the in-memory index if they actually match.
Selecting candidate percolator query matches is an important performance optimization during the execution
of the `percolate` query, as it can significantly reduce the number of candidate matches the in-memory index needs to
evaluate. The `percolate` query can do this because, during indexing of the percolator queries, the query
terms are extracted and indexed with the percolator query. Unfortunately, the percolator cannot extract terms from
all queries (for example the `wildcard` or `geo_shape` query) and, as a result, in certain cases the percolator
can't apply this candidate selection optimization (for example if an unsupported query is defined in a required clause of a boolean query
or the unsupported query is the only query in the percolator document). These queries are marked by the percolator and
can be found by running the following search:
[source,js]
--------------------------------------------------
curl -XGET "http://localhost:9200/_search" -d'
{
"query": {
"term" : {
"query.unknown_query" : ""
}
}
}'
--------------------------------------------------
NOTE: The above example assumes that there is a `query` field of type `percolator` in the mappings.