SOLR-13890: Add "top-level" DV "terms" implementation (#1151)

{!terms} queries have a docValues-based implementation that uses per-segment DV structures.  This does well with a small to moderate (a few hundred) number of query terms, but doesn't scale well beyond that due to repetitive seeks done on each segment.

This commit introduces an implementation that uses a "top-level" docValues structure, which scales much better to very large {!terms} queries (many hundreds, thousands of terms).
This commit is contained in:
Jason Gerlowski 2020-01-13 06:43:21 -05:00 committed by GitHub
parent e5cff170b7
commit 6e4756fd48
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 312 additions and 14 deletions

View File

@ -193,6 +193,11 @@ Improvements
* SOLR-14154: Return correct isolation level when retrieving it from the SQL Connection (Nick Vercammen, Kevin Risden)
* SOLR-13890: Add "top-level" DV implementation for {!terms} queries. This approach tends to be more efficient for
queries with very large numbers of terms. The new implementation is used by default for method=docValuesTermsFilter
terms queries that are searching 500 or more terms. Users wishing to ignore this heuristic can choose the
docValuesTermsFilterTopLevel or docValuesTermsFilterPerSegment methods directly. (Jason Gerlowski, Joel Bernstein, David Smiley)
Optimizations
---------------------
(No changes)

View File

@ -16,29 +16,31 @@
*/
package org.apache.solr.search;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DocValuesTermsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.PointField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Finds documents whose specified field has any of the specified values. It's like
@ -52,6 +54,7 @@ import org.apache.solr.schema.PointField;
* Note that if no values are specified then the query matches no documents.
*/
public class TermsQParserPlugin extends QParserPlugin {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public static final String NAME = "terms";
/** The separator to use in the underlying suggester */
@ -88,10 +91,29 @@ public class TermsQParserPlugin extends QParserPlugin {
docValuesTermsFilter {//on 4x this is FieldCacheTermsFilter but we use the 5x name any way
@Override
Query makeFilter(String fname, BytesRef[] byteRefs) {
return new DocValuesTermsQuery(fname, byteRefs);//constant scores
// TODO Further tune this heuristic number
return (byteRefs.length > 700) ? docValuesTermsFilterTopLevel.makeFilter(fname, byteRefs) : docValuesTermsFilterPerSegment.makeFilter(fname, byteRefs);
}
},
docValuesTermsFilterTopLevel {
@Override
Query makeFilter(String fname, BytesRef[] byteRefs) {
return disableCacheByDefault(new TopLevelDocValuesTermsQuery(fname, byteRefs));
}
},
docValuesTermsFilterPerSegment {
@Override
Query makeFilter(String fname, BytesRef[] byteRefs) {
return disableCacheByDefault(new DocValuesTermsQuery(fname, byteRefs));
}
};
private static Query disableCacheByDefault(Query q) {
final WrappedQuery wrappedQuery = new WrappedQuery(q);
wrappedQuery.setCache(false);
return wrappedQuery;
}
abstract Query makeFilter(String fname, BytesRef[] byteRefs);
}
@ -101,7 +123,7 @@ public class TermsQParserPlugin extends QParserPlugin {
@Override
public Query parse() throws SyntaxError {
String fname = localParams.get(QueryParsing.F);
FieldType ft = req.getSchema().getFieldTypeNoEx(fname);
FieldType ft = req.getSchema().getFieldType(fname);
String separator = localParams.get(SEPARATOR, ",");
String qstr = localParams.get(QueryParsing.V);//never null
Method method = Method.valueOf(localParams.get(METHOD, Method.termsFilter.name()));
@ -119,7 +141,7 @@ public class TermsQParserPlugin extends QParserPlugin {
if (ft.isPointField()) {
if (localParams.get(METHOD) != null) {
throw new IllegalArgumentException(
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
String.format(Locale.ROOT, "Method '%s' not supported in TermsQParser when using PointFields", localParams.get(METHOD)));
}
return ((PointField)ft).getSetQuery(this, req.getSchema().getField(fname), Arrays.asList(splitVals));
@ -142,4 +164,100 @@ public class TermsQParserPlugin extends QParserPlugin {
}
};
}
private static class TopLevelDocValuesTermsQuery extends DocValuesTermsQuery {
private final String fieldName;
private SortedSetDocValues topLevelDocValues;
private LongBitSet topLevelTermOrdinals;
private boolean matchesAtLeastOneTerm = false;
public TopLevelDocValuesTermsQuery(String field, BytesRef... terms) {
super(field, terms);
this.fieldName = field;
}
public Weight createWeight(IndexSearcher searcher, final ScoreMode scoreMode, float boost) throws IOException {
if (! (searcher instanceof SolrIndexSearcher)) {
log.debug("Falling back to DocValuesTermsQuery because searcher [{}] is not the required SolrIndexSearcher", searcher);
return super.createWeight(searcher, scoreMode, boost);
}
topLevelDocValues = DocValues.getSortedSet(((SolrIndexSearcher)searcher).getSlowAtomicReader(), fieldName);
topLevelTermOrdinals = new LongBitSet(topLevelDocValues.getValueCount());
PrefixCodedTerms.TermIterator iterator = getTerms().iterator();
long lastTermOrdFound = 0;
for(BytesRef term = iterator.next(); term != null; term = iterator.next()) {
long currentTermOrd = lookupTerm(topLevelDocValues, term, lastTermOrdFound);
if (currentTermOrd >= 0L) {
matchesAtLeastOneTerm = true;
topLevelTermOrdinals.set(currentTermOrd);
lastTermOrdFound = currentTermOrd;
}
}
return new ConstantScoreWeight(this, boost) {
public Scorer scorer(LeafReaderContext context) throws IOException {
if (! matchesAtLeastOneTerm) {
return null;
}
SortedSetDocValues segmentDocValues = context.reader().getSortedSetDocValues(fieldName);
if (segmentDocValues == null) {
return null;
}
final int docBase = context.docBase;
return new ConstantScoreScorer(this, this.score(), scoreMode, new TwoPhaseIterator(segmentDocValues) {
public boolean matches() throws IOException {
topLevelDocValues.advanceExact(docBase + approximation.docID());
for(long ord = topLevelDocValues.nextOrd(); ord != -1L; ord = topLevelDocValues.nextOrd()) {
if (topLevelTermOrdinals.get(ord)) {
return true;
}
}
return false;
}
public float matchCost() {
return 10.0F;
}
});
}
public boolean isCacheable(LeafReaderContext ctx) {
return DocValues.isCacheable(ctx, new String[]{fieldName});
}
};
}
/*
* Same binary-search based implementation as SortedSetDocValues.lookupTerm(BytesRef), but with an
* optimization to narrow the search space where possible by providing a startOrd instead of begining each search
* at 0.
*/
private long lookupTerm(SortedSetDocValues docValues, BytesRef key, long startOrd) throws IOException {
long low = startOrd;
long high = docValues.getValueCount()-1;
while (low <= high) {
long mid = (low + high) >>> 1;
final BytesRef term = docValues.lookupOrd(mid);
int cmp = term.compareTo(key);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
return mid; // key found
}
}
return -(low + 1); // key not found.
}
}
}

View File

@ -0,0 +1,163 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.junit.BeforeClass;
import org.junit.Test;
public class TestTermsQParserPlugin extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig.xml", "schema.xml");
assertU(adoc("id","1", "author_s", "Lev Grossman", "t_title", "The Magicians", "cat_s", "fantasy", "pubyear_i", "2009"));
assertU(adoc("id", "2", "author_s", "Robert Jordan", "t_title", "The Eye of the World", "cat_s", "fantasy", "cat_s", "childrens", "pubyear_i", "1990"));
assertU(adoc("id", "3", "author_s", "Robert Jordan", "t_title", "The Great Hunt", "cat_s", "fantasy", "cat_s", "childrens", "pubyear_i", "1990"));
assertU(adoc("id", "4", "author_s", "N.K. Jemisin", "t_title", "The Fifth Season", "cat_s", "fantasy", "pubyear_i", "2015"));
assertU(commit());
assertU(adoc("id", "5", "author_s", "Ursula K. Le Guin", "t_title", "The Dispossessed", "cat_s", "scifi", "pubyear_i", "1974"));
assertU(adoc("id", "6", "author_s", "Ursula K. Le Guin", "t_title", "The Left Hand of Darkness", "cat_s", "scifi", "pubyear_i", "1969"));
assertU(adoc("id", "7", "author_s", "Isaac Asimov", "t_title", "Foundation", "cat_s", "scifi", "pubyear_i", "1951"));
assertU(commit());
}
@Test
public void testTextTermsQuery() {
// Single term value
ModifiableSolrParams params = new ModifiableSolrParams();
params.add("q", "{!terms f=t_title}left");
params.add("sort", "id asc");
assertQ(req(params, "indent", "on"), "*[count(//doc)=1]",
"//result/doc[1]/str[@name='id'][.='6']"
);
// Multiple term values
params = new ModifiableSolrParams();
params.add("q", "{!terms f=t_title}left,hunt");
params.add("sort", "id asc");
assertQ(req(params, "indent", "on"), "*[count(//doc)=2]",
"//result/doc[1]/str[@name='id'][.='3']",
"//result/doc[2]/str[@name='id'][.='6']"
);
}
@Test
public void testTermsUsingNonDefaultSeparator() {
ModifiableSolrParams params = new ModifiableSolrParams();
params.add("q", "{!terms f=cat_s separator=|}childrens|scifi");
params.add("sort", "id asc");
assertQ(req(params, "indent", "on"), "*[count(//doc)=5]",
"//result/doc[1]/str[@name='id'][.='2']",
"//result/doc[2]/str[@name='id'][.='3']",
"//result/doc[3]/str[@name='id'][.='5']",
"//result/doc[4]/str[@name='id'][.='6']",
"//result/doc[5]/str[@name='id'][.='7']"
);
}
class TermsParams {
public String method;
public boolean cache;
public TermsParams(String method, boolean cache) {
this.method = method;
this.cache = cache;
}
public String buildQuery(String fieldName, String commaDelimitedTerms) {
return "{!terms f=" + fieldName + " method=" + method + " cache=" + cache + "}" + commaDelimitedTerms;
}
}
@Test
public void testTermsMethodEquivalency() {
// Run queries with a variety of 'method' and postfilter options.
final TermsParams[] methods = new TermsParams[] {
new TermsParams("termsFilter", true),
new TermsParams("termsFilter", false),
new TermsParams("booleanQuery", true),
new TermsParams("booleanQuery", false),
new TermsParams("automaton", true),
new TermsParams("automaton", false),
new TermsParams("docValuesTermsFilter", true),
new TermsParams("docValuesTermsFilter", false),
new TermsParams("docValuesTermsFilterTopLevel", true),
new TermsParams("docValuesTermsFilterTopLevel", false),
new TermsParams("docValuesTermsFilterPerSegment", true),
new TermsParams("docValuesTermsFilterPerSegment", false)
};
for (TermsParams method : methods) {
// Single-valued field, single term value
ModifiableSolrParams params = new ModifiableSolrParams();
params.add("q", method.buildQuery("author_s", "Robert Jordan"));
params.add("sort", "id asc");
assertQ(req(params, "indent", "on"), "*[count(//doc)=2]",
"//result/doc[1]/str[@name='id'][.='2']",
"//result/doc[2]/str[@name='id'][.='3']"
);
// Single-valued field, multiple term values
params = new ModifiableSolrParams();
params.add("q", method.buildQuery("author_s", "Robert Jordan,Isaac Asimov"));
params.add("sort", "id asc");
assertQ(req(params, "indent", "on"), "*[count(//doc)=3]",
"//result/doc[1]/str[@name='id'][.='2']",
"//result/doc[2]/str[@name='id'][.='3']",
"//result/doc[3]/str[@name='id'][.='7']"
);
// Multi-valued field, single term value
params = new ModifiableSolrParams();
params.add("q", method.buildQuery("cat_s", "childrens"));
params.add("sort", "id asc");
assertQ(req(params, "indent", "on"), "*[count(//doc)=2]",
"//result/doc[1]/str[@name='id'][.='2']",
"//result/doc[2]/str[@name='id'][.='3']"
);
// Multi-valued field, multiple term values
params = new ModifiableSolrParams();
params.add("q", method.buildQuery("cat_s", "childrens,scifi"));
params.add("sort", "id asc");
assertQ(req(params, "indent", "on"), "*[count(//doc)=5]",
"//result/doc[1]/str[@name='id'][.='2']",
"//result/doc[2]/str[@name='id'][.='3']",
"//result/doc[3]/str[@name='id'][.='5']",
"//result/doc[4]/str[@name='id'][.='6']",
"//result/doc[5]/str[@name='id'][.='7']"
);
// Numeric field
params = new ModifiableSolrParams();
params.add("q", method.buildQuery("pubyear_i", "2009"));
params.add("sort", "id asc");
// Test schema randomizes between Trie and Point. "terms" is supported for "trie" but not "Point"
final String numericFieldType = System.getProperty("solr.tests.IntegerFieldType");
if (numericFieldType.contains("Point")) {
assertQEx("Expected 'terms' query on PointField to fail", req(params, "indent", "on"), 400);
} else {
assertQ(req(params, "indent", "on"), "*[count(//doc)=1]", "//result/doc[1]/str[@name='id'][.='1']");
}
}
}
}

View File

@ -188,7 +188,7 @@ A list of queries that *must not* appear in matching documents.
A list of queries *should* appear in matching documents. For a BooleanQuery with no `must` queries, one or more `should` queries must match a document for the BooleanQuery to match.
`filter`::
A list of queries that *must* appear in matching documents. However, unlike `must`, the score of filter queries is ignored. Also, these queries are cached in filter cache. To avoid caching add either `cache=false` as local parameter, or `"cache":"false"` property to underneath Query DLS Object.
A list of queries that *must* appear in matching documents. However, unlike `must`, the score of filter queries is ignored. Also, these queries are cached in the filter cache. To avoid caching, add either `cache=false` as a local parameter, or a `"cache":"false"` property to the underlying Query DSL object.
*Examples*
@ -1031,7 +1031,19 @@ The field on which to search. This parameter is required.
Separator to use when parsing the input. If set to " " (a single blank space), will trim additional white space from the input terms. Defaults to a comma (`,`).
`method`::
The internal query-building implementation: `termsFilter`, `booleanQuery`, `automaton`, or `docValuesTermsFilter`. Defaults to `termsFilter`.
An optional parameter used to determine which of several query implementations should be used by Solr. Options are restricted to: `termsFilter`, `booleanQuery`, `automaton`, `docValuesTermsFilterPerSegment`, `docValuesTermsFilterTopLevel` or `docValuesTermsFilter`. If unspecified, the default value is `termsFilter`. Each implementation has its own performance characteristics, and users are encouraged to experiment to determine which implementation is most performant for their use-case. Heuristics are given below.
+
`booleanQuery` creates a `BooleanQuery` representing the request. Scales well with index size, but poorly with the number of terms being searched for.
+
`termsFilter` is the default `method`. Uses a `BooleanQuery` or a `TermInSetQuery` depending on the number of terms. Scales well with index size, but only moderately with the number of query terms.
+
`docValuesTermsFilter` can only be used on fields with docValues data. The `cache` parameter is false by default. Chooses between the `docValuesTermsFilterTopLevel` and `docValuesTermsFilterPerSegment` methods using the number of query terms as a rough heuristic. Users should typically use this method instead of using `docValuesTermsFilterTopLevel` or `docValuesTermsFilterPerSegment` directly, unless they've done performance testing to validate one of the methods on queries of all sizes. Depending on the implementation picked, this method may rely on expensive data structures which are lazily populated after each commit. If you commit frequently and your use-case can tolerate a static warming query, consider adding one to `solrconfig.xml` so that this work is done as a part of the commit itself and not attached directly to user requests.
+
`docValuesTermsFilterTopLevel` can only be used on fields with docValues data. The `cache` parameter is false by default. Uses top-level docValues data structures to find results. These data structures are more efficient as the number of query terms grows high (over several hundred). But they are also expensive to build and need to be populated lazily after each commit, causing a sometimes-noticeable slowdown on the first query after each commit. If you commit frequently and your use-case can tolerate a static warming query, consider adding one to `solrconfig.xml` so that this work is done as a part of the commit itself and not attached directly to user requests.
+
`docValuesTermsFilterPerSegment` can only be used on fields with docValues data. The `cache` parameter is false by default. It is more efficient than the "top-level" alternative with small to medium (~500) numbers of query terms, and doesn't suffer a slowdown on queries immediately following a commit (as `docValuesTermsFilterTopLevel` does - see above). But it is less performant on very large numbers of query terms.
+
`automaton` creates an `AutomatonQuery` representing the request with each term forming a union. Scales well with index size and moderately with the number of query terms.
*Examples*