SOLR-13892: Add 'top-level' docValues Join implementation (#1171)

This commit is contained in:
Jason Gerlowski 2020-01-31 11:21:01 -05:00 committed by GitHub
parent 9ceaff913e
commit 719b38c8d8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 573 additions and 121 deletions

View File

@ -166,6 +166,8 @@ New Features
* SOLR-13749: New cross collection join filter (XCJF) (Dan Fox, Kevin Watters, via Gus Heck)
* SOLR-13892: New "top-level" docValues join implementation (Jason Gerlowski, Joel Bernstein)
Improvements
---------------------
* SOLR-14120: Define JavaScript methods 'includes' and 'startsWith' to ensure AdminUI can be displayed when using

View File

@ -18,6 +18,7 @@ package org.apache.solr.search;
import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -59,35 +60,72 @@ import org.apache.solr.search.join.GraphPointsCollector;
import org.apache.solr.search.join.ScoreJoinQParserPlugin;
import org.apache.solr.util.RTimer;
import org.apache.solr.util.RefCounted;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class JoinQParserPlugin extends QParserPlugin {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public static final String NAME = "join";
/** Choose the internal algorithm */
private static final String METHOD = "method";
@Override
public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
return new QParser(qstr, localParams, params, req) {
private static class JoinParams {
final String fromField;
final String fromCore;
final Query fromQuery;
final long fromCoreOpenTime;
final String toField;
@Override
public Query parse() throws SyntaxError {
if(localParams!=null && localParams.get(ScoreJoinQParserPlugin.SCORE)!=null){
return new ScoreJoinQParserPlugin().createParser(qstr, localParams, params, req).parse();
}else{
return parseJoin();
public JoinParams(String fromField, String fromCore, Query fromQuery, long fromCoreOpenTime, String toField) {
this.fromField = fromField;
this.fromCore = fromCore;
this.fromQuery = fromQuery;
this.fromCoreOpenTime = fromCoreOpenTime;
this.toField = toField;
}
}
Query parseJoin() throws SyntaxError {
final String fromField = getParam("from");
final String fromIndex = getParam("fromIndex");
final String toField = getParam("to");
final String v = localParams.get("v");
private enum Method {
index {
@Override
Query makeFilter(QParser qparser) throws SyntaxError {
final JoinParams jParams = parseJoin(qparser);
final JoinQuery q = new JoinQuery(jParams.fromField, jParams.toField, jParams.fromCore, jParams.fromQuery);
q.fromCoreOpenTime = jParams.fromCoreOpenTime;
return q;
}
},
dvWithScore {
@Override
Query makeFilter(QParser qparser) throws SyntaxError {
return new ScoreJoinQParserPlugin().createParser(qparser.qstr, qparser.localParams, qparser.params, qparser.req).parse();
}
},
topLevelDV {
@Override
Query makeFilter(QParser qparser) throws SyntaxError {
final JoinParams jParams = parseJoin(qparser);
final JoinQuery q = new TopLevelJoinQuery(jParams.fromField, jParams.toField, jParams.fromCore, jParams.fromQuery);
q.fromCoreOpenTime = jParams.fromCoreOpenTime;
return q;
}
};
abstract Query makeFilter(QParser qparser) throws SyntaxError;
JoinParams parseJoin(QParser qparser) throws SyntaxError {
final String fromField = qparser.getParam("from");
final String fromIndex = qparser.getParam("fromIndex");
final String toField = qparser.getParam("to");
final String v = qparser.localParams.get(QueryParsing.V);
final String coreName;
Query fromQuery;
long fromCoreOpenTime = 0;
if (fromIndex != null && !fromIndex.equals(req.getCore().getCoreDescriptor().getName()) ) {
CoreContainer container = req.getCore().getCoreContainer();
if (fromIndex != null && !fromIndex.equals(qparser.req.getCore().getCoreDescriptor().getName()) ) {
CoreContainer container = qparser.req.getCore().getCoreContainer();
// if in SolrCloud mode, fromIndex should be the name of a single-sharded collection
coreName = ScoreJoinQParserPlugin.getCoreName(fromIndex, container);
@ -99,7 +137,7 @@ public class JoinQParserPlugin extends QParserPlugin {
}
RefCounted<SolrIndexSearcher> fromHolder = null;
LocalSolrQueryRequest otherReq = new LocalSolrQueryRequest(fromCore, params);
LocalSolrQueryRequest otherReq = new LocalSolrQueryRequest(fromCore, qparser.params);
try {
QParser parser = QParser.getParser(v, otherReq);
fromQuery = parser.getQuery();
@ -112,14 +150,34 @@ public class JoinQParserPlugin extends QParserPlugin {
}
} else {
coreName = null;
QParser fromQueryParser = subQuery(v, null);
QParser fromQueryParser = qparser.subQuery(v, null);
fromQueryParser.setIsFilter(true);
fromQuery = fromQueryParser.getQuery();
}
JoinQuery jq = new JoinQuery(fromField, toField, coreName == null ? fromIndex : coreName, fromQuery);
jq.fromCoreOpenTime = fromCoreOpenTime;
return jq;
final String indexToUse = coreName == null ? fromIndex : coreName;
return new JoinParams(fromField, indexToUse, fromQuery, fromCoreOpenTime, toField);
}
}
/**
 * Creates the parser for <code>{!join}</code> queries.
 *
 * <p>When the <code>method</code> local param is present it selects the join implementation explicitly. Otherwise the
 * pre-SOLR-13892 behavior applies: a <code>score</code> local param delegates to {@link ScoreJoinQParserPlugin}, and
 * anything else uses {@link Method#index}.
 */
@Override
public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
  return new QParser(qstr, localParams, params, req) {
    /**
     * @throws SyntaxError if the <code>method</code> local param names an unknown join implementation, or if the
     *                     selected implementation fails to parse the request
     */
    @Override
    public Query parse() throws SyntaxError {
      final String requestedMethod = (localParams == null) ? null : localParams.get(METHOD);
      if (requestedMethod != null) {
        final Method explicitMethod;
        try {
          explicitMethod = Method.valueOf(requestedMethod);
        } catch (IllegalArgumentException e) {
          // Enum.valueOf rejects unknown names with a bare IllegalArgumentException; report it as a parse error
          // that tells the user which values are accepted.
          throw new SyntaxError("Unsupported join method '" + requestedMethod + "'; valid values are "
              + Arrays.toString(Method.values()), e);
        }
        return explicitMethod.makeFilter(this);
      }

      // Legacy join behavior before introduction of SOLR-13892
      if (localParams != null && localParams.get(ScoreJoinQParserPlugin.SCORE) != null) {
        return new ScoreJoinQParserPlugin().createParser(qstr, localParams, params, req).parse();
      }
      return Method.index.makeFilter(this);
    }
  };
}
@ -175,7 +233,7 @@ class JoinQuery extends Query {
return new JoinQueryWeight((SolrIndexSearcher) searcher, scoreMode, boost);
}
private class JoinQueryWeight extends ConstantScoreWeight {
protected class JoinQueryWeight extends ConstantScoreWeight {
SolrIndexSearcher fromSearcher;
RefCounted<SolrIndexSearcher> fromRef;
SolrIndexSearcher toSearcher;
@ -586,5 +644,4 @@ class JoinQuery extends Query {
h = h * 31 + (int) fromCoreOpenTime;
return h;
}
}

View File

@ -0,0 +1,221 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongBitSet;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.join.MultiValueTermOrdinalCollector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link JoinQuery} implementation using global (top-level) DocValues ordinals to efficiently compare values in the
 * "from" and "to" fields.
 *
 * <p>Both fields must exist in their schema and have docValues; otherwise a {@link SolrException} with
 * {@link SolrException.ErrorCode#BAD_REQUEST} is thrown when the weight is created.
 */
public class TopLevelJoinQuery extends JoinQuery {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  public TopLevelJoinQuery(String fromField, String toField, String coreName, Query subQuery) {
    super(fromField, toField, coreName, subQuery);
  }

  /**
   * Builds a constant-score weight matching every "to" document that has at least one value whose top-level ordinal
   * corresponds to a value found on a "from" document matched by the sub-query.
   *
   * <p>Falls back to the parent implementation when {@code searcher} is not a {@link SolrIndexSearcher}.
   *
   * @throws SolrException if either join field is missing from its schema or lacks docValues
   * @throws RuntimeException wrapping any {@link IOException} raised while reading docValues or running the sub-query
   */
  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
    if (! (searcher instanceof SolrIndexSearcher)) {
      log.debug("Falling back to JoinQueryWeight because searcher [{}] is not the required SolrIndexSearcher", searcher);
      return super.createWeight(searcher, scoreMode, boost);
    }

    final SolrIndexSearcher solrSearcher = (SolrIndexSearcher) searcher;
    // The JoinQueryWeight is only used here as the source of the "from" and "to" searchers.
    final JoinQueryWeight weight = new JoinQueryWeight(solrSearcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f);
    final SolrIndexSearcher fromSearcher = weight.fromSearcher;
    final SolrIndexSearcher toSearcher = weight.toSearcher;

    try {
      final SortedSetDocValues topLevelFromDocValues = validateAndFetchDocValues(fromSearcher, fromField, "from");
      final SortedSetDocValues topLevelToDocValues = validateAndFetchDocValues(toSearcher, toField, "to");
      if (topLevelFromDocValues.getValueCount() == 0 || topLevelToDocValues.getValueCount() == 0) {
        return createNoMatchesWeight(boost);
      }

      final LongBitSet fromOrdBitSet = findFieldOrdinalsMatchingQuery(q, fromField, fromSearcher, topLevelFromDocValues);
      final LongBitSet toOrdBitSet = new LongBitSet(topLevelToDocValues.getValueCount());
      final BitsetBounds toBitsetBounds = convertFromOrdinalsIntoToField(fromOrdBitSet, topLevelFromDocValues, toOrdBitSet, topLevelToDocValues);
      if (toBitsetBounds.lower == BitsetBounds.NO_MATCHES) {
        // No "from" value exists in the "to" field, so no segment can produce a hit.
        return createNoMatchesWeight(boost);
      }

      // validateAndFetchDocValues has already rejected a missing "to" field, so the lookup cannot return null here.
      final boolean toMultivalued = toSearcher.getSchema().getFieldOrNull(toField).multiValued();
      return new ConstantScoreWeight(this, boost) {
        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
          // Per-segment docValues serve as the approximation: only docs with a value in the "to" field are visited.
          final DocIdSetIterator toApproximation = (toMultivalued) ? context.reader().getSortedSetDocValues(toField) :
              context.reader().getSortedDocValues(toField);
          if (toApproximation == null) {
            return null;
          }

          final int docBase = context.docBase;
          return new ConstantScoreScorer(this, this.score(), scoreMode, new TwoPhaseIterator(toApproximation) {
            @Override
            public boolean matches() throws IOException {
              // Translate the segment-local doc id into a top-level doc id before consulting top-level docValues.
              final boolean hasDoc = topLevelToDocValues.advanceExact(docBase + approximation.docID());
              if (hasDoc) {
                for (long ord = topLevelToDocValues.nextOrd();
                     ord != SortedSetDocValues.NO_MORE_ORDS;
                     ord = topLevelToDocValues.nextOrd()) {
                  if (toOrdBitSet.get(ord)) {
                    return true;
                  }
                }
              }
              return false;
            }

            @Override
            public float matchCost() {
              return 10.0F;
            }
          });
        }

        @Override
        public boolean isCacheable(LeafReaderContext ctx) {
          return false;
        }
      };
    } catch (IOException e) {
      // NOTE(review): this method already declares IOException; the wrapping is kept so callers keep seeing the
      // exception type they see today.
      throw new RuntimeException(e);
    }
  }

  /** Returns a weight that produces no scorer for any segment. */
  private Weight createNoMatchesWeight(float boost) {
    return new ConstantScoreWeight(this, boost) {
      @Override
      public Scorer scorer(LeafReaderContext context) throws IOException {
        return null;
      }

      @Override
      public boolean isCacheable(LeafReaderContext ctx) {
        return false;
      }
    };
  }

  /**
   * Checks that {@code fieldName} exists and has docValues, then returns its top-level docValues.
   *
   * @param querySide "from" or "to"; only used to build error messages
   * @return sorted-set docValues read from the searcher's slow atomic reader; single-valued fields are wrapped with
   *         {@link DocValues#singleton}
   * @throws SolrException (BAD_REQUEST) if the field does not exist or has no docValues
   */
  private SortedSetDocValues validateAndFetchDocValues(SolrIndexSearcher solrSearcher, String fieldName, String querySide) throws IOException {
    final IndexSchema schema = solrSearcher.getSchema();
    final SchemaField field = schema.getFieldOrNull(fieldName);
    if (field == null) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, querySide + " field '" + fieldName + "' does not exist");
    }

    if (!field.hasDocValues()) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "'top-level' join queries require both 'from' and 'to' fields to have docValues, but " + querySide +
              " field [" + fieldName + "] does not.");
    }

    final LeafReader leafReader = solrSearcher.getSlowAtomicReader();
    if (field.multiValued()) {
      return DocValues.getSortedSet(leafReader, fieldName);
    }
    return DocValues.singleton(DocValues.getSorted(leafReader, fieldName));
  }

  /**
   * Runs {@code q} against {@code searcher} and returns a bitset, sized to the field's value count, with one bit set
   * for every ordinal collected from the matching documents.
   */
  private static LongBitSet findFieldOrdinalsMatchingQuery(Query q, String field, SolrIndexSearcher searcher, SortedSetDocValues docValues) throws IOException {
    final LongBitSet fromOrdBitSet = new LongBitSet(docValues.getValueCount());
    final Collector fromCollector = new MultiValueTermOrdinalCollector(field, docValues, fromOrdBitSet);

    searcher.search(q, fromCollector);

    return fromOrdBitSet;
  }

  /**
   * Translates every ordinal set in {@code fromOrdBitSet} into the ordinal of the same term in {@code toDocValues},
   * recording the hits in {@code toOrdBitSet}.
   *
   * <p>"From" ordinals are visited in increasing order, and lookupTerm compares terms with
   * {@link BytesRef#compareTo}. This method relies on both dictionaries being sorted in that same order, so each
   * lookup can start where the previous one ended: just past a hit, or at the insertion point of a miss. Once that
   * start position passes the end of the "to" dictionary no later term can match and the loop stops.
   *
   * @return the first and last "to" ordinals that were set; {@code lower} is {@link BitsetBounds#NO_MATCHES} when
   *         nothing matched
   */
  private BitsetBounds convertFromOrdinalsIntoToField(LongBitSet fromOrdBitSet, SortedSetDocValues fromDocValues,
                                                      LongBitSet toOrdBitSet, SortedSetDocValues toDocValues) throws IOException {
    final long toValueCount = toDocValues.getValueCount();
    long fromOrdinal = 0;
    long firstToOrd = BitsetBounds.NO_MATCHES;
    long lastToOrd = 0;
    long searchStart = 0;

    while (fromOrdinal < fromOrdBitSet.length() && (fromOrdinal = fromOrdBitSet.nextSetBit(fromOrdinal)) >= 0) {
      final BytesRef fromBytesRef = fromDocValues.lookupOrd(fromOrdinal);
      final long toOrdinal = lookupTerm(toDocValues, fromBytesRef, searchStart);
      if (toOrdinal >= 0) {
        toOrdBitSet.set(toOrdinal);
        if (firstToOrd == BitsetBounds.NO_MATCHES) {
          firstToOrd = toOrdinal;
        }
        lastToOrd = toOrdinal;
        searchStart = toOrdinal + 1;
      } else {
        // lookupTerm encodes a miss as -(insertionPoint + 1); decode it to narrow the next search.
        searchStart = -(toOrdinal + 1);
      }
      if (searchStart >= toValueCount) {
        break;
      }
      fromOrdinal++;
    }

    return new BitsetBounds(firstToOrd, lastToOrd);
  }

  /*
   * Same binary-search based implementation as SortedSetDocValues.lookupTerm(BytesRef), but with an
   * optimization to narrow the search space where possible by providing a startOrd instead of beginning each search
   * at 0.
   *
   * Returns the ordinal of key when found, otherwise -(insertionPoint + 1).
   */
  private long lookupTerm(SortedSetDocValues docValues, BytesRef key, long startOrd) throws IOException {
    long low = startOrd;
    long high = docValues.getValueCount() - 1;

    while (low <= high) {
      long mid = (low + high) >>> 1;
      final BytesRef term = docValues.lookupOrd(mid);
      int cmp = term.compareTo(key);

      if (cmp < 0) {
        low = mid + 1;
      } else if (cmp > 0) {
        high = mid - 1;
      } else {
        return mid; // key found
      }
    }

    return -(low + 1);  // key not found.
  }

  /** First and last ordinals set in a bitset; {@code lower} equals {@link #NO_MATCHES} when no bit was set. */
  private static class BitsetBounds {
    public static final long NO_MATCHES = -1L;
    public final long lower;
    public final long upper;

    public BitsetBounds(long lower, long upper) {
      this.lower = lower;
      this.upper = upper;
    }
  }
}

View File

@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.join;
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.util.LongBitSet;
/**
 * Populates a bitset of (top-level) ordinals based on field values in a multi-valued field.
 *
 * <p>For every collected document, each ordinal the supplied top-level docValues report for that document is set in
 * the supplied bitset.
 */
public class MultiValueTermOrdinalCollector extends SimpleCollector {

  // Offset of the current segment; added to segment-local doc ids to address the top-level docValues
  private int docBase;
  private final SortedSetDocValues topLevelDocValues;
  // Not read by this class; retained because it is part of the constructor signature
  private final String fieldName;
  // Records all ordinals found during collection
  private final LongBitSet topLevelDocValuesBitSet;

  /**
   * @param fieldName name of the field the docValues belong to
   * @param topLevelDocValues top-level docValues, addressed by top-level doc id
   * @param topLevelDocValuesBitSet bitset that receives one set bit per ordinal seen on a collected document
   */
  public MultiValueTermOrdinalCollector(String fieldName, SortedSetDocValues topLevelDocValues, LongBitSet topLevelDocValuesBitSet) {
    this.fieldName = fieldName;
    this.topLevelDocValues = topLevelDocValues;
    this.topLevelDocValuesBitSet = topLevelDocValuesBitSet;
  }

  @Override
  public ScoreMode scoreMode() {
    return ScoreMode.COMPLETE_NO_SCORES;
  }

  @Override
  public void doSetNextReader(LeafReaderContext context) throws IOException {
    this.docBase = context.docBase;
  }

  @Override
  public void collect(int doc) throws IOException {
    final int globalDoc = docBase + doc;
    if (!topLevelDocValues.advanceExact(globalDoc)) {
      return; // document has no value in the field
    }

    for (long ord = topLevelDocValues.nextOrd();
         ord != SortedSetDocValues.NO_MORE_ORDS;
         ord = topLevelDocValues.nextOrd()) {
      topLevelDocValuesBitSet.set(ord);
    }
  }
}

View File

@ -705,6 +705,8 @@
<dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_s_dv" type="string" indexed="true" stored="true" docValues="true"/>
<dynamicField name="*_sdv" type="string" indexed="false" stored="false" docValues="true" useDocValuesAsStored="true"/>
<!-- 'indexed' prefix necessary to distinguish it from _sdv above, which is indexed in many schemas but not here. -->
<dynamicField name="*_indexed_sdv" type="string" indexed="true" stored="false" docValues="true" useDocValuesAsStored="true"/>
<dynamicField name="*_ss_dv" type="string" indexed="true" stored="true" docValues="true" multiValued="true"/>
<dynamicField name="*_t" type="text" indexed="true" stored="true"/>
<dynamicField name="*_tt" type="text" indexed="true" stored="true"/>

View File

@ -27,6 +27,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.Utils;
import org.apache.solr.request.SolrQueryRequest;
@ -51,59 +52,59 @@ public class TestJoin extends SolrTestCaseJ4 {
initCore("solrconfig.xml","schema12.xml");
}
private static final String PRIMARY_DEPT_FIELD = "primary_dept_indexed_sdv";
private static final String DEPT_FIELD = "dept_ss_dv";
private static final String DEPT_ID_FIELD = "dept_id_indexed_sdv";
@Test
public void testJoin() throws Exception {
assertU(add(doc("id", "1","name", "john", "title", "Director", "dept_s","Engineering")));
assertU(add(doc("id", "2","name", "mark", "title", "VP", "dept_s","Marketing")));
assertU(add(doc("id", "3","name", "nancy", "title", "MTS", "dept_s","Sales")));
assertU(add(doc("id", "4","name", "dave", "title", "MTS", "dept_s","Support", "dept_s","Engineering")));
assertU(add(doc("id", "5","name", "tina", "title", "VP", "dept_s","Engineering")));
private void indexEmployeeDocs() {
assertU(add(doc("id", "1","name", "john", "title", "Director", PRIMARY_DEPT_FIELD, "Engineering", DEPT_FIELD,"Engineering")));
assertU(add(doc("id", "2","name", "mark", "title", "VP", PRIMARY_DEPT_FIELD, "Marketing", DEPT_FIELD,"Marketing")));
assertU(add(doc("id", "3","name", "nancy", "title", "MTS", PRIMARY_DEPT_FIELD, "Sales", DEPT_FIELD,"Sales")));
assertU(add(doc("id", "4","name", "dave", "title", "MTS", PRIMARY_DEPT_FIELD, "Support", DEPT_FIELD,"Support", DEPT_FIELD,"Engineering")));
assertU(add(doc("id", "5","name", "tina", "title", "VP", PRIMARY_DEPT_FIELD, "Engineering", DEPT_FIELD,"Engineering")));
assertU(add(doc("id","10", "dept_id_s", "Engineering", "text","These guys develop stuff")));
assertU(add(doc("id","11", "dept_id_s", "Marketing", "text","These guys make you look good")));
assertU(add(doc("id","12", "dept_id_s", "Sales", "text","These guys sell stuff")));
assertU(add(doc("id","13", "dept_id_s", "Support", "text","These guys help customers")));
assertU(add(doc("id","10", DEPT_ID_FIELD, "Engineering", "text","These guys develop stuff")));
assertU(add(doc("id","11", DEPT_ID_FIELD, "Marketing", "text","These guys make you look good")));
assertU(add(doc("id","12", DEPT_ID_FIELD, "Sales", "text","These guys sell stuff")));
assertU(add(doc("id","13", DEPT_ID_FIELD, "Support", "text","These guys help customers")));
assertU(commit());
}
/*
* Exercises behavior shared by all join methods.
*/
@Test
public void testJoinAllMethods() throws Exception {
indexEmployeeDocs();
ModifiableSolrParams p = params("sort","id asc");
// test debugging
assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s}title:MTS", "fl","id", "debugQuery","true")
,"/debug/join/{!join from=dept_s to=dept_id_s}title:MTS=={'_MATCH_':'fromSetSize,toSetSize', 'fromSetSize':2, 'toSetSize':3}"
);
assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s}title:MTS", "fl","id")
assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, DEPT_ID_FIELD, "title:MTS"), "fl","id")
,"/response=={'numFound':3,'start':0,'docs':[{'id':'10'},{'id':'12'},{'id':'13'}]}"
);
// empty from
assertJQ(req(p, "q","{!join from=noexist_s to=dept_id_s}*:*", "fl","id")
assertJQ(req(p, "q", buildJoinRequest("noexist_ss_dv", DEPT_ID_FIELD, "*:*", "fl","id"))
,"/response=={'numFound':0,'start':0,'docs':[]}"
);
// empty to
assertJQ(req(p, "q","{!join from=dept_s to=noexist_s}*:*", "fl","id")
assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, "noexist_ss_dv", "*:*"), "fl","id")
,"/response=={'numFound':0,'start':0,'docs':[]}"
);
// self join... return everyone with she same title as Dave
assertJQ(req(p, "q","{!join from=title to=title}name:dave", "fl","id")
,"/response=={'numFound':2,'start':0,'docs':[{'id':'3'},{'id':'4'}]}"
);
// find people that develop stuff
assertJQ(req(p, "q","{!join from=dept_id_s to=dept_s}text:develop", "fl","id")
// self join... return everyone in same dept(s) as Dave
assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, DEPT_FIELD, "name:dave"), "fl","id")
,"/response=={'numFound':3,'start':0,'docs':[{'id':'1'},{'id':'4'},{'id':'5'}]}"
);
// self join on multivalued text field
assertJQ(req(p, "q","{!join from=title to=title}name:dave", "fl","id")
,"/response=={'numFound':2,'start':0,'docs':[{'id':'3'},{'id':'4'}]}"
// from single-value to multi-value
assertJQ(req(p, "q", buildJoinRequest(DEPT_ID_FIELD, DEPT_FIELD, "text:develop"), "fl","id")
,"/response=={'numFound':3,'start':0,'docs':[{'id':'1'},{'id':'4'},{'id':'5'}]}"
);
assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s}title:MTS", "fl","id", "debugQuery","true")
// from multi-value to single-value
assertJQ(req(p, "q",buildJoinRequest(DEPT_FIELD, DEPT_ID_FIELD, "title:MTS"), "fl","id", "debugQuery","true")
,"/response=={'numFound':3,'start':0,'docs':[{'id':'10'},{'id':'12'},{'id':'13'}]}"
);
@ -112,38 +113,66 @@ public class TestJoin extends SolrTestCaseJ4 {
"/response=={'numFound':2,'start':0,'docs':[{'id':'10'},{'id':'13'}]}";
// straight forward query
assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s}name:dave",
"fl","id"),
assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, DEPT_ID_FIELD, "name:dave"), "fl","id"),
davesDepartments);
// variable deref for sub-query parsing
assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s v=$qq}",
"qq","{!dismax}dave",
"qf","name",
"fl","id",
"debugQuery","true"),
// variable deref in 'from' query
assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, DEPT_ID_FIELD, "$qq"), "qq","{!dismax}dave", "qf","name",
"fl","id", "debugQuery","true"),
davesDepartments);
// variable deref for sub-query parsing w/localparams
assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s v=$qq}",
"qq","{!dismax qf=name}dave",
"fl","id",
"debugQuery","true"),
// variable deref in 'from' query (w/ localparams)
assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, DEPT_ID_FIELD, "$qq"), "qq","{!dismax qf=name}dave",
"fl","id", "debugQuery","true"),
davesDepartments);
// defType local param to control sub-query parsing
assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s defType=dismax}dave",
"qf","name",
"fl","id",
"debugQuery","true"),
assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, DEPT_ID_FIELD, "dave", "defType=dismax"), "qf","name",
"fl","id", "debugQuery","true"),
davesDepartments);
// find people that develop stuff - but limit via filter query to a name of "john"
// this tests filters being pushed down to queries (SOLR-3062)
assertJQ(req(p, "q","{!join from=dept_id_s to=dept_s}text:develop", "fl","id", "fq", "name:john")
assertJQ(req(p, "q", buildJoinRequest(DEPT_ID_FIELD, DEPT_FIELD, "text:develop"), "fl","id", "fq", "name:john")
,"/response=={'numFound':1,'start':0,'docs':[{'id':'1'}]}"
);
}
/*
 * Exercises behavior specific to method=topLevelDV join queries: both join fields must have docValues.
 */
@Test
public void testTopLevelDVJoin() throws Exception {
  indexEmployeeDocs();

  final ModifiableSolrParams sortById = params("sort","id asc");

  // "from" field missing docValues
  final String fromLacksDocValues = "{!join from=nodocvalues_s to=dept_ss_dv method=topLevelDV}*:*";
  expectThrows(SolrException.class, () -> h.query(req(sortById, "q", fromLacksDocValues, "fl","id")));

  // "to" field missing docValues
  final String toLacksDocValues = "{!join from=dept_ss_dv to=nodocvalues_s method=topLevelDV}*:*";
  expectThrows(SolrException.class, () -> h.query(req(sortById, "q", toLacksDocValues, "fl","id")));
}
/*
 * Exercises join behavior when no 'method' is given: debug output and joins on a non-docValues text field.
 */
@Test
public void testIndexJoin() throws Exception {
  indexEmployeeDocs();

  final ModifiableSolrParams sortById = params("sort","id asc");

  // Debugging information is reported under a key equal to the join query string itself
  final String debuggedJoin = "{!join from=dept_ss_dv to=dept_id_indexed_sdv}title:MTS";
  final String expectedDebug = "/debug/join/" + debuggedJoin
      + "=={'_MATCH_':'fromSetSize,toSetSize', 'fromSetSize':2, 'toSetSize':3}";
  assertJQ(req(sortById, "q", debuggedJoin, "fl","id", "debugQuery","true"), expectedDebug);

  // non-DV/text field.
  final String titleSelfJoin = "{!join from=title to=title}name:dave";
  assertJQ(req(sortById, "q", titleSelfJoin, "fl","id"),
      "/response=={'numFound':2,'start':0,'docs':[{'id':'3'},{'id':'4'}]}");
}
@ -288,4 +317,21 @@ public class TestJoin extends SolrTestCaseJ4 {
return ids;
}
/*
 * Builds a {!join ...} query string from the given fields and "from" query (passed as the 'v' local param),
 * appending any extra local params verbatim. One of four variants is chosen at random: no explicit method,
 * method=index, method=dvWithScore (with score=none) or method=topLevelDV.
 */
private static String buildJoinRequest(String fromField, String toField, String fromQuery, String... otherLocalParams) {
  final boolean hasExtraParams = otherLocalParams != null && otherLocalParams.length > 0;
  final String extraParams = hasExtraParams ? String.join(" ", otherLocalParams) : " ";
  final String queryPrefix = "{!join " + "from=" + fromField + " to=" + toField + " v=" + fromQuery + " " + extraParams;

  final String methodSuffix;
  switch (random().nextInt(4)) {
    case 0: // no explicit method; the parser picks its default
      methodSuffix = " }";
      break;
    case 1: // terms-index based join
      methodSuffix = " method=index}";
      break;
    case 2: // score-capable join, with scoring disabled
      methodSuffix = " method=dvWithScore score=none}";
      break;
    default: // top-level docValues join
      methodSuffix = " method=topLevelDV}";
      break;
  }
  return queryPrefix + methodSuffix;
}
}

View File

@ -591,36 +591,95 @@ The hash range query parser uses a special cache to improve the speedup of the q
== Join Query Parser
`JoinQParser` extends the `QParserPlugin`. It allows normalizing relationships between documents with a join operation. This is different from the concept of a join in a relational database because no information is being truly joined. An appropriate SQL analogy would be an "inner query".
The Join query parser allows users to run queries that normalize relationships between documents.
Solr runs a subquery of the user's choosing (the `v` param), identifies all the values that matching documents have in a field of interest (the `from` param), and then returns documents where those values are contained in a second field of interest (the `to` param).
Examples:
Find all products containing the word "ipod", join them against manufacturer docs and return the list of manufacturers:
In practice, these semantics are much like "inner queries" in a SQL engine.
As an example, consider the Solr query below:
[source,text]
----
{!join from=manu_id_s to=id}ipod
/solr/techproducts/select?q={!join from=manu_id_s to=id}title:ipod
----
Find all manufacturer docs named "belkin", join them against product docs, and filter the list to only products with a price less than $12:
This query, which returns a document for each manufacturer that makes a product with "ipod" in the title, is semantically identical to the SQL query below:
[source,text]
----
q = {!join from=id to=manu_id_s}compName_s:Belkin
fq = price:[* TO 12]
SELECT *
FROM techproducts
WHERE id IN (
SELECT manu_id_s
FROM techproducts
WHERE title='ipod'
)
----
The join operation is done on a term basis, so the "from" and "to" fields must use compatible field types. For example: joining between a `StrField` and a `IntPointField` will not work, likewise joining between a `StrField` and a `TextField` that uses `LowerCaseFilterFactory` will only work for values that are already lower cased in the string field.
The join operation is done on a term basis, so the `from` and `to` fields must use compatible field types.
For example: joining between a `StrField` and an `IntPointField` will not work.
Likewise joining between a `StrField` and a `TextField` that uses `LowerCaseFilterFactory` will only work for values that are already lower cased in the string field.
=== Join Parser Scoring
=== Parameters
You can optionally use the `score` parameter to return scores of the subordinate query. The values to use for this parameter define the type of aggregation, which are `avg` (average), `max` (maximum), `min` (minimum) `total`, or `none`.
This query parser takes the following parameters:
.Score parameter and single value numerics
`from`::
The name of a field which contains values to look for in the "to" field.
Can be single or multi-valued, but must have a field type compatible with the field represented in the "to" field.
This parameter is required.
`to`::
The name of a field whose value(s) will be checked against those found in the "from" field.
Can be single or multi-valued, but must have a field type compatible with the "from" field.
This parameter is required.
`fromIndex`::
The name of the index to run the "from" query (`v` parameter) on and where "from" values are gathered.
Must be located on the same node as the core processing the request.
This parameter is optional; it defaults to the value of the processing core if not specified.
See <<Joining Across Collections,Joining Across Collections>> below for more information.
`score`::
An optional parameter that instructs Solr to return information about the "from" query scores.
The value of this parameter controls what type of aggregation information is returned.
Options include `avg` (average), `max` (maximum), `min` (minimum), `total` (total), or `none`.
+
If `method` is not specified but `score` is, then the `dvWithScore` method is used.
If `method` is specified and is not `dvWithScore`, then the `score` value is ignored.
See the `method` parameter documentation below for more details.
`method`::
An optional parameter used to determine which of several query implementations should be used by Solr.
Options are restricted to: `index`, `dvWithScore`, and `topLevelDV`.
If unspecified the default value is `index`, unless the `score` parameter is present which overrides it to `dvWithScore`.
Each implementation has its own performance characteristics, and users are encouraged to experiment to determine which implementation is most performant for their use-case.
Details and performance heuristics are given below.
+
`index` is the default `method` unless the `score` parameter is specified.
Uses the terms index structures to process the request.
Performance scales with the cardinality and number of postings (term occurrences) in the "from" field.
Consider this method when the "from" field has low cardinality, when the "to" side returns a large number of documents, or when sporadic post-commit slowdowns cannot be tolerated (this is a disadvantage of other methods that `index` avoids).
+
`dvWithScore` returns an optional "score" statistic alongside result documents.
Uses docValues structures if available, but falls back to the field cache when necessary.
The first access to the field cache slows down the initial requests following a commit and takes up additional space on the JVM heap, so docValues are recommended in most situations.
Performance scales linearly with the number of values matched in the "from" field.
This method must be used if score information is required, and should also be considered when the "from" query matches few documents, regardless of the number of "to" side documents returned.
+
.dvWithScore and single value numerics
[WARNING]
====
Specifying `score` local parameter switches the join algorithm. This might have performance implication on large indices, but it's more important that this algorithm won't work for single value numeric field starting from 7.0. Users are encouraged to change field types to string and rebuild indexes during migration.
The `dvWithScore` method doesn't support single value numeric fields. Users migrating from versions prior to 7.0 are encouraged to change field types to string and rebuild indexes during migration.
====
+
`topLevelDV` can only be used when `to` and `from` fields have docValues data, and does not currently support numeric fields.
Uses top-level docValues data structures to find results.
These data structures outperform other methods as the number of values matched in the `from` field grows high.
But they are also expensive to build and need to be lazily populated after each commit, causing a sometimes-noticeable slowdown on the first query to use them after each commit.
If you commit frequently and your use-case can tolerate a static warming query, consider adding one to `solrconfig.xml` so that this work is done as a part of the commit itself and not attached directly to user requests.
Consider this method when the "from" query matches a large number of documents and the "to" result set is small to moderate in size, but only if sporadic post-commit slowness is tolerable.
=== Joining Across Collections