From 719b38c8d8c22a50a3738b852bffb5904cc8fcfa Mon Sep 17 00:00:00 2001 From: Jason Gerlowski Date: Fri, 31 Jan 2020 11:21:01 -0500 Subject: [PATCH] SOLR-13892: Add 'top-level' docValues Join implementation (#1171) --- solr/CHANGES.txt | 2 + .../apache/solr/search/JoinQParserPlugin.java | 159 +++++++++---- .../apache/solr/search/TopLevelJoinQuery.java | 221 ++++++++++++++++++ .../join/MultiValueTermOrdinalCollector.java | 65 ++++++ .../solr/collection1/conf/schema12.xml | 2 + .../src/test/org/apache/solr/TestJoin.java | 160 ++++++++----- solr/solr-ref-guide/src/other-parsers.adoc | 85 +++++-- 7 files changed, 573 insertions(+), 121 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/search/TopLevelJoinQuery.java create mode 100644 solr/core/src/java/org/apache/solr/search/join/MultiValueTermOrdinalCollector.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 3fa0e911b82..fcbac162a3e 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -166,6 +166,8 @@ New Features * SOLR-13749: New cross collection join filter (XCJF) (Dan Fox, Kevin Watters, via Gus Heck) + * SOLR-13892: New "top-level" docValues join implementation (Jason Gerlowski, Joel Bernstein) + Improvements --------------------- * SOLR-14120: Define JavaScript methods 'includes' and 'startsWith' to ensure AdminUI can be displayed when using diff --git a/solr/core/src/java/org/apache/solr/search/JoinQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/JoinQParserPlugin.java index c6fb0dec415..8622d84b0fa 100644 --- a/solr/core/src/java/org/apache/solr/search/JoinQParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/JoinQParserPlugin.java @@ -18,6 +18,7 @@ package org.apache.solr.search; import java.io.Closeable; import java.io.IOException; +import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -59,67 +60,124 @@ import org.apache.solr.search.join.GraphPointsCollector; import 
org.apache.solr.search.join.ScoreJoinQParserPlugin; import org.apache.solr.util.RTimer; import org.apache.solr.util.RefCounted; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class JoinQParserPlugin extends QParserPlugin { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + public static final String NAME = "join"; + /** Choose the internal algorithm */ + private static final String METHOD = "method"; + + private static class JoinParams { + final String fromField; + final String fromCore; + final Query fromQuery; + final long fromCoreOpenTime; + final String toField; + + public JoinParams(String fromField, String fromCore, Query fromQuery, long fromCoreOpenTime, String toField) { + this.fromField = fromField; + this.fromCore = fromCore; + this.fromQuery = fromQuery; + this.fromCoreOpenTime = fromCoreOpenTime; + this.toField = toField; + } + } + + private enum Method { + index { + @Override + Query makeFilter(QParser qparser) throws SyntaxError { + final JoinParams jParams = parseJoin(qparser); + final JoinQuery q = new JoinQuery(jParams.fromField, jParams.toField, jParams.fromCore, jParams.fromQuery); + q.fromCoreOpenTime = jParams.fromCoreOpenTime; + return q; + } + }, + dvWithScore { + @Override + Query makeFilter(QParser qparser) throws SyntaxError { + return new ScoreJoinQParserPlugin().createParser(qparser.qstr, qparser.localParams, qparser.params, qparser.req).parse(); + } + }, + topLevelDV { + @Override + Query makeFilter(QParser qparser) throws SyntaxError { + final JoinParams jParams = parseJoin(qparser); + final JoinQuery q = new TopLevelJoinQuery(jParams.fromField, jParams.toField, jParams.fromCore, jParams.fromQuery); + q.fromCoreOpenTime = jParams.fromCoreOpenTime; + return q; + } + }; + + abstract Query makeFilter(QParser qparser) throws SyntaxError; + + JoinParams parseJoin(QParser qparser) throws SyntaxError { + final String fromField = qparser.getParam("from"); + final String 
fromIndex = qparser.getParam("fromIndex"); + final String toField = qparser.getParam("to"); + final String v = qparser.localParams.get(QueryParsing.V); + final String coreName; + + Query fromQuery; + long fromCoreOpenTime = 0; + + if (fromIndex != null && !fromIndex.equals(qparser.req.getCore().getCoreDescriptor().getName()) ) { + CoreContainer container = qparser.req.getCore().getCoreContainer(); + + // if in SolrCloud mode, fromIndex should be the name of a single-sharded collection + coreName = ScoreJoinQParserPlugin.getCoreName(fromIndex, container); + + final SolrCore fromCore = container.getCore(coreName); + if (fromCore == null) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "Cross-core join: no such core " + coreName); + } + + RefCounted fromHolder = null; + LocalSolrQueryRequest otherReq = new LocalSolrQueryRequest(fromCore, qparser.params); + try { + QParser parser = QParser.getParser(v, otherReq); + fromQuery = parser.getQuery(); + fromHolder = fromCore.getRegisteredSearcher(); + if (fromHolder != null) fromCoreOpenTime = fromHolder.get().getOpenNanoTime(); + } finally { + otherReq.close(); + fromCore.close(); + if (fromHolder != null) fromHolder.decref(); + } + } else { + coreName = null; + QParser fromQueryParser = qparser.subQuery(v, null); + fromQueryParser.setIsFilter(true); + fromQuery = fromQueryParser.getQuery(); + } + + final String indexToUse = coreName == null ? 
fromIndex : coreName; + return new JoinParams(fromField, indexToUse, fromQuery, fromCoreOpenTime, toField); + } + } @Override public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) { return new QParser(qstr, localParams, params, req) { - + @Override public Query parse() throws SyntaxError { - if(localParams!=null && localParams.get(ScoreJoinQParserPlugin.SCORE)!=null){ + if (localParams != null && localParams.get(METHOD) != null) { + // Reject unknown 'method' values with a 400 that lists the supported ones, rather than letting Method.valueOf() fail with an IllegalArgumentException + if (Arrays.stream(Method.values()).noneMatch(m -> m.name().equals(localParams.get(METHOD)))) throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unsupported join method '" + localParams.get(METHOD) + "'; supported methods are: " + Arrays.toString(Method.values())); + return Method.valueOf(localParams.get(METHOD)).makeFilter(this); + } + + // Legacy join behavior before introduction of SOLR-13892 + if(localParams!=null && localParams.get(ScoreJoinQParserPlugin.SCORE)!=null) { return new ScoreJoinQParserPlugin().createParser(qstr, localParams, params, req).parse(); - }else{ - return parseJoin(); - } - } - - Query parseJoin() throws SyntaxError { - final String fromField = getParam("from"); - final String fromIndex = getParam("fromIndex"); - final String toField = getParam("to"); - final String v = localParams.get("v"); - final String coreName; - - Query fromQuery; - long fromCoreOpenTime = 0; - - if (fromIndex != null && !fromIndex.equals(req.getCore().getCoreDescriptor().getName()) ) { - CoreContainer container = req.getCore().getCoreContainer(); - - // if in SolrCloud mode, fromIndex should be the name of a single-sharded collection - coreName = ScoreJoinQParserPlugin.getCoreName(fromIndex, container); - - final SolrCore fromCore = container.getCore(coreName); - if (fromCore == null) { - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, - "Cross-core join: no such core " + coreName); - } - - RefCounted fromHolder = null; - LocalSolrQueryRequest otherReq = new LocalSolrQueryRequest(fromCore, params); - try { - QParser parser = QParser.getParser(v, otherReq); - fromQuery = parser.getQuery(); - fromHolder = 
fromCore.getRegisteredSearcher(); - if (fromHolder != null) fromCoreOpenTime = fromHolder.get().getOpenNanoTime(); - } finally { - otherReq.close(); - fromCore.close(); - if (fromHolder != null) fromHolder.decref(); - } } else { - coreName = null; - QParser fromQueryParser = subQuery(v, null); - fromQueryParser.setIsFilter(true); - fromQuery = fromQueryParser.getQuery(); + return Method.index.makeFilter(this); } - - JoinQuery jq = new JoinQuery(fromField, toField, coreName == null ? fromIndex : coreName, fromQuery); - jq.fromCoreOpenTime = fromCoreOpenTime; - return jq; } }; } @@ -175,7 +233,7 @@ class JoinQuery extends Query { return new JoinQueryWeight((SolrIndexSearcher) searcher, scoreMode, boost); } - private class JoinQueryWeight extends ConstantScoreWeight { + protected class JoinQueryWeight extends ConstantScoreWeight { SolrIndexSearcher fromSearcher; RefCounted fromRef; SolrIndexSearcher toSearcher; @@ -586,5 +644,4 @@ class JoinQuery extends Query { h = h * 31 + (int) fromCoreOpenTime; return h; } - } diff --git a/solr/core/src/java/org/apache/solr/search/TopLevelJoinQuery.java b/solr/core/src/java/org/apache/solr/search/TopLevelJoinQuery.java new file mode 100644 index 00000000000..428c229b0d2 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/TopLevelJoinQuery.java @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.search; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; + +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.ConstantScoreScorer; +import org.apache.lucene.search.ConstantScoreWeight; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LongBitSet; +import org.apache.solr.common.SolrException; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.search.join.MultiValueTermOrdinalCollector; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * {@link JoinQuery} implementation using global (top-level) DocValues ordinals to efficiently compare values in the "from" and "to" fields. 
+ */ +public class TopLevelJoinQuery extends JoinQuery { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + public TopLevelJoinQuery(String fromField, String toField, String coreName, Query subQuery) { + super(fromField, toField, coreName, subQuery); + } + + @Override + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { + if (! (searcher instanceof SolrIndexSearcher)) { + log.debug("Falling back to JoinQueryWeight because searcher [{}] is not the required SolrIndexSearcher", searcher); + return super.createWeight(searcher, scoreMode, boost); + } + + final SolrIndexSearcher solrSearcher = (SolrIndexSearcher) searcher; + final JoinQueryWeight weight = new JoinQueryWeight(solrSearcher, ScoreMode.COMPLETE_NO_SCORES, 1.0f); + final SolrIndexSearcher fromSearcher = weight.fromSearcher; + final SolrIndexSearcher toSearcher = weight.toSearcher; + + try { + final SortedSetDocValues topLevelFromDocValues = validateAndFetchDocValues(fromSearcher, fromField, "from"); + final SortedSetDocValues topLevelToDocValues = validateAndFetchDocValues(toSearcher, toField, "to"); + if (topLevelFromDocValues.getValueCount() == 0 || topLevelToDocValues.getValueCount() == 0) { + return createNoMatchesWeight(boost); + } + + final LongBitSet fromOrdBitSet = findFieldOrdinalsMatchingQuery(q, fromField, fromSearcher, topLevelFromDocValues); + final LongBitSet toOrdBitSet = new LongBitSet(topLevelToDocValues.getValueCount()); + final BitsetBounds toBitsetBounds = convertFromOrdinalsIntoToField(fromOrdBitSet, topLevelFromDocValues, toOrdBitSet, topLevelToDocValues); + + final boolean toMultivalued = toSearcher.getSchema().getFieldOrNull(toField).multiValued(); + return new ConstantScoreWeight(this, boost) { + public Scorer scorer(LeafReaderContext context) throws IOException { + if (toBitsetBounds.lower == BitsetBounds.NO_MATCHES) { + return null; + } + + final DocIdSetIterator 
toApproximation = (toMultivalued) ? context.reader().getSortedSetDocValues(toField) : + context.reader().getSortedDocValues(toField); + if (toApproximation == null) { + return null; + } + + final int docBase = context.docBase; + return new ConstantScoreScorer(this, this.score(), scoreMode, new TwoPhaseIterator(toApproximation) { + public boolean matches() throws IOException { + final boolean hasDoc = topLevelToDocValues.advanceExact(docBase + approximation.docID()); + if (hasDoc) { + for (long ord = topLevelToDocValues.nextOrd(); ord != -1L; ord = topLevelToDocValues.nextOrd()) { + if (toOrdBitSet.get(ord)) { + return true; + } + } + } + return false; + } + + public float matchCost() { + return 10.0F; + } + }); + + } + + public boolean isCacheable(LeafReaderContext ctx) { + return false; + } + }; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private Weight createNoMatchesWeight(float boost) { + return new ConstantScoreWeight(this, boost) { + @Override + public Scorer scorer(LeafReaderContext context) throws IOException { + return null; + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return false; + } + }; + } + + private SortedSetDocValues validateAndFetchDocValues(SolrIndexSearcher solrSearcher, String fieldName, String querySide) throws IOException { + final IndexSchema schema = solrSearcher.getSchema(); + final SchemaField field = schema.getFieldOrNull(fieldName); + if (field == null) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, querySide + " field '" + fieldName + "' does not exist"); + } + + if (!field.hasDocValues()) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "'top-level' join queries require both 'from' and 'to' fields to have docValues, but " + querySide + + " field [" + fieldName + "] does not."); + } + + final LeafReader leafReader = solrSearcher.getSlowAtomicReader(); + if (field.multiValued()) { + return DocValues.getSortedSet(leafReader, fieldName); + } + 
return DocValues.singleton(DocValues.getSorted(leafReader, fieldName)); + } + + private static LongBitSet findFieldOrdinalsMatchingQuery(Query q, String field, SolrIndexSearcher searcher, SortedSetDocValues docValues) throws IOException { + final LongBitSet fromOrdBitSet = new LongBitSet(docValues.getValueCount()); + final Collector fromCollector = new MultiValueTermOrdinalCollector(field, docValues, fromOrdBitSet); + + searcher.search(q, fromCollector); + + return fromOrdBitSet; + } + + private BitsetBounds convertFromOrdinalsIntoToField(LongBitSet fromOrdBitSet, SortedSetDocValues fromDocValues, + LongBitSet toOrdBitSet, SortedSetDocValues toDocValues) throws IOException { + long fromOrdinal = 0; + long firstToOrd = BitsetBounds.NO_MATCHES; + long lastToOrd = 0; + + while (fromOrdinal < fromOrdBitSet.length() && (fromOrdinal = fromOrdBitSet.nextSetBit(fromOrdinal)) >= 0) { + final BytesRef fromBytesRef = fromDocValues.lookupOrd(fromOrdinal); + final long toOrdinal = lookupTerm(toDocValues, fromBytesRef, lastToOrd); + if (toOrdinal >= 0) { + toOrdBitSet.set(toOrdinal); + if (firstToOrd == BitsetBounds.NO_MATCHES) firstToOrd = toOrdinal; + lastToOrd = toOrdinal; + } + fromOrdinal++; + } + + return new BitsetBounds(firstToOrd, lastToOrd); + } + + /* + * Same binary-search based implementation as SortedSetDocValues.lookupTerm(BytesRef), but with an + * optimization to narrow the search space where possible by providing a startOrd instead of beginning each search + * at 0. + */ + private long lookupTerm(SortedSetDocValues docValues, BytesRef key, long startOrd) throws IOException { + long low = startOrd; + long high = docValues.getValueCount()-1; + + while (low <= high) { + long mid = (low + high) >>> 1; + final BytesRef term = docValues.lookupOrd(mid); + int cmp = term.compareTo(key); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; // key found + } + } + + return -(low + 1); // key not found. 
+ } + + private static class BitsetBounds { + public static final long NO_MATCHES = -1L; + public final long lower; + public final long upper; + + public BitsetBounds(long lower, long upper) { + this.lower = lower; + this.upper = upper; + } + } +} diff --git a/solr/core/src/java/org/apache/solr/search/join/MultiValueTermOrdinalCollector.java b/solr/core/src/java/org/apache/solr/search/join/MultiValueTermOrdinalCollector.java new file mode 100644 index 00000000000..1c44cc39f41 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/join/MultiValueTermOrdinalCollector.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.search.join; + +import java.io.IOException; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.SimpleCollector; +import org.apache.lucene.util.LongBitSet; + +/** + * Populates a bitset of (top-level) ordinals based on field values in a multi-valued field. 
+ */ +public class MultiValueTermOrdinalCollector extends SimpleCollector { + + private int docBase; + private SortedSetDocValues topLevelDocValues; + private final String fieldName; + // Records all ordinals found during collection + private final LongBitSet topLevelDocValuesBitSet; + + public MultiValueTermOrdinalCollector(String fieldName, SortedSetDocValues topLevelDocValues, LongBitSet topLevelDocValuesBitSet) { + this.fieldName = fieldName; + this.topLevelDocValues = topLevelDocValues; + this.topLevelDocValuesBitSet = topLevelDocValuesBitSet; + } + + public ScoreMode scoreMode() { + return ScoreMode.COMPLETE_NO_SCORES; + } + + @Override + public void doSetNextReader(LeafReaderContext context) throws IOException { + this.docBase = context.docBase; + } + + @Override + public void collect(int doc) throws IOException { + final int globalDoc = docBase + doc; + + if (topLevelDocValues.advanceExact(globalDoc)) { + long ord = SortedSetDocValues.NO_MORE_ORDS; + while ((ord = topLevelDocValues.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { + topLevelDocValuesBitSet.set(ord); + } + } + } +} diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml index eb407262ffd..1368e6b04fc 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml @@ -705,6 +705,8 @@ + + diff --git a/solr/core/src/test/org/apache/solr/TestJoin.java b/solr/core/src/test/org/apache/solr/TestJoin.java index e263aa9a904..1e0a676dafc 100644 --- a/solr/core/src/test/org/apache/solr/TestJoin.java +++ b/solr/core/src/test/org/apache/solr/TestJoin.java @@ -27,6 +27,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.solr.common.SolrException; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.Utils; import org.apache.solr.request.SolrQueryRequest; @@ -51,99 +52,127 @@ 
public class TestJoin extends SolrTestCaseJ4 { initCore("solrconfig.xml","schema12.xml"); } + private static final String PRIMARY_DEPT_FIELD = "primary_dept_indexed_sdv"; + private static final String DEPT_FIELD = "dept_ss_dv"; + private static final String DEPT_ID_FIELD = "dept_id_indexed_sdv"; - @Test - public void testJoin() throws Exception { - assertU(add(doc("id", "1","name", "john", "title", "Director", "dept_s","Engineering"))); - assertU(add(doc("id", "2","name", "mark", "title", "VP", "dept_s","Marketing"))); - assertU(add(doc("id", "3","name", "nancy", "title", "MTS", "dept_s","Sales"))); - assertU(add(doc("id", "4","name", "dave", "title", "MTS", "dept_s","Support", "dept_s","Engineering"))); - assertU(add(doc("id", "5","name", "tina", "title", "VP", "dept_s","Engineering"))); + private void indexEmployeeDocs() { + assertU(add(doc("id", "1","name", "john", "title", "Director", PRIMARY_DEPT_FIELD, "Engineering", DEPT_FIELD,"Engineering"))); + assertU(add(doc("id", "2","name", "mark", "title", "VP", PRIMARY_DEPT_FIELD, "Marketing", DEPT_FIELD,"Marketing"))); + assertU(add(doc("id", "3","name", "nancy", "title", "MTS", PRIMARY_DEPT_FIELD, "Sales", DEPT_FIELD,"Sales"))); + assertU(add(doc("id", "4","name", "dave", "title", "MTS", PRIMARY_DEPT_FIELD, "Support", DEPT_FIELD,"Support", DEPT_FIELD,"Engineering"))); + assertU(add(doc("id", "5","name", "tina", "title", "VP", PRIMARY_DEPT_FIELD, "Engineering", DEPT_FIELD,"Engineering"))); - assertU(add(doc("id","10", "dept_id_s", "Engineering", "text","These guys develop stuff"))); - assertU(add(doc("id","11", "dept_id_s", "Marketing", "text","These guys make you look good"))); - assertU(add(doc("id","12", "dept_id_s", "Sales", "text","These guys sell stuff"))); - assertU(add(doc("id","13", "dept_id_s", "Support", "text","These guys help customers"))); + assertU(add(doc("id","10", DEPT_ID_FIELD, "Engineering", "text","These guys develop stuff"))); + assertU(add(doc("id","11", DEPT_ID_FIELD, "Marketing", 
"text","These guys make you look good"))); + assertU(add(doc("id","12", DEPT_ID_FIELD, "Sales", "text","These guys sell stuff"))); + assertU(add(doc("id","13", DEPT_ID_FIELD, "Support", "text","These guys help customers"))); assertU(commit()); + } + /* + * Exercises behavior shared by all join methods. + */ + @Test + public void testJoinAllMethods() throws Exception { + indexEmployeeDocs(); ModifiableSolrParams p = params("sort","id asc"); - // test debugging - assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s}title:MTS", "fl","id", "debugQuery","true") - ,"/debug/join/{!join from=dept_s to=dept_id_s}title:MTS=={'_MATCH_':'fromSetSize,toSetSize', 'fromSetSize':2, 'toSetSize':3}" - ); - - assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s}title:MTS", "fl","id") + assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, DEPT_ID_FIELD, "title:MTS"), "fl","id") ,"/response=={'numFound':3,'start':0,'docs':[{'id':'10'},{'id':'12'},{'id':'13'}]}" ); // empty from - assertJQ(req(p, "q","{!join from=noexist_s to=dept_id_s}*:*", "fl","id") + assertJQ(req(p, "q", buildJoinRequest("noexist_ss_dv", DEPT_ID_FIELD, "*:*"), "fl","id") ,"/response=={'numFound':0,'start':0,'docs':[]}" ); // empty to - assertJQ(req(p, "q","{!join from=dept_s to=noexist_s}*:*", "fl","id") + assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, "noexist_ss_dv", "*:*"), "fl","id") ,"/response=={'numFound':0,'start':0,'docs':[]}" ); - // self join... return everyone with she same title as Dave - assertJQ(req(p, "q","{!join from=title to=title}name:dave", "fl","id") - ,"/response=={'numFound':2,'start':0,'docs':[{'id':'3'},{'id':'4'}]}" - ); - - // find people that develop stuff - assertJQ(req(p, "q","{!join from=dept_id_s to=dept_s}text:develop", "fl","id") + // self join... 
return everyone in same dept(s) as Dave + assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, DEPT_FIELD, "name:dave"), "fl","id") ,"/response=={'numFound':3,'start':0,'docs':[{'id':'1'},{'id':'4'},{'id':'5'}]}" ); - // self join on multivalued text field - assertJQ(req(p, "q","{!join from=title to=title}name:dave", "fl","id") - ,"/response=={'numFound':2,'start':0,'docs':[{'id':'3'},{'id':'4'}]}" + // from single-value to multi-value + assertJQ(req(p, "q", buildJoinRequest(DEPT_ID_FIELD, DEPT_FIELD, "text:develop"), "fl","id") + ,"/response=={'numFound':3,'start':0,'docs':[{'id':'1'},{'id':'4'},{'id':'5'}]}" ); - assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s}title:MTS", "fl","id", "debugQuery","true") + // from multi-value to single-value + assertJQ(req(p, "q",buildJoinRequest(DEPT_FIELD, DEPT_ID_FIELD, "title:MTS"), "fl","id", "debugQuery","true") ,"/response=={'numFound':3,'start':0,'docs':[{'id':'10'},{'id':'12'},{'id':'13'}]}" ); - + // expected outcome for a sub query matching dave joined against departments - final String davesDepartments = - "/response=={'numFound':2,'start':0,'docs':[{'id':'10'},{'id':'13'}]}"; + final String davesDepartments = + "/response=={'numFound':2,'start':0,'docs':[{'id':'10'},{'id':'13'}]}"; // straight forward query - assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s}name:dave", - "fl","id"), - davesDepartments); + assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, DEPT_ID_FIELD, "name:dave"), "fl","id"), + davesDepartments); - // variable deref for sub-query parsing - assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s v=$qq}", - "qq","{!dismax}dave", - "qf","name", - "fl","id", - "debugQuery","true"), - davesDepartments); + // variable deref in 'from' query + assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, DEPT_ID_FIELD, "$qq"), "qq","{!dismax}dave", "qf","name", + "fl","id", "debugQuery","true"), + davesDepartments); - // variable deref for sub-query parsing w/localparams - assertJQ(req(p, "q","{!join from=dept_s 
to=dept_id_s v=$qq}", - "qq","{!dismax qf=name}dave", - "fl","id", - "debugQuery","true"), - davesDepartments); + // variable deref in 'from' query (w/ localparams) + assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, DEPT_ID_FIELD, "$qq"), "qq","{!dismax qf=name}dave", + "fl","id", "debugQuery","true"), + davesDepartments); // defType local param to control sub-query parsing - assertJQ(req(p, "q","{!join from=dept_s to=dept_id_s defType=dismax}dave", - "qf","name", - "fl","id", - "debugQuery","true"), - davesDepartments); + assertJQ(req(p, "q", buildJoinRequest(DEPT_FIELD, DEPT_ID_FIELD, "dave", "defType=dismax"), "qf","name", + "fl","id", "debugQuery","true"), + davesDepartments); // find people that develop stuff - but limit via filter query to a name of "john" // this tests filters being pushed down to queries (SOLR-3062) - assertJQ(req(p, "q","{!join from=dept_id_s to=dept_s}text:develop", "fl","id", "fq", "name:john") - ,"/response=={'numFound':1,'start':0,'docs':[{'id':'1'}]}" - ); + assertJQ(req(p, "q", buildJoinRequest(DEPT_ID_FIELD, DEPT_FIELD, "text:develop"), "fl","id", "fq", "name:john") + ,"/response=={'numFound':1,'start':0,'docs':[{'id':'1'}]}" + ); + } + /* + * Exercises behavior specific to method=topLevel join queries + */ + @Test + public void testTopLevelDVJoin() throws Exception { + indexEmployeeDocs(); + ModifiableSolrParams p = params("sort","id asc"); + + // "from" field missing docValues + expectThrows(SolrException.class, () -> { + h.query(req(p, "q", "{!join from=nodocvalues_s to=dept_ss_dv method=topLevelDV}*:*", "fl","id")); + }); + + // "to" field missing docValues + expectThrows(SolrException.class, () -> { + h.query(req(p, "q", "{!join from=dept_ss_dv to=nodocvalues_s method=topLevelDV}*:*", "fl","id")); + }); + } + + + @Test + public void testIndexJoin() throws Exception { + indexEmployeeDocs(); + + ModifiableSolrParams p = params("sort","id asc"); + + // Debugging information + assertJQ(req(p, "q", "{!join from=dept_ss_dv 
to=dept_id_indexed_sdv}title:MTS", "fl","id", "debugQuery","true") + ,"/debug/join/{!join from=dept_ss_dv to=dept_id_indexed_sdv}title:MTS=={'_MATCH_':'fromSetSize,toSetSize', 'fromSetSize':2, 'toSetSize':3}" + ); + + // non-DV/text field. + assertJQ(req(p, "q","{!join from=title to=title}name:dave", "fl","id") + ,"/response=={'numFound':2,'start':0,'docs':[{'id':'3'},{'id':'4'}]}" + ); } @@ -288,4 +317,21 @@ public class TestJoin extends SolrTestCaseJ4 { return ids; } + private static String buildJoinRequest(String fromField, String toField, String fromQuery, String... otherLocalParams) { + final String baseJoinParams = "from=" + fromField + " to=" + toField + " v=" + fromQuery; + final String optionalParamsJoined = (otherLocalParams != null && otherLocalParams.length > 0) ? String.join(" ", otherLocalParams) : " "; + final String allProvidedParams = baseJoinParams + " " + optionalParamsJoined; + + final int joinMethod = random().nextInt(4); + switch (joinMethod) { + case 0: // No explicit method specified (parser picks its default) + return "{!join " + allProvidedParams + " }"; + case 1: // method=index + return "{!join " + allProvidedParams + " method=index}"; + case 2: // method=dvWithScore (with score=none) + return "{!join " + allProvidedParams + " method=dvWithScore score=none}"; + default: // method=topLevelDV + return "{!join " + allProvidedParams + " method=topLevelDV}"; + } + } } diff --git a/solr/solr-ref-guide/src/other-parsers.adoc b/solr/solr-ref-guide/src/other-parsers.adoc index 46f2dce8faf..bb18c88112d 100644 --- a/solr/solr-ref-guide/src/other-parsers.adoc +++ b/solr/solr-ref-guide/src/other-parsers.adoc @@ -591,36 +591,95 @@ The hash range query parser uses a special cache to improve the speedup of the q == Join Query Parser -`JoinQParser` extends the `QParserPlugin`. It allows normalizing relationships between documents with a join operation. This is different from the concept of a join in a relational database because no information is being truly joined. 
An appropriate SQL analogy would be an "inner query". +The Join query parser allows users to run queries that normalize relationships between documents. +Solr runs a subquery of the user's choosing (the `v` param), identifies all the values that matching documents have in a field of interest (the `from` param), and then returns documents where those values are contained in a second field of interest (the `to` param). -Examples: - -Find all products containing the word "ipod", join them against manufacturer docs and return the list of manufacturers: +In practice, these semantics are much like "inner queries" in a SQL engine. +As an example, consider the Solr query below: [source,text] ---- -{!join from=manu_id_s to=id}ipod +/solr/techproducts/select?q={!join from=manu_id_s to=id}title:ipod ---- -Find all manufacturer docs named "belkin", join them against product docs, and filter the list to only products with a price less than $12: +This query, which returns a document for each manufacturer that makes a product with "ipod" in the title, is semantically identical to the SQL query below: [source,text] ---- -q = {!join from=id to=manu_id_s}compName_s:Belkin -fq = price:[* TO 12] +SELECT * +FROM techproducts +WHERE id IN ( + SELECT manu_id_s + FROM techproducts + WHERE title='ipod' + ) ---- -The join operation is done on a term basis, so the "from" and "to" fields must use compatible field types. For example: joining between a `StrField` and a `IntPointField` will not work, likewise joining between a `StrField` and a `TextField` that uses `LowerCaseFilterFactory` will only work for values that are already lower cased in the string field. +The join operation is done on a term basis, so the `from` and `to` fields must use compatible field types. +For example: joining between a `StrField` and a `IntPointField` will not work. 
+Likewise, joining between a `StrField` and a `TextField` that uses `LowerCaseFilterFactory` will only work for values that are already lower cased in the string field. -=== Join Parser Scoring +=== Parameters -You can optionally use the `score` parameter to return scores of the subordinate query. The values to use for this parameter define the type of aggregation, which are `avg` (average), `max` (maximum), `min` (minimum) `total`, or `none`. +This query parser takes the following parameters: -.Score parameter and single value numerics +`from`:: +The name of a field which contains values to look for in the "to" field. +Can be single or multi-valued, but must have a field type compatible with the field represented in the "to" field. +This parameter is required. + +`to`:: +The name of a field whose value(s) will be checked against those found in the "from" field. +Can be single or multi-valued, but must have a field type compatible with the "from" field. +This parameter is required. + +`fromIndex`:: +The name of the index to run the "from" query (`v` parameter) on and where "from" values are gathered. +Must be located on the same node as the core processing the request. +This parameter is optional; it defaults to the value of the processing core if not specified. +See <<Joining Across Collections>> below for more information. + +`score`:: +An optional parameter that instructs Solr to return information about the "from" query scores. +The value of this parameter controls what type of aggregation information is returned. +Options include `avg` (average), `max` (maximum), `min` (minimum), `total` (total), or `none`. ++ +If `method` is not specified but `score` is, then the `dvWithScore` method is used. +If `method` is specified and is not `dvWithScore`, then the `score` value is ignored. +See the `method` parameter documentation below for more details. + + +`method`:: +An optional parameter used to determine which of several query implementations should be used by Solr.
+Options are restricted to: `index`, `dvWithScore`, and `topLevelDV`. +If unspecified, the default value is `index`, unless the `score` parameter is present, which overrides it to `dvWithScore`. +Each implementation has its own performance characteristics, and users are encouraged to experiment to determine which implementation is most performant for their use-case. +Details and performance heuristics are given below. ++ +`index` is the default `method` unless the `score` parameter is specified. +Uses the terms index structures to process the request. +Performance scales with the cardinality and number of postings (term occurrences) in the "from" field. +Consider this method when the "from" field has low cardinality, when the "to" side returns a large number of documents, or when sporadic post-commit slowdowns cannot be tolerated (this is a disadvantage of other methods that `index` avoids). + ++ +`dvWithScore` returns an optional "score" statistic alongside result documents. +Uses docValues structures if available, but falls back to the field cache when necessary. +The first access to the field cache slows down the initial requests following a commit and takes up additional space on the JVM heap, so docValues are recommended in most situations. +Performance scales linearly with the number of values matched in the "from" field. +This method must be used if score information is required, and should also be considered when the "from" query matches few documents, regardless of the number of "to" side documents returned. ++ +.dvWithScore and single value numerics [WARNING] ==== -Specifying `score` local parameter switches the join algorithm. This might have performance implication on large indices, but it's more important that this algorithm won't work for single value numeric field starting from 7.0. Users are encouraged to change field types to string and rebuild indexes during migration. +The `dvWithScore` method doesn't support single value numeric fields.
Users migrating from versions prior to 7.0 are encouraged to change field types to string and rebuild indexes during migration. ==== ++ +`topLevelDV` can only be used when `to` and `from` fields have docValues data, and does not currently support numeric fields. +Uses top-level docValues data structures to find results. +These data structures outperform other methods as the number of values matched in the `from` field grows high. +But they are also expensive to build and need to be lazily populated after each commit, causing a sometimes-noticeable slowdown on the first query to use them after each commit. +If you commit frequently and your use-case can tolerate a static warming query, consider adding one to `solrconfig.xml` so that this work is done as a part of the commit itself and not attached directly to user requests. +Consider this method when the "from" query matches a large number of documents and the "to" result set is small to moderate in size, but only if sporadic post-commit slowness is tolerable. === Joining Across Collections