SOLR-13749: XCJF -> Cross-Collection Join

change solrUrlWhitelist -> allowSolrUrls
ref guide: update join parser section to be inline with the unit tests.
This commit is contained in:
Dan Fox 2020-06-14 17:47:55 -04:00 committed by David Smiley
parent 73a5016ea7
commit 54e6528304
12 changed files with 207 additions and 261 deletions

View File

@ -111,6 +111,9 @@ New Features
* SOLR-14404: CoreContainer level custom requesthandlers (noble)
* SOLR-13749: Join queries can now work cross-collection, even when sharded or when spanning nodes.
(Dan Fox, Kevin Watters, via Gus Heck, David Smiley)
Improvements
---------------------
* SOLR-14316: Remove unchecked type conversion warning in JavaBinCodec's readMapEntry's equals() method

View File

@ -16,27 +16,32 @@
*/
package org.apache.solr.search;
import java.lang.invoke.MethodHandles;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.join.CrossCollectionJoinQParser;
import org.apache.solr.search.join.ScoreJoinQParserPlugin;
import org.apache.solr.util.RefCounted;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class JoinQParserPlugin extends QParserPlugin {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public static final String NAME = "join";
/** Choose the internal algorithm */
private static final String METHOD = "method";
private String routerField;
private Set<String> allowSolrUrls;
private static class JoinParams {
final String fromField;
final String fromCore;
@ -56,7 +61,7 @@ public class JoinQParserPlugin extends QParserPlugin {
private enum Method {
index {
@Override
Query makeFilter(QParser qparser) throws SyntaxError {
Query makeFilter(QParser qparser, JoinQParserPlugin plugin) throws SyntaxError {
final JoinParams jParams = parseJoin(qparser);
final JoinQuery q = new JoinQuery(jParams.fromField, jParams.toField, jParams.fromCore, jParams.fromQuery);
q.fromCoreOpenTime = jParams.fromCoreOpenTime;
@ -65,21 +70,28 @@ public class JoinQParserPlugin extends QParserPlugin {
},
dvWithScore {
@Override
Query makeFilter(QParser qparser) throws SyntaxError {
Query makeFilter(QParser qparser, JoinQParserPlugin plugin) throws SyntaxError {
return new ScoreJoinQParserPlugin().createParser(qparser.qstr, qparser.localParams, qparser.params, qparser.req).parse();
}
},
topLevelDV {
@Override
Query makeFilter(QParser qparser) throws SyntaxError {
Query makeFilter(QParser qparser, JoinQParserPlugin plugin) throws SyntaxError {
final JoinParams jParams = parseJoin(qparser);
final JoinQuery q = new TopLevelJoinQuery(jParams.fromField, jParams.toField, jParams.fromCore, jParams.fromQuery);
q.fromCoreOpenTime = jParams.fromCoreOpenTime;
return q;
}
},
crossCollection {
/**
 * Builds the cross-collection join by delegating to a new {@link CrossCollectionJoinQParser}.
 * The parser receives this request's query string and params, plus the routerField and
 * allowSolrUrls values held by the given plugin instance.
 */
@Override
Query makeFilter(QParser qparser, JoinQParserPlugin plugin) throws SyntaxError {
return new CrossCollectionJoinQParser(qparser.qstr, qparser.localParams, qparser.params, qparser.req,
plugin.routerField, plugin.allowSolrUrls).parse();
}
};
abstract Query makeFilter(QParser qparser) throws SyntaxError;
abstract Query makeFilter(QParser qparser, JoinQParserPlugin plugin) throws SyntaxError;
JoinParams parseJoin(QParser qparser) throws SyntaxError {
final String fromField = qparser.getParam("from");
@ -127,8 +139,23 @@ public class JoinQParserPlugin extends QParserPlugin {
}
}
/**
 * Reads the plugin configuration used by the crossCollection join method.
 *
 * <ul>
 *   <li>{@code routerField} - stored as-is (may be null).</li>
 *   <li>{@code allowSolrUrls} - list of Solr URLs a query may name via the solrUrl parameter.</li>
 * </ul>
 *
 * When {@code allowSolrUrls} is absent the field is left null, which the cross-collection
 * parser treats as "not configured". When it is present, null and empty entries are skipped:
 * an entry such as {@code ${some.property:}} resolves to an empty string when the property is
 * unset, and must not end up as an allowed URL.
 *
 * @param args the plugin's configuration from solrconfig.xml
 */
@Override
@SuppressWarnings({"unchecked"})
public void init(@SuppressWarnings({"rawtypes"})NamedList args) {
  routerField = (String) args.get("routerField");

  final Object configuredUrls = args.get("allowSolrUrls");
  if (configuredUrls == null) {
    // Keep null (not an empty set) so callers can tell "never configured" from "configured, no match".
    allowSolrUrls = null;
    return;
  }

  final Set<String> urls = new HashSet<>();
  for (String url : (List<String>) configuredUrls) {
    if (url != null && !url.isEmpty()) {
      urls.add(url);
    }
  }
  allowSolrUrls = urls;
}
@Override
public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
final JoinQParserPlugin plugin = this;
return new QParser(qstr, localParams, params, req) {
@Override
@ -136,14 +163,14 @@ public class JoinQParserPlugin extends QParserPlugin {
if (localParams != null && localParams.get(METHOD) != null) {
// TODO Make sure 'method' is valid value here and give users a nice error
final Method explicitMethod = Method.valueOf(localParams.get(METHOD));
return explicitMethod.makeFilter(this);
return explicitMethod.makeFilter(this, plugin);
}
// Legacy join behavior before introduction of SOLR-13892
if(localParams!=null && localParams.get(ScoreJoinQParserPlugin.SCORE)!=null) {
return new ScoreJoinQParserPlugin().createParser(qstr, localParams, params, req).parse();
} else {
return Method.index.makeFilter(this);
return Method.index.makeFilter(this, plugin);
}
}
};
@ -160,4 +187,4 @@ public class JoinQParserPlugin extends QParserPlugin {
return new JoinQuery(fromField, toField, null, subQuery);
}
}
}

View File

@ -30,7 +30,6 @@ import org.apache.solr.search.join.BlockJoinParentQParserPlugin;
import org.apache.solr.search.join.FiltersQParserPlugin;
import org.apache.solr.search.join.GraphQParserPlugin;
import org.apache.solr.search.join.HashRangeQParserPlugin;
import org.apache.solr.search.join.XCJFQParserPlugin;
import org.apache.solr.search.mlt.MLTQParserPlugin;
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
@ -87,7 +86,6 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI
map.put(PayloadCheckQParserPlugin.NAME, new PayloadCheckQParserPlugin());
map.put(BoolQParserPlugin.NAME, new BoolQParserPlugin());
map.put(MinHashQParserPlugin.NAME, new MinHashQParserPlugin());
map.put(XCJFQParserPlugin.NAME, new XCJFQParserPlugin());
map.put(HashRangeQParserPlugin.NAME, new HashRangeQParserPlugin());
standardPlugins = Collections.unmodifiableMap(map);

View File

@ -31,11 +31,11 @@ import java.util.Iterator;
import java.util.Set;
@SuppressWarnings("WeakerAccess")
public class XCJFQParser extends QParser {
public class CrossCollectionJoinQParser extends QParser {
public static final String ZK_HOST = "zkHost";
public static final String SOLR_URL = "solrUrl";
public static final String COLLECTION = "collection";
public static final String FROM_INDEX = "fromIndex";
public static final String FROM = "from";
public static final String TO = "to";
public static final String ROUTED_BY_JOIN_KEY = "routed";
@ -44,16 +44,17 @@ public class XCJFQParser extends QParser {
public static final int TTL_DEFAULT = 60 * 60; // in seconds
private static final Set<String> OWN_PARAMS = new HashSet<>(Arrays.asList(
QueryParsing.TYPE, QueryParsing.V, ZK_HOST, SOLR_URL, COLLECTION, FROM, TO, ROUTED_BY_JOIN_KEY, TTL));
QueryParsing.TYPE, QueryParsing.V, ZK_HOST, SOLR_URL, FROM_INDEX, FROM, TO, ROUTED_BY_JOIN_KEY, TTL));
private final String routerField;
private final Set<String> solrUrlWhitelist;
private final Set<String> allowSolrUrls;
public XCJFQParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req, String routerField, Set<String> solrUrlWhiteList) {
public CrossCollectionJoinQParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req,
String routerField, Set<String> allowSolrUrls) {
super(qstr, localParams, params, req);
this.routerField = routerField;
// If specified in the config, this will limit which solr url's the parser can connect to.
this.solrUrlWhitelist = solrUrlWhiteList;
this.allowSolrUrls = allowSolrUrls;
}
@Override
@ -63,17 +64,18 @@ public class XCJFQParser extends QParser {
String solrUrl = localParams.get(SOLR_URL);
// Test if this is a valid solr url.
if (solrUrl != null) {
if (solrUrlWhitelist == null) {
throw new SyntaxError("White list must be configured to use solrUrl parameter.");
if (allowSolrUrls == null) {
throw new SyntaxError("allowSolrUrls list must be configured to use solrUrl parameter.");
}
if (!solrUrlWhitelist.contains(solrUrl)) {
throw new SyntaxError("Solr Url was not in the whitelist. Please check your configuration.");
if (!allowSolrUrls.contains(solrUrl)) {
throw new SyntaxError("Solr URL was not in allowSolrUrls list. Please check your configuration.");
}
}
String collection = localParams.get(COLLECTION);
String collection = localParams.get(FROM_INDEX);
String fromField = localParams.get(FROM);
String toField = localParams.get(TO);
boolean routedByJoinKey = localParams.getBool(ROUTED_BY_JOIN_KEY, toField.equals(routerField));
int ttl = localParams.getInt(TTL, TTL_DEFAULT);
@ -85,6 +87,6 @@ public class XCJFQParser extends QParser {
}
}
return new XCJFQuery(query, zkHost, solrUrl, collection, fromField, toField, routedByJoinKey, ttl, otherParams);
return new CrossCollectionJoinQuery(query, zkHost, solrUrl, collection, fromField, toField, routedByJoinKey, ttl, otherParams);
}
}

View File

@ -64,7 +64,7 @@ import org.apache.solr.search.DocSetUtil;
import org.apache.solr.search.Filter;
import org.apache.solr.search.SolrIndexSearcher;
public class XCJFQuery extends Query {
public class CrossCollectionJoinQuery extends Query {
protected final String query;
protected final String zkHost;
@ -80,8 +80,9 @@ public class XCJFQuery extends Query {
protected SolrParams otherParams;
protected String otherParamsString;
public XCJFQuery(String query, String zkHost, String solrUrl, String collection, String fromField, String toField,
boolean routedByJoinKey, int ttl, SolrParams otherParams) {
public CrossCollectionJoinQuery(String query, String zkHost, String solrUrl,
String collection, String fromField, String toField,
boolean routedByJoinKey, int ttl, SolrParams otherParams) {
this.query = query;
this.zkHost = zkHost;
@ -173,14 +174,14 @@ public class XCJFQuery extends Query {
}
}
private class XCJFQueryWeight extends ConstantScoreWeight {
private class CrossCollectionJoinQueryWeight extends ConstantScoreWeight {
private SolrIndexSearcher searcher;
private ScoreMode scoreMode;
private Filter filter;
public XCJFQueryWeight(SolrIndexSearcher searcher, ScoreMode scoreMode, float score) {
super(XCJFQuery.this, score);
public CrossCollectionJoinQueryWeight(SolrIndexSearcher searcher, ScoreMode scoreMode, float score) {
super(CrossCollectionJoinQuery.this, score);
this.scoreMode = scoreMode;
this.searcher = searcher;
}
@ -329,7 +330,7 @@ public class XCJFQuery extends Query {
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
return new XCJFQueryWeight((SolrIndexSearcher) searcher, scoreMode, boost);
return new CrossCollectionJoinQueryWeight((SolrIndexSearcher) searcher, scoreMode, boost);
}
@Override
@ -359,7 +360,7 @@ public class XCJFQuery extends Query {
equalsTo(getClass().cast(other));
}
private boolean equalsTo(XCJFQuery other) {
private boolean equalsTo(CrossCollectionJoinQuery other) {
return Objects.equals(query, other.query) &&
Objects.equals(zkHost, other.zkHost) &&
Objects.equals(solrUrl, other.solrUrl) &&

View File

@ -309,7 +309,7 @@ public class ScoreJoinQParserPlugin extends QParserPlugin {
for (Slice slice : zkController.getClusterState().getCollection(fromIndex).getActiveSlicesArr()) {
if (fromReplica != null)
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"SolrCloud join: multiple shards not yet supported " + fromIndex);
"SolrCloud join: To join with a sharded collection, use method=crossCollection.");
for (Replica replica : slice.getReplicas()) {
if (replica.getNodeName().equals(nodeName)) {
@ -327,8 +327,7 @@ public class ScoreJoinQParserPlugin extends QParserPlugin {
if (fromReplica == null)
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"SolrCloud join: No active replicas for "+fromIndex+
" found in node " + nodeName);
"SolrCloud join: To join with a collection that might not be co-located, use method=crossCollection.");
return fromReplica;
}

View File

@ -1,66 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.join;
import java.util.HashSet;
import java.util.List;
import org.apache.solr.common.StringUtils;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QParserPlugin;
/**
* Cross-collection join filter. Runs a query against a remote Solr collection to obtain a
* set of join keys, then applies that set of join keys as a filter against the local collection.
* <br>Example: {!xcjf collection="remoteCollection" from="fromField" to="toField" v="*:*"}
*/
public class XCJFQParserPlugin extends QParserPlugin {
public static final String NAME = "xcjf";
private String routerField;
private HashSet<String> solrUrlWhitelist;
@Override
public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
return new XCJFQParser(qstr, localParams, params, req, routerField, solrUrlWhitelist);
}
@Override
@SuppressWarnings({"unchecked"})
public void init(@SuppressWarnings({"rawtypes"})NamedList args) {
routerField = (String) args.get("routerField");
solrUrlWhitelist = new HashSet<>();
if (args.get("solrUrl") != null) {
for (String s : (List<String>) args.get("solrUrl")) {
if (!StringUtils.isEmpty(s))
solrUrlWhitelist.add(s);
}
} else {
solrUrlWhitelist = null;
}
}
@Override
public String getName() {
return NAME;
}
}

View File

@ -48,25 +48,15 @@
initialSize="0"
regenerator="solr.NoOpRegenerator"/>
<queryParser name="xcjf" class="org.apache.solr.search.join.XCJFQParserPlugin">
<queryParser name="join" class="org.apache.solr.search.JoinQParserPlugin">
<str name="routerField">product_id_s</str>
<arr name="solrUrl">
<str>${test.xcjf.solr.url.1:}</str>
<str>${test.xcjf.solr.url.2:}</str>
<str>${test.xcjf.solr.url.3:}</str>
</arr>
</queryParser>
<queryParser name="xcjf_nonrouted" class="org.apache.solr.search.join.XCJFQParserPlugin" />
<queryParser name="xcjf_whitelist" class="org.apache.solr.search.join.XCJFQParserPlugin">
<str name="routerField">product_id_s</str>
<arr name="solrUrl">
<str>${test.xcjf.solr.url.1:}</str>
<str>${test.xcjf.solr.url.2:}</str>
<str>${test.xcjf.solr.url.3:}</str>
<arr name="allowSolrUrls">
<str>${test.ccjoin.solr.url.1:}</str>
<str>${test.ccjoin.solr.url.2:}</str>
<str>${test.ccjoin.solr.url.3:}</str>
</arr>
</queryParser>
<queryParser name="join_nonrouted" class="org.apache.solr.search.JoinQParserPlugin" />
</config>

View File

@ -1293,12 +1293,6 @@ public class QueryEqualityTest extends SolrTestCaseJ4 {
);
}
public void testXCJFQuery() throws Exception {
assertQueryEquals("xcjf",
"{!xcjf collection=abc from=x_id to=x_id}*:*",
"{!xcjf collection=abc from=x_id to=x_id v='*:*'}");
}
public void testHashRangeQuery() throws Exception {
assertQueryEquals("hash_range",
"{!hash_range f=x_id l=107347968 u=214695935}",

View File

@ -36,7 +36,7 @@ import org.apache.solr.common.params.ModifiableSolrParams;
import org.junit.BeforeClass;
import org.junit.Test;
public class XCJFQueryTest extends SolrCloudTestCase {
public class CrossCollectionJoinQueryTest extends SolrCloudTestCase {
private static final int NUM_NODES = 3;
private static final int NUM_SHARDS = 3;
@ -48,15 +48,15 @@ public class XCJFQueryTest extends SolrCloudTestCase {
@BeforeClass
public static void setupCluster() throws Exception {
configureCluster(NUM_NODES)
.addConfig("xcjf", configset("xcjf"))
.addConfig("ccjoin", configset("ccjoin"))
.withSolrXml(TEST_PATH().resolve("solr.xml"))
.configure();
CollectionAdminRequest.createCollection("products", "xcjf", NUM_SHARDS, NUM_REPLICAS)
CollectionAdminRequest.createCollection("products", "ccjoin", NUM_SHARDS, NUM_REPLICAS)
.process(cluster.getSolrClient());
CollectionAdminRequest.createCollection("parts", "xcjf", NUM_SHARDS, NUM_REPLICAS)
CollectionAdminRequest.createCollection("parts", "ccjoin", NUM_SHARDS, NUM_REPLICAS)
.process(cluster.getSolrClient());
}
@ -127,100 +127,100 @@ public class XCJFQueryTest extends SolrCloudTestCase {
}
@Test
public void testXcjfRoutedCollection() throws Exception {
public void testCcJoinRoutedCollection() throws Exception {
setupIndexes(true);
testXcjfQuery("{!xcjf collection=products from=product_id_i to=product_id_i}size_s:M",true);
testCcJoinQuery("{!join method=crossCollection fromIndex=products from=product_id_i to=product_id_i}size_s:M", true);
int i = 0;
for (JettySolrRunner runner : cluster.getJettySolrRunners()) {
i++;
String url = runner.getBaseUrl().toString();
System.setProperty("test.xcjf.solr.url." + i, url);
System.setProperty("test.ccjoin.solr.url." + i, url);
}
try {
// now we need to re-upload our config , now that we know a valid solr url for the cluster.
CloudSolrClient client = cluster.getSolrClient();
((ZkClientClusterStateProvider) client.getClusterStateProvider()).uploadConfig(configset("xcjf"), "xcjf");
// reload the cores with the updated whitelisted solr url config.
((ZkClientClusterStateProvider) client.getClusterStateProvider()).uploadConfig(configset("ccjoin"), "ccjoin");
// reload the cores with the updated allowSolrUrls config.
CollectionAdminRequest.Reload.reloadCollection("products").process(client);
CollectionAdminRequest.Reload.reloadCollection("parts").process(client);
Thread.sleep(10000);
testXcjfQuery("{!xcjf collection=products from=product_id_i to=product_id_i}size_s:M",true);
testCcJoinQuery("{!join method=crossCollection fromIndex=products from=product_id_i to=product_id_i}size_s:M", true);
testXcjfQuery(String.format(Locale.ROOT,
"{!xcjf solrUrl=\"%s\" collection=products from=product_id_i to=product_id_i}size_s:M", getSolrUrl()),
testCcJoinQuery(String.format(Locale.ROOT,
"{!join method=crossCollection solrUrl=\"%s\" fromIndex=products from=product_id_i to=product_id_i}size_s:M", getSolrUrl()),
true);
testXcjfQuery("{!xcjf collection=products from=product_id_l to=product_id_l}size_s:M",
testCcJoinQuery("{!join method=crossCollection fromIndex=products from=product_id_l to=product_id_l}size_s:M",
true);
testXcjfQuery(String.format(Locale.ROOT,
"{!xcjf solrUrl=\"%s\" collection=products from=product_id_l to=product_id_l}size_s:M",
testCcJoinQuery(String.format(Locale.ROOT,
"{!join method=crossCollection solrUrl=\"%s\" fromIndex=products from=product_id_l to=product_id_l}size_s:M",
getSolrUrl()),
true);
testXcjfQuery("{!xcjf collection=products from=product_id_s to=product_id_s}size_s:M",
testCcJoinQuery("{!join method=crossCollection fromIndex=products from=product_id_s to=product_id_s}size_s:M",
true);
testXcjfQuery(String.format(Locale.ROOT,
"{!xcjf solrUrl=\"%s\" collection=products from=product_id_s to=product_id_s}size_s:M",
testCcJoinQuery(String.format(Locale.ROOT,
"{!join method=crossCollection solrUrl=\"%s\" fromIndex=products from=product_id_s to=product_id_s}size_s:M",
getSolrUrl()),
true);
testXcjfQuery(String.format(Locale.ROOT,
"{!xcjf zkHost=\"%s\" collection=products from=product_id_s to=product_id_s}size_s:M",
testCcJoinQuery(String.format(Locale.ROOT,
"{!join method=crossCollection zkHost=\"%s\" fromIndex=products from=product_id_s to=product_id_s}size_s:M",
cluster.getSolrClient().getZkHost()),
true);
// Test the ability to set other parameters on xcjf and have them passed through
// Test the ability to set other parameters on crossCollection join and have them passed through
assertResultCount("parts",
"{!xcjf collection=products from=product_id_s to=product_id_s fq=product_id_s:1}size_s:M",
"{!join method=crossCollection fromIndex=products from=product_id_s to=product_id_s fq=product_id_s:1}size_s:M",
2, true);
assertResultCount("parts",
String.format(Locale.ROOT,
"{!xcjf solrUrl=\"%s\" collection=products from=product_id_s to=product_id_s fq=product_id_s:1}size_s:M",
"{!join method=crossCollection solrUrl=\"%s\" fromIndex=products from=product_id_s to=product_id_s fq=product_id_s:1}size_s:M",
getSolrUrl()), 2, true);
} finally {
for (JettySolrRunner runner : cluster.getJettySolrRunners()) {
i++;
System.getProperties().remove("test.xcjf.solr.url." + i);
System.getProperties().remove("test.ccjoin.solr.url." + i);
}
}
}
@Test
public void testXcjfNonroutedCollection() throws Exception {
public void testCcJoinNonroutedCollection() throws Exception {
setupIndexes(false);
// This query will expect the collection to have been routed on product_id, so it should return
// incomplete results.
testXcjfQuery("{!xcjf collection=products from=product_id_s to=product_id_s}size_s:M",
testCcJoinQuery("{!join method=crossCollection fromIndex=products from=product_id_s to=product_id_s}size_s:M",
false);
// Now if we set routed=false we should get a complete set of results.
testXcjfQuery("{!xcjf collection=products from=product_id_s to=product_id_s routed=false}size_s:M",
testCcJoinQuery("{!join method=crossCollection fromIndex=products from=product_id_s to=product_id_s routed=false}size_s:M",
true);
// The xcjf_nonrouted query parser doesn't assume that the collection was routed on product_id,
// The join_nonrouted query parser doesn't assume that the collection was routed on product_id,
// so we should get the full set of results.
testXcjfQuery("{!xcjf_nonrouted collection=products from=product_id_s to=product_id_s}size_s:M",
testCcJoinQuery("{!join_nonrouted method=crossCollection fromIndex=products from=product_id_s to=product_id_s}size_s:M",
true);
// But if we set routed=true, we are now assuming again that the collection was routed on product_id,
// so we should get incomplete results.
testXcjfQuery("{!xcjf_nonrouted collection=products from=product_id_s to=product_id_s routed=true}size_s:M",
testCcJoinQuery("{!join_nonrouted method=crossCollection fromIndex=products from=product_id_s to=product_id_s routed=true}size_s:M",
false);
}
@Test
public void testSolrUrlWhitelist() throws Exception {
public void testAllowSolrUrlsList() throws Exception {
setupIndexes(false);
// programmatically add the current jetty solr url to the solrUrl whitelist property in the solrconfig.xml
// programmatically add the current jetty solr url to the allowSolrUrls property in the solrconfig.xml
int i = 0;
for (JettySolrRunner runner : cluster.getJettySolrRunners()) {
i++;
System.setProperty("test.xcjf.solr.url." + i, runner.getBaseUrl().toString());
System.setProperty("test.ccjoin.solr.url." + i, runner.getBaseUrl().toString());
}
try {
// now we need to re-upload our config , now that we know a valid solr url for the cluster.
CloudSolrClient client = cluster.getSolrClient();
((ZkClientClusterStateProvider) client.getClusterStateProvider()).uploadConfig(configset("xcjf"), "xcjf");
// reload the cores with the updated whitelisted solr url config.
((ZkClientClusterStateProvider) client.getClusterStateProvider()).uploadConfig(configset("ccjoin"), "ccjoin");
// reload the cores with the updated allowSolrUrls config.
CollectionAdminRequest.Reload.reloadCollection("products").process(client);
CollectionAdminRequest.Reload.reloadCollection("parts").process(client);
@ -232,33 +232,33 @@ public class XCJFQueryTest extends SolrCloudTestCase {
// we expect an exception because bogus url isn't valid.
try {
// This should throw an exception.
// verify the xcfj_whitelist definition has the current valid urls and works.
testXcjfQuery(String.format(Locale.ROOT,
"{!xcjf_whitelist solrUrl=\"%s\" collection=products from=product_id_i to=product_id_i}size_s:M",
// verify the join plugin definition has the current valid urls and works.
testCcJoinQuery(String.format(Locale.ROOT,
"{!join method=crossCollection solrUrl=\"%s\" fromIndex=products from=product_id_i to=product_id_i}size_s:M",
"http://bogus.example.com:8983/solr"),
true);
fail("The query invovling bogus.example.com should not succeed");
} catch (Exception e) {
// should get here.
String message = e.getMessage();
assertTrue("message was " + message, message.contains("SyntaxError: Solr Url was not in the whitelist"));
assertTrue("message was " + message, message.contains("SyntaxError: Solr URL was not in allowSolrUrls list"));
}
// verify the xcfj_whitelist definition has the current valid urls and works.
testXcjfQuery(String.format(Locale.ROOT,
"{!xcjf_whitelist solrUrl=\"%s\" collection=products from=product_id_i to=product_id_i}size_s:M",
// verify the join plugin definition has the current valid urls and works.
testCcJoinQuery(String.format(Locale.ROOT,
"{!join method=crossCollection solrUrl=\"%s\" fromIndex=products from=product_id_i to=product_id_i}size_s:M",
getSolrUrl()),
true);
} finally {
for (JettySolrRunner runner : cluster.getJettySolrRunners()) {
i++;
System.getProperties().remove("test.xcjf.solr.url." + i);
System.getProperties().remove("test.ccjoin.solr.url." + i);
}
}
}
public void testXcjfQuery(String query, boolean expectFullResults) throws Exception {
public void testCcJoinQuery(String query, boolean expectFullResults) throws Exception {
assertResultCount("parts", query, NUM_PRODUCTS / 2, expectFullResults);
}

View File

@ -551,9 +551,9 @@ http://localhost:8983/solr/alt_graph/query?fl=id&q={!graph+from=id+to=out_edge+m
----
== Hash Range Query Parser
The hash range query parser will return documents that have a field that contains a value that would be hashed to a particular range. This is used by the XCJF query parser. This query parser has a per segment cache for each field that this query parser will operate on.
The hash range query parser will return documents that have a field that contains a value that would be hashed to a particular range. This is used by the join query when using method=crossCollection. The hash range query parser has a per segment cache for each field that this query parser will operate on.
When specifying a min/max hash range and a field name with the hash range query parser, only documents who contain a field value that hashes into that range will be returned. If you want to query for a very large result set, you can query for various hash ranges to return a fraction of the documents with each range request. In the XCJF case, the hash_range query parser is used to ensure that each shard only gets the set of join keys that would end up on that shard.
When specifying a min/max hash range and a field name with the hash range query parser, only documents that contain a field value that hashes into that range will be returned. If you want to query for a very large result set, you can query for various hash ranges to return a fraction of the documents with each range request. In the cross collection join case, the hash_range query parser is used to ensure that each shard only gets the set of join keys that would end up on that shard.
This query parser uses the MurmurHash3_x86_32. This is the same as the default hashing for the default composite ID router in Solr.
@ -637,7 +637,7 @@ This parameter is required.
The name of the index to run the "from" query (`v` parameter) on and where "from" values are gathered.
Must be located on the same node as the core processing the request.
This parameter is optional; it defaults to the value of the processing core if not specified.
See <<Joining Across Collections,Joining Across Collections>> below for more information.
See <<Joining Across Single Shard Collections,Joining Across Single Shard Collections>> or <<Cross Collection Join,Cross Collection Join>> below for more information.
`score`::
An optional parameter that instructs Solr to return information about the "from" query scores.
@ -681,9 +681,9 @@ But they are also expensive to build and need to be lazily populated after each
If you commit frequently and your use-case can tolerate a static warming query, consider adding one to `solrconfig.xml` so that this work is done as a part of the commit itself and not attached directly to user requests.
Consider this method when the "from" query matches a large number of documents and the "to" result set is small to moderate in size, but only if sporadic post-commit slowness is tolerable.
=== Joining Across Collections
=== Joining Across Single Shard Collections
You can also specify a `fromIndex` parameter to join with a field from another core or collection. If running in SolrCloud mode, then the collection specified in the `fromIndex` parameter must have a single shard and a replica on all Solr nodes where the collection you're joining to has a replica.
You can also specify a `fromIndex` parameter to join with a field from another core or a single shard collection. If running in SolrCloud mode, then the collection specified in the `fromIndex` parameter must have a single shard and a replica on all Solr nodes where the collection you're joining to has a replica.
Let's consider an example where you want to use a Solr join query to filter movies by directors that have won an Oscar. Specifically, imagine we have two collections with the following fields:
@ -724,6 +724,92 @@ At query time, the `JoinQParser` will access the local replica of the *movie_dir
For more information, Erick Erickson has written a blog post about join performance titled https://lucidworks.com/2012/06/20/solr-and-joins/[Solr and Joins].
=== Cross Collection Join
The Cross Collection Join Filter is a method for the join parser that will execute a query against a remote Solr collection to get back a set of join keys that will be used as a filter query against the local Solr collection.
The crossCollection join query will create a CrossCollectionQuery object.
The CrossCollectionQuery will first query a remote Solr collection and get back a streaming expression result of the join keys.
As the join keys are streamed to the node, a bitset of the matching documents in the local index is built up.
This avoids keeping the full set of join keys in memory at any given time.
This bitset is then inserted into the filter cache upon successful execution as with the normal behavior of the Solr filter cache.
If the local index is sharded according to the join key field, the cross collection join can leverage a secondary query parser called the "hash_range" query parser.
The hash_range query parser is responsible for returning only the documents that hash to a given range of values.
This allows the CrossCollectionQuery to query the remote Solr collection and return only the join keys that would match a specific shard in the local Solr collection.
This has the benefit of making sure that network traffic doesn't increase as the number of shards increases and allows for much greater scalability.
The CrossCollection join query works with both String and Point types of fields.
The fields that are being used for the join key must be single-valued and have docValues enabled.
It's advised to shard the local collection by the join key as this allows for the optimization mentioned above to be utilized.
Cross collection join queries should generally not be used as part of the `q` parameter; rather, they are designed to be used as filter queries (`fq` parameter) to ensure proper caching.
The remote Solr collection that is being queried should have a single-valued field for the join key with docValues enabled.
The remote Solr collection does not have any specific sharding requirements.
==== Join Query Parser Definition in solrconfig.xml
The cross collection join has some configuration options that can be specified in `solrconfig.xml`.
`routerField`::
If the documents are routed to shards using the CompositeID router by the join field, then that field name should be specified in the configuration here. This will allow the parser to optimize the resulting HashRange query.
`allowSolrUrls`::
If specified, this array of strings lists the Solr URLs that are allowed to be passed to the `solrUrl` query parameter. Without this configuration the `solrUrl` parameter cannot be used. This restriction is necessary to prevent an attacker from using Solr to explore the network.
[source,xml]
----
<queryParser name="join" class="org.apache.solr.search.JoinQParserPlugin">
<str name="routerField">product_id_s</str>
<arr name="allowSolrUrls">
<str>http://othersolr.example.com:8983/solr</str>
</arr>
</queryParser>
----
==== Cross Collection Join Query Parameters
`fromIndex`::
The name of the external Solr collection to be queried to retrieve the set of join key values (required).
`zkHost`::
The connection string to be used to connect to ZooKeeper. `zkHost` and `solrUrl` are both optional parameters, and at most one of them should be specified. If neither `zkHost` nor `solrUrl` is specified, the local ZooKeeper cluster will be used (optional).
`solrUrl`::
The URL of the external Solr node to be queried. Must be a character-for-character exact match of one of the URLs listed in the `allowSolrUrls` configuration (optional, disabled by default for security).
`from`::
The join key field name in the external collection (required).
`to`::
The join key field name in the local collection.
`v`::
The query substituted in as a local param. This is the query string that will match documents in the remote collection.
`routed`::
If `true`, the cross collection join query will use each shard's hash range to determine the set of join keys to retrieve for that shard.
This parameter improves the performance of the cross-collection join, but it depends on the local collection being routed by the `to` field.
If this parameter is not specified, the cross collection join query will try to determine the correct value automatically.
`ttl`::
The length of time that a cross collection join query in the cache will be considered valid, in seconds.
Defaults to `3600` (one hour).
The cross collection join query will not be aware of changes to the remote collection, so if the remote collection is updated, cached cross collection queries may give inaccurate results.
After the `ttl` period has expired, the cross collection join query will re-execute the join against the remote collection.
Other Parameters::
Any normal Solr query parameter can also be specified/passed through as a local param.
==== Cross Collection Query Examples
[source,text]
----
http://localhost:8983/solr/localCollection/query?fl=id&q={!join method="crossCollection" fromIndex="otherCollection" from="fromField" to="toField" v="*:*"}
----
== Lucene Query Parser
The `LuceneQParser` extends the `QParserPlugin` by parsing Solr's variant on the Lucene QueryParser syntax. This is effectively the same query parser that is used in Lucene. It uses the operators `q.op`, the default operator ("OR" or "AND") and `df`, the default field name.
@ -1116,94 +1202,6 @@ An optional parameter used to determine which of several query implementations s
{!terms f=categoryId method=booleanQuery separator=" "}8 6 7 5309
----
== XCJF Query Parser
The Cross Collection Join Filter (XCJF) is a query parser plugin that will execute a query against a remote Solr collection to get back a set of join keys that will be used as a filter query against the local Solr collection.
The XCJF parser will create an XCJFQuery object.
The XCJFQuery will first query a remote Solr collection and get back a streaming expression result of the join keys.
As the join keys are streamed to the node, a bitset of the matching documents in the local index is built up.
This avoids keeping the full set of join keys in memory at any given time.
This bitset is then inserted into the filter cache upon successful execution as with the normal behavior of the Solr filter cache.
If the local index is sharded according to the join key field, the XCJF parser can leverage a secondary query parser called the "hash_range" query parser.
The hash_range query parser is responsible for returning only the documents that hash to a given range of values.
This allows the XCJFQuery to query the remote Solr collection and return only the join keys that would match a specific shard in the local Solr collection.
This has the benefit of making sure that network traffic doesn't increase as the number of shards increases and allows for much greater scalability.
The XCJF parser works with both String and Point types of fields.
The fields that are being used for the join key must be single-valued and have docValues enabled.
It's advised to shard the local collection by the join key as this allows for the optimization mentioned above to be utilized.
The XCJF parser should not be generally used as part of the `q` parameter, but rather it is designed to be used as a filter query (`fq` parameter) to ensure proper caching.
The remote Solr collection that is being queried should have a single-valued field for the join key with docValues enabled.
The remote Solr collection does not have any specific sharding requirements.
=== XCJF Query Parser Definition in solrconfig.xml
The XCJF has some configuration options that can be specified in `solrconfig.xml`.
`routerField`::
If the documents are routed to shards using the CompositeID router by the join field, then that field name should be specified in the configuration here. This will allow the parser to optimize the resulting HashRange query.
`solrUrl`::
If specified, this array of strings specifies the white listed Solr URLs that you can pass to the solrUrl query parameter. Without this configuration the solrUrl parameter cannot be used. This restriction is necessary to prevent an attacker from using Solr to explore the network.
[source,xml]
----
<queryParser name="xcjf" class="org.apache.solr.search.join.XCJFQParserPlugin">
<!-- Define which field has the routing information for the collection to use the hash range query parser. -->
<str name="routerField">joinfield_id_s</str>
<!-- Demo only, most users will want to remove this parameter -->
<arr name="solrUrl">
<str>http://othersolr.example.com:8983/solr</str>
</arr>
</queryParser>
----
=== XCJF Query Parameters
`collection`::
The name of the external Solr collection to be queried to retrieve the set of join key values (required).
`zkHost`::
The connection string to be used to connect to ZooKeeper. `zkHost` and `solrUrl` are both optional parameters, and at most one of them should be specified. If neither `zkHost` nor `solrUrl` are specified, the local ZooKeeper cluster will be used. (optional).
`solrUrl`::
The URL of the external Solr node to be queried. Must be a character for character exact match of a whitelisted url. (optional, disabled by default for security).
`from`::
The join key field name in the external collection (required).
`to`::
The join key field name in the local collection.
`v`::
The query substituted in as a local param. This is the query string that will match documents in the remote collection.
`routed`::
If `true`, the XCJF query will use each shard's hash range to determine the set of join keys to retrieve for that shard.
This parameter improves the performance of the cross-collection join, but it depends on the local collection being routed by the `to` field.
If this parameter is not specified, the XCJF query will try to determine the correct value automatically.
`ttl`::
The length of time that an XCJF query in the cache will be considered valid, in seconds.
Defaults to `3600` (one hour).
The XCJF query will not be aware of changes to the remote collection, so if the remote collection is updated, cached XCJF queries may give inaccurate results.
After the `ttl` period has expired, the XCJF query will re-execute the join against the remote collection.
Other Parameters::
Any normal Solr query parameter can also be specified/passed through as a local param.
=== XCJF Query Examples
[source,text]
----
http://localhost:8983/solr/localCollection/query?fl=id&q={!xcjf collection="otherCollection" from="fromField" to="toField" v="*:*"}
----
== XML Query Parser
The {solr-javadocs}/solr-core/org/apache/solr/search/XmlQParserPlugin.html[XmlQParserPlugin] extends the {solr-javadocs}/solr-core/org/apache/solr/search/QParserPlugin.html[QParserPlugin] and supports the creation of queries from XML. Example: