SOLR-13793: Limiting number of forwards to total replicas in collection to avoid deadly forwarding loops

This commit is contained in:
Ishan Chattopadhyaya 2019-10-15 14:43:20 +05:30
parent 64fb42c714
commit b8648c60e7
3 changed files with 186 additions and 3 deletions

View File

@ -322,6 +322,10 @@ Bug Fixes
* SOLR-13665: Added missing netty dependencies to solrJ and upgraded netty to v 4.1.29.Final (Jörn Franke, janhoy) * SOLR-13665: Added missing netty dependencies to solrJ and upgraded netty to v 4.1.29.Final (Jörn Franke, janhoy)
* SOLR-13793: HttpSolrCall now maintains internal request count (_forwardedCount) for remote queries and limits them to
the number of replicas. This avoids making too many cascading calls to remote servers, which, if not restricted, can
bring down nodes containing the said collection (Kesharee Nandan Vishwakarma, Ishan Chattopadhyaya)
Other Changes Other Changes
---------------------- ----------------------

View File

@ -139,6 +139,8 @@ public class HttpSolrCall {
public static final String ORIGINAL_USER_PRINCIPAL_HEADER = "originalUserPrincipal"; public static final String ORIGINAL_USER_PRINCIPAL_HEADER = "originalUserPrincipal";
public static final String INTERNAL_REQUEST_COUNT = "_forwardedCount";
public static final Random random; public static final Random random;
static { static {
// We try to make things reproducible in the context of our tests by initializing the random instance // We try to make things reproducible in the context of our tests by initializing the random instance
@ -446,7 +448,7 @@ public class HttpSolrCall {
} }
} }
protected void extractRemotePath(String collectionName, String origCorename) throws UnsupportedEncodingException, KeeperException, InterruptedException { protected void extractRemotePath(String collectionName, String origCorename) throws UnsupportedEncodingException, KeeperException, InterruptedException, SolrException {
assert core == null; assert core == null;
coreUrl = getRemoteCoreUrl(collectionName, origCorename); coreUrl = getRemoteCoreUrl(collectionName, origCorename);
// don't proxy for internal update requests // don't proxy for internal update requests
@ -644,12 +646,19 @@ public class HttpSolrCall {
} }
} }
private String getQuerySting() {
int internalRequestCount = queryParams.getInt(INTERNAL_REQUEST_COUNT, 0);
ModifiableSolrParams updatedQueryParams = new ModifiableSolrParams(queryParams);
updatedQueryParams.set(INTERNAL_REQUEST_COUNT, internalRequestCount + 1);
return updatedQueryParams.toQueryString();
}
//TODO using Http2Client //TODO using Http2Client
private void remoteQuery(String coreUrl, HttpServletResponse resp) throws IOException { private void remoteQuery(String coreUrl, HttpServletResponse resp) throws IOException {
HttpRequestBase method; HttpRequestBase method;
HttpEntity httpEntity = null; HttpEntity httpEntity = null;
try { try {
String urlstr = coreUrl + queryParams.toQueryString(); String urlstr = coreUrl + getQuerySting();
boolean isPostOrPutRequest = "POST".equals(req.getMethod()) || "PUT".equals(req.getMethod()); boolean isPostOrPutRequest = "POST".equals(req.getMethod()) || "PUT".equals(req.getMethod());
if ("GET".equals(req.getMethod())) { if ("GET".equals(req.getMethod())) {
@ -963,13 +972,15 @@ public class HttpSolrCall {
} }
} }
protected String getRemoteCoreUrl(String collectionName, String origCorename) { protected String getRemoteCoreUrl(String collectionName, String origCorename) throws SolrException {
ClusterState clusterState = cores.getZkController().getClusterState(); ClusterState clusterState = cores.getZkController().getClusterState();
final DocCollection docCollection = clusterState.getCollectionOrNull(collectionName); final DocCollection docCollection = clusterState.getCollectionOrNull(collectionName);
Slice[] slices = (docCollection != null) ? docCollection.getActiveSlicesArr() : null; Slice[] slices = (docCollection != null) ? docCollection.getActiveSlicesArr() : null;
List<Slice> activeSlices = new ArrayList<>(); List<Slice> activeSlices = new ArrayList<>();
boolean byCoreName = false; boolean byCoreName = false;
int totalReplicas = 0;
if (slices == null) { if (slices == null) {
byCoreName = true; byCoreName = true;
activeSlices = new ArrayList<>(); activeSlices = new ArrayList<>();
@ -983,6 +994,9 @@ public class HttpSolrCall {
} }
} }
for (Slice s: activeSlices) {
totalReplicas += s.getReplicas().size();
}
if (activeSlices.isEmpty()) { if (activeSlices.isEmpty()) {
return null; return null;
} }
@ -996,7 +1010,13 @@ public class HttpSolrCall {
String coreUrl = getCoreUrl(collectionName, origCorename, clusterState, String coreUrl = getCoreUrl(collectionName, origCorename, clusterState,
activeSlices, byCoreName, true); activeSlices, byCoreName, true);
// Avoid getting into a recursive loop of requests being forwarded by
// stopping forwarding and erroring out after (totalReplicas) forwards
if (coreUrl == null) { if (coreUrl == null) {
if (queryParams.getInt(INTERNAL_REQUEST_COUNT, 0) > totalReplicas){
throw new SolrException(SolrException.ErrorCode.INVALID_STATE,
"No active replicas found for collection: " + collectionName);
}
coreUrl = getCoreUrl(collectionName, origCorename, clusterState, coreUrl = getCoreUrl(collectionName, origCorename, clusterState,
activeSlices, byCoreName, false); activeSlices, byCoreName, false);
} }

View File

@ -0,0 +1,159 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.cloud;
import java.lang.invoke.MethodHandles;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.impl.Http2SolrClient;
import org.apache.solr.client.solrj.impl.PreemptiveBasicAuthClientBuilderFactory;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.util.Utils;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class TestQueryingOnDownCollection extends SolrCloudTestCase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final String COLLECTION_NAME = "infected";
private static final String USERNAME = "solr";
private static final String PASSWORD = "solr";
static {
System.setProperty("basicauth", String.format(Locale.ROOT,"{}:{}", USERNAME, PASSWORD));
}
@BeforeClass
public static void setupCluster() throws Exception {
configureCluster(3)
.addConfig("conf", configset("cloud-minimal"))
.withSecurityJson(STD_CONF)
.configure();
}
@Test
/**
* Assert that requests to "down collection", i.e. a collection which has all replicas in down state
* (but are hosted on nodes that are live), fail fast and throw meaningful exceptions
*/
public void testQueryToDownCollectionShouldFailFast() throws Exception {
CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf", 2, 1)
.setBasicAuthCredentials(USERNAME, PASSWORD)
.process(cluster.getSolrClient());
// Add some dummy documents
UpdateRequest update = (UpdateRequest) new UpdateRequest().setBasicAuthCredentials(USERNAME, PASSWORD);
for (int i = 0; i < 100; i++) {
update.add("id", Integer.toString(i));
}
update.commit(cluster.getSolrClient(), COLLECTION_NAME);
// Bring down replicas but keep nodes up. This could've been done by some combinations of collections API operations;
// however, to make it faster, altering cluster state directly! ;-)
downAllReplicas();
// assert all replicas are in down state
List<Replica> replicas = getCollectionState(COLLECTION_NAME).getReplicas();
for (Replica replica: replicas){
assertEquals(replica.getState(), Replica.State.DOWN);
}
// assert all nodes as active
assertEquals(3, cluster.getSolrClient().getClusterStateProvider().getLiveNodes().size());
SolrClient client = cluster.getJettySolrRunner(0).newClient();
SolrRequest req = new QueryRequest(new SolrQuery("*:*").setRows(0)).setBasicAuthCredentials(USERNAME, PASSWORD);
// Without the SOLR-13793 fix, this causes requests to "down collection" to pile up (until the nodes run out
// of serviceable threads and they crash, even for other collections hosted on the nodes).
SolrException error = expectThrows(SolrException.class,
"Request should fail after trying all replica nodes once",
() -> client.request(req, COLLECTION_NAME)
);
client.close();
assertEquals(error.code(), SolrException.ErrorCode.INVALID_STATE.code);
assertTrue(error.getMessage().contains("No active replicas found for collection: " + COLLECTION_NAME));
// run same set of tests on v2 client which uses V2HttpCall
Http2SolrClient v2Client = new Http2SolrClient.Builder(cluster.getJettySolrRunner(0).getBaseUrl().toString())
.build();
PreemptiveBasicAuthClientBuilderFactory factory = new PreemptiveBasicAuthClientBuilderFactory();
factory.setup(v2Client);
error = expectThrows(SolrException.class,
"Request should fail after trying all replica nodes once",
() -> v2Client.request(req, COLLECTION_NAME)
);
v2Client.close();
assertEquals(error.code(), SolrException.ErrorCode.INVALID_STATE.code);
assertTrue(error.getMessage().contains("No active replicas found for collection: " + COLLECTION_NAME));
}
private void downAllReplicas() throws Exception {
byte[] collectionState = cluster.getZkClient().getData("/collections/" + COLLECTION_NAME + "/state.json",
null, null, true);
Map<String,Map<String,?>> infectedState = (Map<String,Map<String,?>>) Utils.fromJSON(collectionState);
Map<String, Object> shards = (Map<String, Object>) infectedState.get(COLLECTION_NAME).get("shards");
for(Map.Entry<String, Object> shard: shards.entrySet()) {
Map<String, Object> replicas = (Map<String, Object>) ((Map<String, Object>) shard.getValue() ).get("replicas");
for (Map.Entry<String, Object> replica : replicas.entrySet()) {
((Map<String, Object>) replica.getValue()).put("state", Replica.State.DOWN.toString());
}
}
cluster.getZkClient().setData("/collections/" + COLLECTION_NAME + "/state.json", Utils.toJSON(infectedState)
, true);
}
protected static final String STD_CONF = "{\n" +
" \"authentication\":{\n" +
" \"blockUnknown\": true,\n" +
" \"class\":\"solr.BasicAuthPlugin\",\n" +
" \"credentials\":{\"solr\":\"EEKn7ywYk5jY8vG9TyqlG2jvYuvh1Q7kCCor6Hqm320= 6zkmjMjkMKyJX6/f0VarEWQujju5BzxZXub6WOrEKCw=\"}\n" +
" },\n" +
" \"authorization\":{\n" +
" \"class\":\"solr.RuleBasedAuthorizationPlugin\",\n" +
" \"permissions\":[\n" +
" {\"name\":\"security-edit\", \"role\":\"admin\"},\n" +
" {\"name\":\"collection-admin-edit\", \"role\":\"admin\"},\n" +
" {\"name\":\"core-admin-edit\", \"role\":\"admin\"}\n" +
" ],\n" +
" \"user-role\":{\"solr\":\"admin\"}\n" +
" }\n" +
"}";
}