From b8648c60e781a684a6c017fb09cac87595c2a8fa Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Tue, 15 Oct 2019 14:43:20 +0530 Subject: [PATCH] SOLR-13793: Limiting number of forwards to total replicas in collection to avoid deadly forwarding loops --- solr/CHANGES.txt | 4 + .../org/apache/solr/servlet/HttpSolrCall.java | 26 ++- .../cloud/TestQueryingOnDownCollection.java | 159 ++++++++++++++++++ 3 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 solr/core/src/test/org/apache/solr/cloud/TestQueryingOnDownCollection.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 705d03e677e..802ad3227c5 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -322,6 +322,10 @@ Bug Fixes * SOLR-13665: Added missing netty dependencies to solrJ and upgraded netty to v 4.1.29.Final (Jörn Franke, janhoy) +* SOLR-13793: HttpSolrCall now maintains internal request count (_forwardedCount) for remote queries and limits them to + the number of replicas. This avoids making too many cascading calls to remote servers, which, if not restricted, can + bring down nodes containing the said collection (Kesharee Nandan Vishwakarma, Ishan Chattopadhyaya) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java b/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java index 042c392eb36..5e56b7310ae 100644 --- a/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java +++ b/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java @@ -139,6 +139,8 @@ public class HttpSolrCall { public static final String ORIGINAL_USER_PRINCIPAL_HEADER = "originalUserPrincipal"; + public static final String INTERNAL_REQUEST_COUNT = "_forwardedCount"; + public static final Random random; static { // We try to make things reproducible in the context of our tests by initializing the random instance @@ -446,7 +448,7 @@ public class HttpSolrCall { } } - protected void extractRemotePath(String collectionName, String 
origCorename) throws UnsupportedEncodingException, KeeperException, InterruptedException { + protected void extractRemotePath(String collectionName, String origCorename) throws UnsupportedEncodingException, KeeperException, InterruptedException, SolrException { assert core == null; coreUrl = getRemoteCoreUrl(collectionName, origCorename); // don't proxy for internal update requests @@ -644,12 +646,19 @@ public class HttpSolrCall { } } + private String getQueryString() { + int internalRequestCount = queryParams.getInt(INTERNAL_REQUEST_COUNT, 0); + ModifiableSolrParams updatedQueryParams = new ModifiableSolrParams(queryParams); + updatedQueryParams.set(INTERNAL_REQUEST_COUNT, internalRequestCount + 1); + return updatedQueryParams.toQueryString(); + } + //TODO using Http2Client private void remoteQuery(String coreUrl, HttpServletResponse resp) throws IOException { HttpRequestBase method; HttpEntity httpEntity = null; try { - String urlstr = coreUrl + queryParams.toQueryString(); + String urlstr = coreUrl + getQueryString(); boolean isPostOrPutRequest = "POST".equals(req.getMethod()) || "PUT".equals(req.getMethod()); if ("GET".equals(req.getMethod())) { @@ -963,13 +972,15 @@ public class HttpSolrCall { } } - protected String getRemoteCoreUrl(String collectionName, String origCorename) { + protected String getRemoteCoreUrl(String collectionName, String origCorename) throws SolrException { ClusterState clusterState = cores.getZkController().getClusterState(); final DocCollection docCollection = clusterState.getCollectionOrNull(collectionName); Slice[] slices = (docCollection != null) ?
docCollection.getActiveSlicesArr() : null; List activeSlices = new ArrayList<>(); boolean byCoreName = false; + int totalReplicas = 0; + if (slices == null) { byCoreName = true; activeSlices = new ArrayList<>(); @@ -983,6 +994,9 @@ public class HttpSolrCall { } } + for (Slice s: activeSlices) { + totalReplicas += s.getReplicas().size(); + } if (activeSlices.isEmpty()) { return null; } @@ -996,7 +1010,13 @@ public class HttpSolrCall { String coreUrl = getCoreUrl(collectionName, origCorename, clusterState, activeSlices, byCoreName, true); + // Avoid getting into a recursive loop of requests being forwarded by + // stopping forwarding and erroring out after (totalReplicas) forwards if (coreUrl == null) { + if (queryParams.getInt(INTERNAL_REQUEST_COUNT, 0) > totalReplicas){ + throw new SolrException(SolrException.ErrorCode.INVALID_STATE, + "No active replicas found for collection: " + collectionName); + } coreUrl = getCoreUrl(collectionName, origCorename, clusterState, activeSlices, byCoreName, false); } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestQueryingOnDownCollection.java b/solr/core/src/test/org/apache/solr/cloud/TestQueryingOnDownCollection.java new file mode 100644 index 00000000000..763cdd47390 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/cloud/TestQueryingOnDownCollection.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.cloud; + +import java.lang.invoke.MethodHandles; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrRequest; +import org.apache.solr.client.solrj.impl.Http2SolrClient; +import org.apache.solr.client.solrj.impl.PreemptiveBasicAuthClientBuilderFactory; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.QueryRequest; +import org.apache.solr.client.solrj.request.UpdateRequest; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.util.Utils; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TestQueryingOnDownCollection extends SolrCloudTestCase { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final String COLLECTION_NAME = "infected"; + + private static final String USERNAME = "solr"; + private static final String PASSWORD = "solr"; + + static { + System.setProperty("basicauth", String.format(Locale.ROOT,"%s:%s", USERNAME, PASSWORD)); + } + + @BeforeClass + public static void setupCluster() throws Exception { + configureCluster(3) + .addConfig("conf", configset("cloud-minimal")) + .withSecurityJson(STD_CONF) + .configure(); + } + + @Test + /** + * Assert that requests to "down collection",
i.e. a collection which has all replicas in down state + * (but are hosted on nodes that are live), fail fast and throw meaningful exceptions + */ + public void testQueryToDownCollectionShouldFailFast() throws Exception { + + CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf", 2, 1) + .setBasicAuthCredentials(USERNAME, PASSWORD) + .process(cluster.getSolrClient()); + + // Add some dummy documents + UpdateRequest update = (UpdateRequest) new UpdateRequest().setBasicAuthCredentials(USERNAME, PASSWORD); + for (int i = 0; i < 100; i++) { + update.add("id", Integer.toString(i)); + } + update.commit(cluster.getSolrClient(), COLLECTION_NAME); + + // Bring down replicas but keep nodes up. This could've been done by some combinations of collections API operations; + // however, to make it faster, altering cluster state directly! ;-) + downAllReplicas(); + + // assert all replicas are in down state + List replicas = getCollectionState(COLLECTION_NAME).getReplicas(); + for (Replica replica: replicas){ + assertEquals(replica.getState(), Replica.State.DOWN); + } + + // assert all nodes as active + assertEquals(3, cluster.getSolrClient().getClusterStateProvider().getLiveNodes().size()); + + SolrClient client = cluster.getJettySolrRunner(0).newClient(); + + SolrRequest req = new QueryRequest(new SolrQuery("*:*").setRows(0)).setBasicAuthCredentials(USERNAME, PASSWORD); + + // Without the SOLR-13793 fix, this causes requests to "down collection" to pile up (until the nodes run out + // of serviceable threads and they crash, even for other collections hosted on the nodes). 
+ SolrException error = expectThrows(SolrException.class, + "Request should fail after trying all replica nodes once", + () -> client.request(req, COLLECTION_NAME) + ); + + client.close(); + + assertEquals(error.code(), SolrException.ErrorCode.INVALID_STATE.code); + assertTrue(error.getMessage().contains("No active replicas found for collection: " + COLLECTION_NAME)); + + // run same set of tests on v2 client which uses V2HttpCall + Http2SolrClient v2Client = new Http2SolrClient.Builder(cluster.getJettySolrRunner(0).getBaseUrl().toString()) + .build(); + PreemptiveBasicAuthClientBuilderFactory factory = new PreemptiveBasicAuthClientBuilderFactory(); + factory.setup(v2Client); + + error = expectThrows(SolrException.class, + "Request should fail after trying all replica nodes once", + () -> v2Client.request(req, COLLECTION_NAME) + ); + + v2Client.close(); + + assertEquals(error.code(), SolrException.ErrorCode.INVALID_STATE.code); + assertTrue(error.getMessage().contains("No active replicas found for collection: " + COLLECTION_NAME)); + } + + private void downAllReplicas() throws Exception { + byte[] collectionState = cluster.getZkClient().getData("/collections/" + COLLECTION_NAME + "/state.json", + null, null, true); + + Map> infectedState = (Map>) Utils.fromJSON(collectionState); + Map shards = (Map) infectedState.get(COLLECTION_NAME).get("shards"); + for(Map.Entry shard: shards.entrySet()) { + Map replicas = (Map) ((Map) shard.getValue() ).get("replicas"); + for (Map.Entry replica : replicas.entrySet()) { + ((Map) replica.getValue()).put("state", Replica.State.DOWN.toString()); + } + } + + cluster.getZkClient().setData("/collections/" + COLLECTION_NAME + "/state.json", Utils.toJSON(infectedState) + , true); + } + + protected static final String STD_CONF = "{\n" + + " \"authentication\":{\n" + + " \"blockUnknown\": true,\n" + + " \"class\":\"solr.BasicAuthPlugin\",\n" + + " \"credentials\":{\"solr\":\"EEKn7ywYk5jY8vG9TyqlG2jvYuvh1Q7kCCor6Hqm320= 
6zkmjMjkMKyJX6/f0VarEWQujju5BzxZXub6WOrEKCw=\"}\n" + + " },\n" + + " \"authorization\":{\n" + + " \"class\":\"solr.RuleBasedAuthorizationPlugin\",\n" + + " \"permissions\":[\n" + + " {\"name\":\"security-edit\", \"role\":\"admin\"},\n" + + " {\"name\":\"collection-admin-edit\", \"role\":\"admin\"},\n" + + " {\"name\":\"core-admin-edit\", \"role\":\"admin\"}\n" + + " ],\n" + + " \"user-role\":{\"solr\":\"admin\"}\n" + + " }\n" + + "}"; + + +}