SOLR-9527: Improve distribution of replicas when restoring a collection

This commit is contained in:
Varun Thacker 2017-05-10 13:34:04 +01:00
parent 4cc20d27cd
commit b1efd37ba7
8 changed files with 149 additions and 8 deletions

View File

@ -335,6 +335,10 @@ Bug Fixes
* SOLR-10630: HttpSolrCall.getAuthCtx().new AuthorizationContext() {...}.getParams()
sometimes throws java.lang.NullPointerException (hu xiaodong via shalin)
* SOLR-9527: Improve distribution of replicas when restoring a collection
(Hrishikesh Gadre, Stephen Lewis, Rohit, Varun Thacker)
Other Changes
----------------------

View File

@ -58,6 +58,7 @@ import org.apache.solr.common.cloud.ZkConfigManager;
import org.apache.solr.common.cloud.ZkCoreNodeProps;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CollectionAdminParams;
import org.apache.solr.common.params.CollectionParams.CollectionAction;
import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.params.CoreAdminParams.CoreAdminAction;
@ -104,9 +105,9 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
public static final String NUM_SLICES = "numShards";
static final boolean CREATE_NODE_SET_SHUFFLE_DEFAULT = true;
public static final String CREATE_NODE_SET_SHUFFLE = "createNodeSet.shuffle";
public static final String CREATE_NODE_SET_SHUFFLE = CollectionAdminParams.CREATE_NODE_SET_SHUFFLE_PARAM;
public static final String CREATE_NODE_SET_EMPTY = "EMPTY";
public static final String CREATE_NODE_SET = "createNodeSet";
public static final String CREATE_NODE_SET = CollectionAdminParams.CREATE_NODE_SET_PARAM;
public static final String ROUTER = "router";

View File

@ -20,16 +20,22 @@ package org.apache.solr.cloud;
import java.lang.invoke.MethodHandles;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Properties;
import java.util.Set;
import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.cloud.rule.ReplicaAssigner;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.ImplicitDocRouter;
@ -53,6 +59,7 @@ import static org.apache.solr.cloud.OverseerCollectionMessageHandler.COLL_PROPS;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.CREATE_NODE_SET;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.CREATE_NODE_SET_EMPTY;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.NUM_SLICES;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.RANDOM;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.SHARDS_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
@ -60,7 +67,8 @@ import static org.apache.solr.common.params.CollectionParams.CollectionAction.CR
import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATESHARD;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
import static org.apache.solr.common.params.CommonParams.NAME;
import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR;
import static org.apache.solr.common.cloud.ZkStateReader.MAX_SHARDS_PER_NODE;
public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@ -74,6 +82,7 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
@Override
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
// TODO maybe we can inherit createCollection's options/code
String restoreCollectionName = message.getStr(COLLECTION_PROP);
String backupName = message.getStr(NAME); // of backup
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
@ -93,6 +102,22 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
String backupCollection = properties.getProperty(BackupManager.COLLECTION_NAME_PROP);
DocCollection backupCollectionState = backupMgr.readCollectionState(location, backupName, backupCollection);
// Get the Solr nodes to restore a collection.
final List<String> nodeList = OverseerCollectionMessageHandler.getLiveOrLiveAndCreateNodeSetList(
zkStateReader.getClusterState().getLiveNodes(), message, RANDOM);
int numShards = backupCollectionState.getActiveSlices().size();
int repFactor = message.getInt(REPLICATION_FACTOR, backupCollectionState.getReplicationFactor());
int maxShardsPerNode = message.getInt(MAX_SHARDS_PER_NODE, backupCollectionState.getMaxShardsPerNode());
int availableNodeCount = nodeList.size();
if ((numShards * repFactor) > (availableNodeCount * maxShardsPerNode)) {
throw new SolrException(ErrorCode.BAD_REQUEST,
String.format(Locale.ROOT, "Solr cloud with available number of nodes:%d is insufficient for"
+ " restoring a collection with %d shards, replication factor %d and maxShardsPerNode %d."
+ " Consider increasing maxShardsPerNode value OR number of available nodes.",
availableNodeCount, numShards, repFactor, maxShardsPerNode));
}
//Upload the configs
String configName = (String) properties.get(COLL_CONF);
String restoreConfigName = message.getStr(COLL_CONF, configName);
@ -168,9 +193,16 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap)));
}
// TODO how do we leverage the CREATE_NODE_SET / RULE / SNITCH logic in createCollection?
// TODO how do we leverage the RULE / SNITCH logic in createCollection?
ClusterState clusterState = zkStateReader.getClusterState();
List<String> sliceNames = new ArrayList<>();
restoreCollection.getSlices().forEach(x -> sliceNames.add(x.getName()));
Map<ReplicaAssigner.Position, String> positionVsNodes = ocmh.identifyNodes(clusterState, nodeList,
message, sliceNames, repFactor);
//Create one replica per shard and copy backed up data to it
for (Slice slice : restoreCollection.getSlices()) {
log.debug("Adding replica for shard={} collection={} ", slice.getName(), restoreCollection);
@ -178,6 +210,19 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
propMap.put(Overseer.QUEUE_OPERATION, CREATESHARD);
propMap.put(COLLECTION_PROP, restoreCollectionName);
propMap.put(SHARD_ID_PROP, slice.getName());
// Get the first node matching the shard to restore in
String node;
for (Map.Entry<ReplicaAssigner.Position, String> pvn : positionVsNodes.entrySet()) {
ReplicaAssigner.Position position = pvn.getKey();
if (position.shard == slice.getName()) {
node = pvn.getValue();
propMap.put(CoreAdminParams.NODE, node);
positionVsNodes.remove(position);
break;
}
}
// add async param
if (asyncId != null) {
propMap.put(ASYNC, asyncId);
@ -227,6 +272,19 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
HashMap<String, Object> propMap = new HashMap<>();
propMap.put(COLLECTION_PROP, restoreCollectionName);
propMap.put(SHARD_ID_PROP, slice.getName());
// Get the first node matching the shard to restore in
String node;
for (Map.Entry<ReplicaAssigner.Position, String> pvn : positionVsNodes.entrySet()) {
ReplicaAssigner.Position position = pvn.getKey();
if (position.shard == slice.getName()) {
node = pvn.getValue();
propMap.put(CoreAdminParams.NODE, node);
positionVsNodes.remove(position);
break;
}
}
// add async param
if (asyncId != null) {
propMap.put(ASYNC, asyncId);

View File

@ -101,6 +101,7 @@ import static org.apache.solr.cloud.Overseer.QUEUE_OPERATION;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.COLL_CONF;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.COLL_PROP_PREFIX;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.CREATE_NODE_SET;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.CREATE_NODE_SET_EMPTY;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.CREATE_NODE_SET_SHUFFLE;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.NUM_SLICES;
import static org.apache.solr.cloud.OverseerCollectionMessageHandler.ONLY_ACTIVE_NODES;
@ -814,10 +815,19 @@ public class CollectionsHandler extends RequestHandlerBase implements Permission
throw new SolrException(ErrorCode.SERVER_ERROR, "Failed to check the existance of " + uri + ". Is it valid?", ex);
}
String createNodeArg = req.getParams().get(CREATE_NODE_SET);
if (CREATE_NODE_SET_EMPTY.equals(createNodeArg)) {
throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,
"Cannot restore with a CREATE_NODE_SET of CREATE_NODE_SET_EMPTY."
);
}
Map<String, Object> params = req.getParams().getAll(null, NAME, COLLECTION_PROP);
params.put(CoreAdminParams.BACKUP_LOCATION, location);
// from CREATE_OP:
req.getParams().getAll(params, COLL_CONF, REPLICATION_FACTOR, MAX_SHARDS_PER_NODE, STATE_FORMAT, AUTO_ADD_REPLICAS);
req.getParams().getAll(params, COLL_CONF, REPLICATION_FACTOR, MAX_SHARDS_PER_NODE, STATE_FORMAT,
AUTO_ADD_REPLICAS, CREATE_NODE_SET, CREATE_NODE_SET_SHUFFLE);
copyPropertiesWithPrefix(req.getParams(), params, COLL_PROP_PREFIX);
return params;
}),

View File

@ -19,6 +19,8 @@ package org.apache.solr.cloud;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
@ -28,6 +30,7 @@ import java.util.TreeMap;
import org.apache.lucene.util.TestUtil;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.CollectionAdminRequest.ClusterProp;
@ -236,9 +239,24 @@ public abstract class AbstractCloudBackupRestoreTestCase extends SolrCloudTestCa
// may need to increase maxShardsPerNode (e.g. if it was shard split, then now we need more)
restore.setMaxShardsPerNode(origShardToDocCount.size());
}
if (rarely()) { // Try with createNodeSet configuration
int nodeSetSize = cluster.getJettySolrRunners().size() / 2;
List<String> nodeStrs = new ArrayList<>(nodeSetSize);
Iterator<JettySolrRunner> iter = cluster.getJettySolrRunners().iterator();
for (int i = 0; i < nodeSetSize ; i++) {
nodeStrs.add(iter.next().getNodeName());
}
restore.setCreateNodeSet(String.join(",", nodeStrs));
restore.setCreateNodeSetShuffle(usually());
// we need to double maxShardsPerNode value since we reduced number of available nodes by half.
restore.setMaxShardsPerNode(origShardToDocCount.size() * 2);
}
Properties props = new Properties();
props.setProperty("customKey", "customVal");
restore.setProperties(props);
if (sameConfig==false) {
restore.setConfigName("customConfigName");
}
@ -268,6 +286,15 @@ public abstract class AbstractCloudBackupRestoreTestCase extends SolrCloudTestCa
assertEquals(sameConfig ? "conf1" : "customConfigName",
cluster.getSolrClient().getZkStateReader().readConfigName(restoreCollectionName));
Map<String, Integer> numReplicasByNodeName = new HashMap<>();
restoreCollection.getReplicas().forEach(x -> {
numReplicasByNodeName.put(x.getNodeName(), numReplicasByNodeName.getOrDefault(x.getNodeName(), 0) + 1);
});
numReplicasByNodeName.forEach((k, v) -> {
assertTrue("Node " + k + " has " + v + " replicas. Expected num replicas : " + restoreCollection.getMaxShardsPerNode() ,
v <= restoreCollection.getMaxShardsPerNode());
});
// assert added core properties:
// DWS: did via manual inspection.
// TODO Find the applicable core.properties on the file system but how?

View File

@ -178,6 +178,11 @@ public class TestSolrCloudSnapshots extends SolrCloudTestCase {
{
CollectionAdminRequest.Restore restore = CollectionAdminRequest.restoreCollection(restoreCollectionName, backupName)
.setLocation(backupLocation);
if (replicaFailures) {
// In this case one of the Solr servers would be down. Hence we need to increase
// max_shards_per_node property for restore command to succeed.
restore.setMaxShardsPerNode(2);
}
if (random().nextBoolean()) {
assertEquals(0, restore.process(solrClient).getStatus());
} else {

View File

@ -46,6 +46,8 @@ import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.NamedList;
import static org.apache.solr.common.params.CollectionAdminParams.COUNT_PROP;
import static org.apache.solr.common.params.CollectionAdminParams.CREATE_NODE_SET_PARAM;
import static org.apache.solr.common.params.CollectionAdminParams.CREATE_NODE_SET_SHUFFLE_PARAM;
/**
* This class is experimental and subject to change.
@ -490,7 +492,7 @@ public abstract class CollectionAdminRequest<T extends CollectionAdminResponse>
if (configName != null)
params.set("collection.configName", configName);
if (createNodeSet != null)
params.set("createNodeSet", createNodeSet);
params.set(CREATE_NODE_SET_PARAM, createNodeSet);
if (numShards != null) {
params.set( ZkStateReader.NUM_SHARDS_PROP, numShards);
}
@ -852,6 +854,8 @@ public abstract class CollectionAdminRequest<T extends CollectionAdminResponse>
protected Integer maxShardsPerNode;
protected Integer replicationFactor;
protected Boolean autoAddReplicas;
protected Optional<String> createNodeSet = Optional.empty();
protected Optional<Boolean> createNodeSetShuffle = Optional.empty();
protected Properties properties;
public Restore(String collection, String backupName) {
@ -889,6 +893,22 @@ public abstract class CollectionAdminRequest<T extends CollectionAdminResponse>
return this;
}
public void setCreateNodeSet(String createNodeSet) {
this.createNodeSet = Optional.of(createNodeSet);
}
public Optional<String> getCreateNodeSet() {
return createNodeSet;
}
public Optional<Boolean> getCreateNodeSetShuffle() {
return createNodeSetShuffle;
}
public void setCreateNodeSetShuffle(boolean createNodeSetShuffle) {
this.createNodeSetShuffle = Optional.of(createNodeSetShuffle);
}
// Collection creation params in common:
public Restore setConfigName(String config) { this.configName = config; return this; }
public String getConfigName() { return configName; }
@ -907,7 +927,7 @@ public abstract class CollectionAdminRequest<T extends CollectionAdminResponse>
}
public Restore setProperties(Properties properties) { this.properties = properties; return this;}
// TODO support createNodeSet, rule, snitch
// TODO support rule, snitch
@Override
public SolrParams getParams() {
@ -931,6 +951,12 @@ public abstract class CollectionAdminRequest<T extends CollectionAdminResponse>
if (repositoryName.isPresent()) {
params.set(CoreAdminParams.BACKUP_REPOSITORY, repositoryName.get());
}
if (createNodeSet.isPresent()) {
params.set(CREATE_NODE_SET_PARAM, createNodeSet.get());
}
if (createNodeSetShuffle.isPresent()) {
params.set(CREATE_NODE_SET_SHUFFLE_PARAM, createNodeSetShuffle.get());
}
return params;
}
@ -1111,7 +1137,7 @@ public abstract class CollectionAdminRequest<T extends CollectionAdminResponse>
public SolrParams getParams() {
ModifiableSolrParams params = (ModifiableSolrParams) super.getParams();
if (nodeSet != null) {
params.set("createNodeSet", nodeSet);
params.set(CREATE_NODE_SET_PARAM, nodeSet);
}
if (properties != null) {
addProperties(params, properties);

View File

@ -28,6 +28,16 @@ public interface CollectionAdminParams {
String COUNT_PROP = "count";
/**
* A parameter to specify list of Solr nodes to be used (e.g. for collection creation or restore operation).
*/
public static final String CREATE_NODE_SET_PARAM = "createNodeSet";
/**
* A parameter which specifies if the provided list of Solr nodes (via {@linkplain #CREATE_NODE_SET_PARAM})
* should be shuffled before being used.
*/
public static final String CREATE_NODE_SET_SHUFFLE_PARAM = "createNodeSet.shuffle";
/**
* A parameter to specify the name of the index backup strategy to be used.