SOLR-5209: Unloading or deleting the last replica of a shard no longer cascades to remove the shard from the clusterstate.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1724098 13f79535-47bb-0310-9956-ffa450edef68
Christine Poerschke 2016-01-11 17:52:48 +00:00
parent 439c29ae28
commit c5883d54b9
7 changed files with 161 additions and 82 deletions
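
Illustrative sketch (not part of this commit): assuming a running SolrCloud cluster with ZooKeeper at a placeholder address, and a collection "collection1" whose shard "shard1" just had its only replica unloaded or deleted, the post-SOLR-5209 behaviour is that both the collection and the slice remain in the cluster state, merely with an empty replica list. The check below uses SolrJ APIs of that era (CloudSolrClient's zkHost constructor); addresses and names are examples only.

import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Slice;

public class ShardSurvivesLastReplicaRemoval {
  public static void main(String[] args) throws Exception {
    // Placeholder ZooKeeper address; any SolrCloud cluster would do.
    try (CloudSolrClient client = new CloudSolrClient("localhost:9983")) {
      client.connect();
      ClusterState state = client.getZkStateReader().getClusterState();
      DocCollection coll = state.getCollection("collection1"); // still present
      Slice slice = coll.getSlice("shard1");                   // still present
      // Prints 0 once the last replica has been unloaded/deleted.
      System.out.println("replicas left in shard1: " + slice.getReplicas().size());
    }
  }
}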


@@ -210,6 +210,9 @@ Other Changes
* SOLR-8443: Change /stream handler http param from "stream" to "expr" (Joel Bernstein, Dennis Gove)
* SOLR-5209: Unloading or deleting the last replica of a shard now no longer
cascades to remove the shard from the clusterstate. (Christine Poerschke)
======================= 5.5.0 =======================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release


@@ -92,45 +92,18 @@ public class SliceMutator {
}
Map<String, Slice> newSlices = new LinkedHashMap<>();
boolean lastSlice = false;
for (Slice slice : coll.getSlices()) {
Replica replica = slice.getReplica(cnn);
if (replica != null) {
Map<String, Replica> newReplicas = slice.getReplicasCopy();
newReplicas.remove(cnn);
// TODO TODO TODO!!! if there are no replicas left for the slice, and the slice has no hash range, remove it
// if (newReplicas.size() == 0 && slice.getRange() == null) {
// if there are no replicas left for the slice remove it
if (newReplicas.size() == 0) {
slice = null;
lastSlice = true;
} else {
slice = new Slice(slice.getName(), newReplicas, slice.getProperties());
}
}
if (slice != null) {
newSlices.put(slice.getName(), slice);
slice = new Slice(slice.getName(), newReplicas, slice.getProperties());
}
newSlices.put(slice.getName(), slice);
}
if (lastSlice) {
// remove all empty pre allocated slices
for (Slice slice : coll.getSlices()) {
if (slice.getReplicas().size() == 0) {
newSlices.remove(slice.getName());
}
}
}
// if there are no slices left in the collection, remove it?
if (newSlices.size() == 0) {
return new ClusterStateMutator(zkStateReader).deleteCollection(clusterState,
new ZkNodeProps(Utils.makeMap(NAME, collection)));
} else {
return new ZkWriteCommand(collection, coll.copyWithSlices(newSlices));
}
return new ZkWriteCommand(collection, coll.copyWithSlices(newSlices));
}
public ZkWriteCommand setShardLeader(ClusterState clusterState, ZkNodeProps message) {
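
To make the new control flow above easier to follow in isolation, here is a minimal, self-contained sketch (plain Java with illustrative types, not the Solr Slice/Replica classes): the replica is removed from a copy of its slice's replica map and the slice is always put back, even when it ends up empty, so removal of the last replica no longer cascades into removal of the slice or the collection.

import java.util.LinkedHashMap;
import java.util.Map;

public class RemoveReplicaSketch {
  // Slices modelled as sliceName -> (replicaName -> replicaState); names are made up.
  static Map<String, Map<String, String>> removeReplica(
      Map<String, Map<String, String>> slices, String replicaName) {
    Map<String, Map<String, String>> newSlices = new LinkedHashMap<>();
    for (Map.Entry<String, Map<String, String>> slice : slices.entrySet()) {
      Map<String, String> newReplicas = new LinkedHashMap<>(slice.getValue());
      newReplicas.remove(replicaName);            // may leave the slice empty
      newSlices.put(slice.getKey(), newReplicas); // the slice itself is always kept
    }
    return newSlices;
  }

  public static void main(String[] args) {
    Map<String, Map<String, String>> slices = new LinkedHashMap<>();
    Map<String, String> shard1 = new LinkedHashMap<>();
    shard1.put("core_node1", "active");
    slices.put("shard1", shard1);
    // Removing the only replica leaves {shard1={}} rather than {}.
    System.out.println(removeReplica(slices, "core_node1"));
  }
}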


@@ -365,42 +365,12 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
testUpdateProcessorsRunOnlyOnce("distrib-dup-test-chain-implicit");
testStopAndStartCoresInOneInstance();
testFailedCoreCreateCleansUp();
// Thread.sleep(10000000000L);
if (DEBUG) {
super.printLayout();
}
}
private void testFailedCoreCreateCleansUp() throws Exception {
Create createCmd = new Create();
createCmd.setCoreName("core1");
createCmd.setCollection("the_core_collection");
String coredataDir = createTempDir().toFile().getAbsolutePath();
createCmd.setDataDir(coredataDir);
createCmd.setNumShards(1);
createCmd.setSchemaName("nonexistent_schema.xml");
String url = getBaseUrl(clients.get(0));
try (final HttpSolrClient client = new HttpSolrClient(url)) {
client.request(createCmd);
fail("Expected SolrCore create to fail");
} catch (Exception e) {
}
TimeOut timeout = new TimeOut(15, TimeUnit.SECONDS);
while (cloudClient.getZkStateReader().getZkClient().exists("/collections/the_core_collection", true)) {
if (timeout.hasTimedOut()) {
fail(cloudClient.getZkStateReader().getZkClient().getChildren("/collections", null, true).toString() + " Collection zk node still exists");
}
Thread.sleep(100);
}
assertFalse("Collection zk node still exists", cloudClient.getZkStateReader().getZkClient().exists("/collections/the_core_collection", true));
}
private void testShardParamVariations() throws Exception {
SolrQuery query = new SolrQuery("*:*");
Map<String,Long> shardCounts = new HashMap<>();


@@ -509,13 +509,19 @@ public class CollectionsAPIDistributedZkTest extends AbstractFullDistribZkTestBase {
cloudClient.getZkStateReader().updateClusterState();
Collection<Slice> slices = cloudClient.getZkStateReader().getClusterState().getActiveSlices("corewithnocollection3");
assertNull(slices);
int replicaCount = 0;
if (slices != null) {
for (Slice slice : slices) {
replicaCount += slice.getReplicas().size();
}
}
assertEquals("replicaCount", 0, replicaCount);
CollectionAdminRequest.List list = new CollectionAdminRequest.List();
CollectionAdminResponse res = new CollectionAdminResponse();
res.setResponse(makeRequest(getBaseUrl((HttpSolrClient) clients.get(1)), list));
List<String> collections = (List<String>) res.getResponse().get("collections");
assertFalse(collections.contains("corewithnocollection3"));
assertTrue(collections.contains("corewithnocollection3"));
}
private void testNodesUsedByCreate() throws Exception {


@@ -24,6 +24,7 @@ import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.ImplicitDocRouter;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.MapSolrParams;
@@ -84,11 +85,12 @@ public class DeleteLastCustomShardedReplicaTest extends AbstractFullDistribZkTestBase {
.getClusterState().getCollection(collectionName);
Replica replica = testcoll.getSlice("a").getReplicas().iterator().next();
removeAndWaitForLastReplicaGone(client, collectionName, replica, "a");
removeAndWaitForReplicaGone(client, collectionName, replica, "a", replicationFactor-1);
}
}
protected void removeAndWaitForLastReplicaGone(CloudSolrClient client, String COLL_NAME, Replica replica, String shard)
protected void removeAndWaitForReplicaGone(CloudSolrClient client, String COLL_NAME, Replica replica, String shard,
final int expectedNumReplicasRemaining)
throws SolrServerException, IOException, InterruptedException {
Map m = makeMap("collection", COLL_NAME, "action", DELETEREPLICA.toLower(), "shard",
shard, "replica", replica.getName());
@@ -102,9 +104,11 @@ public class DeleteLastCustomShardedReplicaTest extends AbstractFullDistribZkTestBase {
while (! timeout.hasTimedOut()) {
testcoll = getCommonCloudSolrClient().getZkStateReader()
.getClusterState().getCollection(COLL_NAME);
// In case of a custom sharded collection, the last replica deletion would also lead to
// As of SOLR-5209 the last replica deletion no longer leads to
// the deletion of the slice.
success = testcoll.getSlice(shard) == null;
final Slice slice = testcoll.getSlice(shard);
final int actualNumReplicasRemaining = (slice == null ? 0 : slice.getReplicas().size());
success = (actualNumReplicasRemaining == expectedNumReplicasRemaining);
if (success) {
log.info("replica cleaned up {}/{} core {}",
shard + "/" + replica.getName(), replica.getStr("core"));


@@ -680,8 +680,10 @@ public class OverseerTest extends SolrTestCaseJ4 {
mockController.publishState(collection, core, core_node, null, numShards);
while (version == getClusterStateVersion(zkClient));
Thread.sleep(500);
assertFalse(collection+" should be gone after publishing the null state",
reader.getClusterState().hasCollection(collection));
assertTrue(collection+" should remain after removal of the last core", // as of SOLR-5209 core removal does not cascade to remove the slice and collection
reader.getClusterState().getCollections().contains(collection));
assertTrue(core_node+" should be gone after publishing the null state",
null == reader.getClusterState().getCollection(collection).getReplica(core_node));
} finally {
close(mockController);
close(overseerClient);
@@ -1312,4 +1314,134 @@
return zkClient;
}
@Test
public void testRemovalOfLastReplica() throws Exception {
final Integer numReplicas = 1+random().nextInt(4); // between 1 and 4 replicas
final Integer numShards = 1+random().nextInt(4); // between 1 and 4 shards
final String zkDir = createTempDir("zkData").toFile().getAbsolutePath();
final ZkTestServer server = new ZkTestServer(zkDir);
SolrZkClient zkClient = null;
ZkStateReader zkStateReader = null;
SolrZkClient overseerClient = null;
try {
server.run();
AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
zkClient = new SolrZkClient(server.getZkAddress(), TIMEOUT);
ZkController.createClusterZkNodes(zkClient);
zkStateReader = new ZkStateReader(zkClient);
zkStateReader.createClusterStateWatchersAndUpdate();
overseerClient = electNewOverseer(server.getZkAddress());
DistributedQueue q = Overseer.getInQueue(zkClient);
// create collection
{
final Integer maxShardsPerNode = numReplicas * numShards;
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.CREATE.toLower(),
"name", collection,
ZkStateReader.NUM_SHARDS_PROP, numShards.toString(),
ZkStateReader.REPLICATION_FACTOR, "1",
ZkStateReader.MAX_SHARDS_PER_NODE, maxShardsPerNode.toString()
);
q.offer(Utils.toJSON(m));
}
waitForCollections(zkStateReader, collection);
// create nodes with state recovering
for (int rr = 1; rr <= numReplicas; ++rr) {
for (int ss = 1; ss <= numShards; ++ss) {
final int N = (numReplicas-rr)*numShards + ss;
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.STATE.toLower(),
ZkStateReader.BASE_URL_PROP, "http://127.0.0.1/solr",
ZkStateReader.SHARD_ID_PROP, "shard"+ss,
ZkStateReader.NODE_NAME_PROP, "node"+N,
ZkStateReader.COLLECTION_PROP, collection,
ZkStateReader.CORE_NAME_PROP, "core"+N,
ZkStateReader.ROLES_PROP, "",
ZkStateReader.STATE_PROP, Replica.State.RECOVERING.toString());
q.offer(Utils.toJSON(m));
}
}
// verify recovering
for (int rr = 1; rr <= numReplicas; ++rr) {
for (int ss = 1; ss <= numShards; ++ss) {
final int N = (numReplicas-rr)*numShards + ss;
verifyReplicaStatus(zkStateReader, collection, "shard"+ss, "core_node"+N, Replica.State.RECOVERING);
}
}
// publish node states (active)
for (int rr = 1; rr <= numReplicas; ++rr) {
for (int ss = 1; ss <= numShards; ++ss) {
final int N = (numReplicas-rr)*numShards + ss;
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.STATE.toLower(),
ZkStateReader.BASE_URL_PROP, "http://127.0.0.1/solr",
ZkStateReader.NODE_NAME_PROP, "node"+N,
ZkStateReader.COLLECTION_PROP, collection,
ZkStateReader.CORE_NAME_PROP, "core"+N,
ZkStateReader.ROLES_PROP, "",
ZkStateReader.STATE_PROP, Replica.State.ACTIVE.toString());
q.offer(Utils.toJSON(m));
}
}
// verify active
for (int rr = 1; rr <= numReplicas; ++rr) {
for (int ss = 1; ss <= numShards; ++ss) {
final int N = (numReplicas-rr)*numShards + ss;
verifyReplicaStatus(zkStateReader, collection, "shard"+ss, "core_node"+N, Replica.State.ACTIVE);
}
}
// delete node
for (int rr = 1; rr <= numReplicas; ++rr) {
for (int ss = 1; ss <= numShards; ++ss) {
final int N = (numReplicas-rr)*numShards + ss;
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.DELETECORE.toLower(),
ZkStateReader.COLLECTION_PROP, collection,
ZkStateReader.CORE_NODE_NAME_PROP, "core_node"+N);
q.offer(Utils.toJSON(m));
{
int iterationsLeft = 100;
while (iterationsLeft-- > 0) {
final Slice slice = zkStateReader.getClusterState().getSlice(collection, "shard"+ss);
if (null == slice || null == slice.getReplicasMap().get("core_node"+N)) {
break;
}
if (VERBOSE) log.info("still seeing {} shard{} core_node{}, rechecking in 50ms ({} iterations left)", collection, ss, N, iterationsLeft);
Thread.sleep(50);
}
}
final DocCollection docCollection = zkStateReader.getClusterState().getCollection(collection);
assertTrue("found no "+collection, (null != docCollection));
final Slice slice = docCollection.getSlice("shard"+ss);
assertTrue("found no "+collection+" shard"+ss+" slice after removal of replica "+rr+" of "+numReplicas, (null != slice));
final Collection<Replica> replicas = slice.getReplicas();
assertEquals("wrong number of "+collection+" shard"+ss+" replicas left, replicas="+replicas, numReplicas-rr, replicas.size());
}
}
} finally {
close(overseerClient);
close(zkStateReader);
close(zkClient);
server.shutdown();
}
}
}
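
In the new testRemovalOfLastReplica above, cores are numbered N = (numReplicas - rr) * numShards + ss, so each deletion pass removes the highest-numbered remaining core of every shard, and the assertion afterwards expects numReplicas - rr replicas left per slice. A tiny standalone snippet (arbitrary example values, not part of the commit) makes the mapping concrete:

public class ReplicaNumberingDemo {
  public static void main(String[] args) {
    final int numReplicas = 2, numShards = 3; // example values only
    for (int rr = 1; rr <= numReplicas; ++rr) {
      for (int ss = 1; ss <= numShards; ++ss) {
        final int n = (numReplicas - rr) * numShards + ss;
        // rr=1 maps shard1..shard3 to core4..core6; rr=2 maps them to core1..core3
        System.out.println("pass " + rr + ": shard" + ss + " -> core" + n);
      }
    }
  }
}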


@@ -149,28 +149,19 @@ public class UnloadDistributedZkTest extends BasicDistributedZkTest {
unloadCmd.setCoreName(unloadCmdCoreName1);
adminClient.request(unloadCmd);
// there should be only one shard
checkCoreNamePresenceAndSliceCount(collection, unloadCmdCoreName1, false /* shouldBePresent */, numShards-1 /* expectedSliceCount */);
// there should still be two shards (as of SOLR-5209)
checkCoreNamePresenceAndSliceCount(collection, unloadCmdCoreName1, false /* shouldBePresent */, numShards /* expectedSliceCount */);
// now unload one of the other
unloadCmd = new Unload(false);
unloadCmd.setCoreName(unloadCmdCoreName2);
adminClient.request(unloadCmd);
checkCoreNamePresenceAndSliceCount(collection, unloadCmdCoreName2, false /* shouldBePresent */, numShards-2 /* expectedSliceCount */);
checkCoreNamePresenceAndSliceCount(collection, unloadCmdCoreName2, false /* shouldBePresent */, numShards /* expectedSliceCount */);
}
//printLayout();
// the collection should be gone
final TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
while (getCommonCloudSolrClient().getZkStateReader().getClusterState().hasCollection(collection)) {
if (timeout.hasTimedOut()) {
printLayout();
fail("Still found collection");
}
Thread.sleep(50);
}
// the collection should still be present (as of SOLR-5209 replica removal does not cascade to remove the slice and collection)
assertTrue("No longer found collection "+collection, getCommonCloudSolrClient().getZkStateReader().getClusterState().hasCollection(collection));
}
/**