SOLR-7338: A reloaded core will never register itself as active after a ZK session expiration, also fixes SOLR-6583

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1671554 13f79535-47bb-0310-9956-ffa450edef68
Timothy Potter 2015-04-06 15:42:12 +00:00
parent 69fdcfe306
commit e2bc6dc8ec
3 changed files with 198 additions and 8 deletions

solr/CHANGES.txt

@@ -397,6 +397,9 @@ Bug Fixes
 * SOLR-7334: Admin UI does not show "Num Docs" and "Deleted Docs". (Erick Erickson, Timothy Potter)
+
+* SOLR-7338, SOLR-6583: A reloaded core will never register itself as active after a ZK session expiration
+  (Mark Miller, Timothy Potter)
 
 Optimizations
 ----------------------

solr/core/src/java/org/apache/solr/cloud/ZkController.java

@@ -883,7 +883,9 @@ public final class ZkController {
       // leader election perhaps?
       UpdateLog ulog = core.getUpdateHandler().getUpdateLog();
-      if (!core.isReloaded() && ulog != null) {
+      // we will call register again after zk expiration and on reload
+      if (!afterExpiration && !core.isReloaded() && ulog != null) {
         // disable recovery in case shard is in construction state (for shard splits)
         Slice slice = getClusterState().getSlice(collection, shardId);
         if (slice.getState() != Slice.State.CONSTRUCTION || !isLeader) {
@@ -898,11 +900,11 @@
             log.info("No LogReplay needed for core=" + core.getName() + " baseURL=" + baseUrl);
           }
         }
-        boolean didRecovery = checkRecovery(coreName, desc, recoverReloadedCores, isLeader, cloudDesc,
-            collection, coreZkNodeName, shardId, leaderProps, core, cc);
-        if (!didRecovery) {
-          publish(desc, Replica.State.ACTIVE);
-        }
-      }
+      }
+      boolean didRecovery = checkRecovery(coreName, desc, recoverReloadedCores, isLeader, cloudDesc,
+          collection, coreZkNodeName, shardId, leaderProps, core, cc, afterExpiration);
+      if (!didRecovery) {
+        publish(desc, Replica.State.ACTIVE);
+      }
     }
@@ -1051,7 +1053,7 @@ public final class ZkController {
       boolean recoverReloadedCores, final boolean isLeader,
       final CloudDescriptor cloudDesc, final String collection,
       final String shardZkNodeName, String shardId, ZkNodeProps leaderProps,
-      SolrCore core, CoreContainer cc) {
+      SolrCore core, CoreContainer cc, boolean afterExpiration) {
     if (SKIP_AUTO_RECOVERY) {
       log.warn("Skipping recovery according to sys prop solrcloud.skip.autorecovery");
       return false;
@@ -1059,7 +1061,7 @@
     boolean doRecovery = true;
     if (!isLeader) {
-      if (core.isReloaded() && !recoverReloadedCores) {
+      if (!afterExpiration && core.isReloaded() && !recoverReloadedCores) {
         doRecovery = false;
       }
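
Note on the ZkController change above: in the old code, the checkRecovery(...) call and publish(desc, Replica.State.ACTIVE) sat inside the if (!core.isReloaded() && ulog != null) block, so a core re-registering after a reload or a ZK session expiration never reached the publish and stayed non-active. The fix hoists that pair out of the block and threads an afterExpiration flag through to checkRecovery. The following is a minimal, self-contained sketch of that control-flow difference, assuming simplified boolean inputs; the class, method, and parameter names are hypothetical stand-ins, not Solr APIs.

// Simplified model of the register() branch logic touched by SOLR-7338.
// Hypothetical names; none of this is Solr API.
public class RegisterFlowSketch {

  /** Old flow: the recovery check and publish(ACTIVE) were only reachable
   *  when the transaction-log-replay guard passed. */
  static boolean oldFlowPublishesActive(boolean reloaded, boolean hasUpdateLog, boolean didRecovery) {
    if (!reloaded && hasUpdateLog) {
      // ... possible transaction-log replay ...
      if (!didRecovery) {
        return true; // stands in for publish(desc, Replica.State.ACTIVE)
      }
    }
    return false; // reloaded cores fell through here and never published ACTIVE
  }

  /** New flow: tlog replay stays guarded (and is also skipped after a ZK session
   *  expiration), but the recovery check and publish now run unconditionally. */
  static boolean newFlowPublishesActive(boolean reloaded, boolean hasUpdateLog,
                                        boolean afterExpiration, boolean didRecovery) {
    if (!afterExpiration && !reloaded && hasUpdateLog) {
      // ... possible transaction-log replay ...
    }
    if (!didRecovery) {
      return true; // stands in for publish(desc, Replica.State.ACTIVE)
    }
    return false;
  }

  public static void main(String[] args) {
    // A reloaded core re-registering after a ZK session expiration, with no recovery needed:
    System.out.println("old flow publishes ACTIVE: " + oldFlowPublishesActive(true, true, false));       // false
    System.out.println("new flow publishes ACTIVE: " + newFlowPublishesActive(true, true, true, false)); // true
  }
}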

solr/core/src/test/org/apache/solr/cloud/CollectionReloadTest.java

@@ -0,0 +1,185 @@
package org.apache.solr.cloud;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.concurrent.TimeUnit;

import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.CoreAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.client.solrj.response.CoreAdminResponse;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkCoreNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Verifies cluster state remains consistent after collection reload.
 */
@Slow
@SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776")
public class CollectionReloadTest extends AbstractFullDistribZkTestBase {

  protected static final transient Logger log = LoggerFactory.getLogger(CollectionReloadTest.class);

  public CollectionReloadTest() {
    super();
    sliceCount = 1;
  }

  @Override
  public void distribSetUp() throws Exception {
    super.distribSetUp();
    System.setProperty("numShards", Integer.toString(sliceCount));
  }

  @Test
  public void testReloadedLeaderStateAfterZkSessionLoss() throws Exception {
    waitForThingsToLevelOut(30000);

    log.info("testReloadedLeaderStateAfterZkSessionLoss initialized OK ... running test logic");

    String testCollectionName = "c8n_1x1";
    String shardId = "shard1";
    createCollectionRetry(testCollectionName, 1, 1, 1);
    cloudClient.setDefaultCollection(testCollectionName);

    Replica leader = null;
    String replicaState = null;
    int timeoutSecs = 30;
    long timeout = System.nanoTime() + TimeUnit.NANOSECONDS.convert(timeoutSecs, TimeUnit.SECONDS);
    while (System.nanoTime() < timeout) {
      Replica tmp = null;
      try {
        tmp = cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, shardId);
      } catch (Exception exc) {}
      if (tmp != null && "active".equals(tmp.getStr(ZkStateReader.STATE_PROP))) {
        leader = tmp;
        replicaState = "active";
        break;
      }
      Thread.sleep(1000);
    }
    assertNotNull("Could not find active leader for " + shardId + " of " +
        testCollectionName + " after "+timeoutSecs+" secs; clusterState: " +
        printClusterStateInfo(testCollectionName), leader);

    // reload collection and wait to see the core report it has been reloaded
    boolean wasReloaded = reloadCollection(leader, testCollectionName);
    assertTrue("Collection '"+testCollectionName+"' failed to reload within a reasonable amount of time!",
        wasReloaded);

    // cause session loss
    chaosMonkey.expireSession(getJettyOnPort(getReplicaPort(leader)));

    // TODO: have to wait a while for the node to get marked down after ZK session loss
    // but tests shouldn't be so timing dependent!
    Thread.sleep(15000);

    // wait up to 15 seconds to see the replica in the active state
    timeoutSecs = 15;
    timeout = System.nanoTime() + TimeUnit.NANOSECONDS.convert(timeoutSecs, TimeUnit.SECONDS);
    while (System.nanoTime() < timeout) {
      // state of leader should be active after session loss recovery - see SOLR-7338
      cloudClient.getZkStateReader().updateClusterState(true);
      ClusterState cs = cloudClient.getZkStateReader().getClusterState();
      Slice slice = cs.getSlice(testCollectionName, shardId);
      replicaState = slice.getReplica(leader.getName()).getStr(ZkStateReader.STATE_PROP);
      if ("active".equals(replicaState))
        break;

      Thread.sleep(1000);
    }
    assertEquals("Leader state should be active after recovering from ZK session loss, but after " +
        timeoutSecs + " seconds, it is " + replicaState, "active", replicaState);

    // try to clean up
    try {
      CollectionAdminRequest.Delete req = new CollectionAdminRequest.Delete();
      req.setCollectionName(testCollectionName);
      req.process(cloudClient);
    } catch (Exception e) {
      // don't fail the test
      log.warn("Could not delete collection {} after test completed", testCollectionName);
    }

    log.info("testReloadedLeaderStateAfterZkSessionLoss succeeded ... shutting down now!");
  }

  protected boolean reloadCollection(Replica replica, String testCollectionName) throws Exception {
    ZkCoreNodeProps coreProps = new ZkCoreNodeProps(replica);
    String coreName = coreProps.getCoreName();
    boolean reloadedOk = false;
    try (HttpSolrClient client = new HttpSolrClient(coreProps.getBaseUrl())) {
      CoreAdminResponse statusResp = CoreAdminRequest.getStatus(coreName, client);
      long leaderCoreStartTime = statusResp.getStartTime(coreName).getTime();

      Thread.sleep(1000);

      // send reload command for the collection
      log.info("Sending RELOAD command for "+testCollectionName);
      ModifiableSolrParams params = new ModifiableSolrParams();
      params.set("action", CollectionParams.CollectionAction.RELOAD.toString());
      params.set("name", testCollectionName);
      QueryRequest request = new QueryRequest(params);
      request.setPath("/admin/collections");
      client.request(request);
      Thread.sleep(2000); // reload can take a short while

      // verify reload is done, waiting up to 30 seconds for slow test environments
      long timeout = System.nanoTime() + TimeUnit.NANOSECONDS.convert(30, TimeUnit.SECONDS);
      while (System.nanoTime() < timeout) {
        statusResp = CoreAdminRequest.getStatus(coreName, client);
        long startTimeAfterReload = statusResp.getStartTime(coreName).getTime();
        if (startTimeAfterReload > leaderCoreStartTime) {
          reloadedOk = true;
          break;
        }
        // else ... still waiting to see the reloaded core report a later start time
        Thread.sleep(1000);
      }
    }
    return reloadedOk;
  }

  private void createCollectionRetry(String testCollectionName, int numShards, int replicationFactor, int maxShardsPerNode)
      throws SolrServerException, IOException {
    CollectionAdminResponse resp = createCollection(testCollectionName, numShards, replicationFactor, maxShardsPerNode);
    if (resp.getResponse().get("failure") != null) {
      CollectionAdminRequest.Delete req = new CollectionAdminRequest.Delete();
      req.setCollectionName(testCollectionName);
      req.process(cloudClient);

      resp = createCollection(testCollectionName, numShards, replicationFactor, maxShardsPerNode);
      if (resp.getResponse().get("failure") != null)
        fail("Could not create " + testCollectionName);
    }
  }
}