From 80b39a3ba76f97895520fa3e3d3ab431c305f19c Mon Sep 17 00:00:00 2001 From: Mark Robert Miller Date: Fri, 26 Oct 2012 05:18:51 +0000 Subject: [PATCH] SOLR-3995: Recovery may never finish on SolrCore shutdown if the last reference to a SolrCore is closed by the recovery process SOLR-3994:Create more extensive tests around unloading cores. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1402393 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 2 + .../apache/solr/cloud/ElectionContext.java | 10 +- .../org/apache/solr/cloud/LeaderElector.java | 4 + .../apache/solr/cloud/RecoveryStrategy.java | 2 +- .../org/apache/solr/core/CoreContainer.java | 4 +- .../solr/handler/admin/CoreAdminHandler.java | 7 +- .../solr/update/DefaultSolrCoreState.java | 2 +- .../solr/cloud/BasicDistributedZkTest.java | 95 ++++++++++++++----- .../client/solrj/impl/CloudSolrServer.java | 1 - .../solr/common/cloud/ClusterState.java | 1 + 10 files changed, 99 insertions(+), 29 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index db018e02a18..120edcd47a7 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -113,6 +113,8 @@ Bug Fixes * SOLR-3992: QuerySenderListener doesn't populate document cache. (Shotaro Kamio, yonik) +* SOLR-3995: Recovery may never finish on SolrCore shutdown if the last reference to + a SolrCore is closed by the recovery process. (Mark Miller) Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java index f4b2bea6735..d620a3e93ef 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java +++ b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java @@ -17,6 +17,7 @@ import org.apache.solr.core.SolrCore; import org.apache.solr.update.UpdateLog; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; +import org.apache.zookeeper.KeeperException.NoNodeException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -38,7 +39,7 @@ import org.slf4j.LoggerFactory; */ public abstract class ElectionContext { - + private static Logger log = LoggerFactory.getLogger(ElectionContext.class); final String electionPath; final ZkNodeProps leaderProps; final String id; @@ -58,7 +59,12 @@ public abstract class ElectionContext { public void close() {} public void cancelElection() throws InterruptedException, KeeperException { - zkClient.delete(leaderSeqPath, -1, true); + try { + zkClient.delete(leaderSeqPath, -1, true); + } catch (NoNodeException e) { + // fine + log.warn("cancelElection did not find election node to remove"); + } } abstract void runLeaderProcess(boolean weAreReplacement) throws KeeperException, InterruptedException, IOException; diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java index 07caa55734b..7be033fc37d 100644 --- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java +++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java @@ -84,6 +84,10 @@ public class LeaderElector { sortSeqs(seqs); List intSeqs = getSeqs(seqs); + if (intSeqs.size() == 0) { + log.warn("Our node is no longer in line to be leader"); + return; + } if (seq <= intSeqs.get(0)) { // first we delete the node advertising the old leader in case the ephem is still there try { diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java index 7e11fbd026a..02472610cc3 100644 --- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java +++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java @@ -313,7 +313,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread { } } - while (!successfulRecovery && !isInterrupted()) { // don't use interruption or it will close channels though + while (!successfulRecovery && !isInterrupted() && !isClosed()) { // don't use interruption or it will close channels though try { CloudDescriptor cloudDesc = core.getCoreDescriptor() .getCloudDescriptor(); diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index 51b1553f073..7dd315d506a 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -1072,7 +1072,9 @@ public class CoreContainer synchronized(cores) { SolrCore core = cores.remove( name ); - coreToOrigName.remove(core); + if (core != null) { + coreToOrigName.remove(core); + } return core; } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index e47ce6ef67a..a1af638873f 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -668,7 +668,12 @@ public class CoreAdminHandler extends RequestHandlerBase { }); } } finally { - if (core != null) core.close(); + if (core != null) { + if (coreContainer.getZkController() != null) { + core.getSolrCoreState().cancelRecovery(); + } + core.close(); + } } return coreContainer.isPersistent(); diff --git a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java index 58d577e51aa..0e141b58f35 100644 --- a/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java +++ b/solr/core/src/java/org/apache/solr/update/DefaultSolrCoreState.java @@ -44,7 +44,7 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover private volatile boolean recoveryRunning; private RecoveryStrategy recoveryStrat; - private boolean closed = false; + private volatile boolean closed = false; private RefCounted refCntWriter; diff --git a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java index 0f5862df18d..06abe626f35 100644 --- a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java @@ -67,7 +67,6 @@ import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.params.CollectionParams.CollectionAction; import org.apache.solr.common.params.CommonParams; -import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.UpdateParams; import org.apache.solr.common.util.NamedList; @@ -322,19 +321,19 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase { // would be better if these where all separate tests - but much, much // slower -// doOptimisticLockingAndUpdating(); -// testMultipleCollections(); -// testANewCollectionInOneInstance(); -// testSearchByCollectionName(); -// testANewCollectionInOneInstanceWithManualShardAssignement(); -// testNumberOfCommitsWithCommitAfterAdd(); -// -// testUpdateProcessorsRunOnlyOnce("distrib-dup-test-chain-explicit"); -// testUpdateProcessorsRunOnlyOnce("distrib-dup-test-chain-implicit"); -// -// testCollectionsAPI(); + doOptimisticLockingAndUpdating(); + testMultipleCollections(); + testANewCollectionInOneInstance(); + testSearchByCollectionName(); + testANewCollectionInOneInstanceWithManualShardAssignement(); + testNumberOfCommitsWithCommitAfterAdd(); + + testUpdateProcessorsRunOnlyOnce("distrib-dup-test-chain-explicit"); + testUpdateProcessorsRunOnlyOnce("distrib-dup-test-chain-implicit"); + + testCollectionsAPI(); testCoreUnloadAndLeaders(); - + testUnloadLotsOfCores(); // Thread.sleep(10000000000L); if (DEBUG) { super.printLayout(); @@ -412,8 +411,6 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase { createCmd.setDataDir(core3dataDir); server.request(createCmd); - Thread.sleep(1000); - waitForRecoveriesToFinish("unloadcollection", zkStateReader, false); // so that we start with some versions when we reload... @@ -474,8 +471,6 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase { createCmd.setDataDir(core4dataDir); server.request(createCmd); - Thread.sleep(1000); - waitForRecoveriesToFinish("unloadcollection", zkStateReader, false); // unload the leader again @@ -509,9 +504,7 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase { createCmd.setCollection("unloadcollection"); createCmd.setDataDir(core1DataDir); server.request(createCmd); - - Thread.sleep(1000); - + waitForRecoveriesToFinish("unloadcollection", zkStateReader, false); @@ -538,7 +531,65 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase { assertEquals(found3, found4); } - + + private void testUnloadLotsOfCores() throws Exception { + SolrServer client = clients.get(2); + String url3 = getBaseUrl(client); + final HttpSolrServer server = new HttpSolrServer(url3); + + ThreadPoolExecutor executor = new ThreadPoolExecutor(0, Integer.MAX_VALUE, + 5, TimeUnit.SECONDS, new SynchronousQueue(), + new DefaultSolrThreadFactory("testExecutor")); + int cnt = atLeast(6); + for (int i = 0; i < cnt; i++) { + final int freezeI = i; + executor.execute(new Runnable() { + + @Override + public void run() { + Create createCmd = new Create(); + createCmd.setCoreName("multiunload" + freezeI); + createCmd.setCollection("multiunload"); + String core3dataDir = dataDir.getAbsolutePath() + File.separator + + System.currentTimeMillis() + "unloadcollection" + "_3n" + freezeI; + createCmd.setDataDir(core3dataDir); + try { + server.request(createCmd); + } catch (SolrServerException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + }); + } + executor.shutdown(); + executor.awaitTermination(120, TimeUnit.SECONDS); + executor = new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, + TimeUnit.SECONDS, new SynchronousQueue(), + new DefaultSolrThreadFactory("testExecutor")); + for (int j = 0; j < cnt; j++) { + final int freezeJ = j; + executor.execute(new Runnable() { + @Override + public void run() { + Unload unloadCmd = new Unload(true); + unloadCmd.setCoreName("multiunload" + freezeJ); + try { + server.request(unloadCmd); + } catch (SolrServerException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }); + Thread.sleep(random().nextInt(50)); + } + executor.shutdown(); + executor.awaitTermination(120, TimeUnit.SECONDS); + } private String getBaseUrl(SolrServer client) { String url2 = ((HttpSolrServer) client).getBaseURL() @@ -794,7 +845,7 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase { } Thread.sleep(50); } - printLayout(); + fail("Could not find the new collection - " + exp.code() + " : " + collectionClient.getBaseURL()); } diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrServer.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrServer.java index 808c47c3799..5fdb870f56a 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrServer.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrServer.java @@ -211,7 +211,6 @@ public class CloudSolrServer extends SolrServer { if ((sendToLeaders && leaderUrlList == null) || (!sendToLeaders && urlList == null) || clusterState.hashCode() != this.lastClusterStateHashCode) { - System.out.println("build a new map for " + collection); // build a map of unique nodes // TODO: allow filtering by group, role, etc Map nodes = new HashMap(); diff --git a/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterState.java b/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterState.java index ba20e0c4b49..6b7293176de 100644 --- a/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterState.java +++ b/solr/solrj/src/java/org/apache/solr/common/cloud/ClusterState.java @@ -117,6 +117,7 @@ public class ClusterState implements JSONWriter.Writable { */ public Replica getShardProps(final String collection, final String coreNodeName) { Map slices = getSlices(collection); + if (slices == null) return null; for(Slice slice: slices.values()) { if(slice.getReplicasMap().get(coreNodeName)!=null) { return slice.getReplicasMap().get(coreNodeName);