HBASE-16367 Race between master and region server initialization may lead to premature server abort

This commit is contained in:
tedyu 2016-08-08 04:07:30 -07:00
parent e5f9df1e23
commit 50f3c9572c
2 changed files with 15 additions and 0 deletions

View File

@ -40,6 +40,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Set; import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
@ -451,6 +452,7 @@ public class HMaster extends HRegionServer implements MasterServices {
// Some unit tests don't need a cluster, so no zookeeper at all // Some unit tests don't need a cluster, so no zookeeper at all
if (!conf.getBoolean("hbase.testing.nocluster", false)) { if (!conf.getBoolean("hbase.testing.nocluster", false)) {
setInitLatch(new CountDownLatch(1));
activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName, this); activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName, this);
int infoPort = putUpJettyServer(); int infoPort = putUpJettyServer();
startActiveMasterManager(infoPort); startActiveMasterManager(infoPort);
@ -693,6 +695,7 @@ public class HMaster extends HRegionServer implements MasterServices {
// publish cluster ID // publish cluster ID
status.setStatus("Publishing Cluster ID in ZooKeeper"); status.setStatus("Publishing Cluster ID in ZooKeeper");
ZKClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId()); ZKClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId());
this.initLatch.countDown();
this.serverManager = createServerManager(this); this.serverManager = createServerManager(this);

View File

@ -55,6 +55,8 @@ import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock;
@ -233,6 +235,7 @@ public class HRegionServer extends HasThread implements
protected MemStoreFlusher cacheFlusher; protected MemStoreFlusher cacheFlusher;
protected HeapMemoryManager hMemManager; protected HeapMemoryManager hMemManager;
/**
 * Optional start-up latch; {@code null} (the default) means there is nothing to wait for.
 * When non-null it is awaited for up to 50 seconds just before the cluster ID is retrieved,
 * and it is installed through {@link #setInitLatch(CountDownLatch)}.
 */
protected CountDownLatch initLatch = null;
/** /**
* Cluster connection to be shared by services. * Cluster connection to be shared by services.
@ -655,6 +658,10 @@ public class HRegionServer extends HasThread implements
this.fs, this.rootDir, !canUpdateTableDescriptor(), false); this.fs, this.rootDir, !canUpdateTableDescriptor(), false);
} }
/**
 * Installs the latch stored in {@link #initLatch}.
 * <p>
 * In the code visible here, HMaster passes a one-count latch before creating its
 * ActiveMasterManager and counts it down right after publishing the cluster ID in
 * ZooKeeper; the region-server side waits on the latch (bounded to 50 seconds) before
 * reading the cluster ID, and skips the wait entirely when no latch was installed.
 *
 * @param latch the latch to store; the field is simply overwritten, no validation is done
 */
protected void setInitLatch(CountDownLatch latch) {
this.initLatch = latch;
}
/* /*
* Returns true if configured hostname should be used * Returns true if configured hostname should be used
*/ */
@ -799,6 +806,8 @@ public class HRegionServer extends HasThread implements
* @throws IOException * @throws IOException
* @throws InterruptedException * @throws InterruptedException
*/ */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="RV_RETURN_VALUE_IGNORED_BAD_PRACTICE",
justification="cluster Id znode read would give us correct response")
private void initializeZooKeeper() throws IOException, InterruptedException { private void initializeZooKeeper() throws IOException, InterruptedException {
// Create the master address tracker, register with zk, and start it. Then // Create the master address tracker, register with zk, and start it. Then
// block until a master is available. No point in starting up if no master // block until a master is available. No point in starting up if no master
@ -809,6 +818,9 @@ public class HRegionServer extends HasThread implements
// when ready. // when ready.
blockAndCheckIfStopped(this.clusterStatusTracker); blockAndCheckIfStopped(this.clusterStatusTracker);
if (this.initLatch != null) {
this.initLatch.await(50, TimeUnit.SECONDS);
}
// Retrieve clusterId // Retrieve clusterId
// Since cluster status is now up // Since cluster status is now up
// ID should have already been set by HMaster // ID should have already been set by HMaster