diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 945f54d7d70..ee7cd186674 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -40,7 +40,6 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Objects; import java.util.Set; -import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; @@ -524,12 +523,9 @@ public class HMaster extends HRegionServer implements MasterServices { // Some unit tests don't need a cluster, so no zookeeper at all if (!conf.getBoolean("hbase.testing.nocluster", false)) { - setInitLatch(new CountDownLatch(1)); - activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName, this); - int infoPort = putUpJettyServer(); - startActiveMasterManager(infoPort); + this.activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName, this); } else { - activeMasterManager = null; + this.activeMasterManager = null; } } catch (Throwable t) { // Make sure we log the exception. HMaster is often started via reflection and the @@ -539,10 +535,27 @@ public class HMaster extends HRegionServer implements MasterServices { } } - // Main run loop. Calls through to the regionserver run loop. + // Main run loop. Calls through to the regionserver run loop AFTER becoming active Master; will + // block in here until then. @Override public void run() { try { + if (!conf.getBoolean("hbase.testing.nocluster", false)) { + try { + int infoPort = putUpJettyServer(); + startActiveMasterManager(infoPort); + } catch (Throwable t) { + // Make sure we log the exception. + String error = "Failed to become Active Master"; + LOG.error(error, t); + // Abort should have been called already. + if (!isAborted()) { + abort(error, t); + } + } + } + // Fall in here even if we have been aborted. Need to run the shutdown services and + // the super run call will do this for us. super.run(); } finally { if (this.clusterSchemaService != null) { @@ -757,9 +770,9 @@ public class HMaster extends HRegionServer implements MasterServices { private void finishActiveMasterInitialization(MonitoredTask status) throws IOException, InterruptedException, KeeperException, CoordinatedStateException { - activeMaster = true; Thread zombieDetector = new Thread(new InitializationMonitor(this), "ActiveMasterInitializationMonitor-" + System.currentTimeMillis()); + zombieDetector.setDaemon(true); zombieDetector.start(); /* @@ -783,10 +796,9 @@ public class HMaster extends HRegionServer implements MasterServices { this.tableDescriptors.getAll(); } - // publish cluster ID + // Publish cluster ID status.setStatus("Publishing Cluster ID in ZooKeeper"); ZKClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId()); - this.initLatch.countDown(); this.serverManager = createServerManager(this); @@ -795,6 +807,10 @@ public class HMaster extends HRegionServer implements MasterServices { status.setStatus("Initializing ZK system trackers"); initializeZKBasedSystemTrackers(); + // Set Master as active now after we've setup zk with stuff like whether cluster is up or not. + // RegionServers won't come up if the cluster status is not up. + this.activeMaster = true; + // This is for backwards compatibility // See HBASE-11393 status.setStatus("Update TableCFs node in ZNode"); @@ -818,7 +834,9 @@ public class HMaster extends HRegionServer implements MasterServices { // Wake up this server to check in sleeper.skipSleepCycle(); - // Wait for region servers to report in + // Wait for region servers to report in. + // With this as part of master initialization, it precludes our being able to start a single + // server that is both Master and RegionServer. Needs more thought. TODO. String statusStr = "Wait for region servers to report in"; status.setStatus(statusStr); LOG.info(Objects.toString(status)); @@ -1985,57 +2003,43 @@ public class HMaster extends HRegionServer implements MasterServices { * this node for us since it is ephemeral. */ LOG.info("Adding backup master ZNode " + backupZNode); - if (!MasterAddressTracker.setMasterAddress(zooKeeper, backupZNode, - serverName, infoPort)) { + if (!MasterAddressTracker.setMasterAddress(zooKeeper, backupZNode, serverName, infoPort)) { LOG.warn("Failed create of " + backupZNode + " by " + serverName); } - - activeMasterManager.setInfoPort(infoPort); - // Start a thread to try to become the active master, so we won't block here - Threads.setDaemonThreadRunning(new Thread(new Runnable() { - @Override - public void run() { - int timeout = conf.getInt(HConstants.ZK_SESSION_TIMEOUT, - HConstants.DEFAULT_ZK_SESSION_TIMEOUT); - // If we're a backup master, stall until a primary to writes his address - if (conf.getBoolean(HConstants.MASTER_TYPE_BACKUP, - HConstants.DEFAULT_MASTER_TYPE_BACKUP)) { - LOG.debug("HMaster started in backup mode. " - + "Stalling until master znode is written."); - // This will only be a minute or so while the cluster starts up, - // so don't worry about setting watches on the parent znode - while (!activeMasterManager.hasActiveMaster()) { - LOG.debug("Waiting for master address ZNode to be written " - + "(Also watching cluster state node)"); - Threads.sleep(timeout); - } - } - MonitoredTask status = TaskMonitor.get().createStatus("Master startup"); - status.setDescription("Master startup"); - try { - if (activeMasterManager.blockUntilBecomingActiveMaster(timeout, status)) { - finishActiveMasterInitialization(status); - } - } catch (Throwable t) { - status.setStatus("Failed to become active: " + t.getMessage()); - LOG.error(HBaseMarkers.FATAL, "Failed to become active master", t); - // HBASE-5680: Likely hadoop23 vs hadoop 20.x/1.x incompatibility - if (t instanceof NoClassDefFoundError && - t.getMessage() - .contains("org/apache/hadoop/hdfs/protocol/HdfsConstants$SafeModeAction")) { - // improved error message for this special case - abort("HBase is having a problem with its Hadoop jars. You may need to " - + "recompile HBase against Hadoop version " - + org.apache.hadoop.util.VersionInfo.getVersion() - + " or change your hadoop jars to start properly", t); - } else { - abort("Unhandled exception. Starting shutdown.", t); - } - } finally { - status.cleanup(); - } + this.activeMasterManager.setInfoPort(infoPort); + int timeout = conf.getInt(HConstants.ZK_SESSION_TIMEOUT, HConstants.DEFAULT_ZK_SESSION_TIMEOUT); + // If we're a backup master, stall until a primary to write this address + if (conf.getBoolean(HConstants.MASTER_TYPE_BACKUP, HConstants.DEFAULT_MASTER_TYPE_BACKUP)) { + LOG.debug("HMaster started in backup mode. Stalling until master znode is written."); + // This will only be a minute or so while the cluster starts up, + // so don't worry about setting watches on the parent znode + while (!activeMasterManager.hasActiveMaster()) { + LOG.debug("Waiting for master address and cluster state znode to be written."); + Threads.sleep(timeout); } - }, getServerName().toShortString() + ".masterManager")); + } + MonitoredTask status = TaskMonitor.get().createStatus("Master startup"); + status.setDescription("Master startup"); + try { + if (activeMasterManager.blockUntilBecomingActiveMaster(timeout, status)) { + finishActiveMasterInitialization(status); + } + } catch (Throwable t) { + status.setStatus("Failed to become active: " + t.getMessage()); + LOG.error(HBaseMarkers.FATAL, "Failed to become active master", t); + // HBASE-5680: Likely hadoop23 vs hadoop 20.x/1.x incompatibility + if (t instanceof NoClassDefFoundError && t.getMessage(). + contains("org/apache/hadoop/hdfs/protocol/HdfsConstants$SafeModeAction")) { + // improved error message for this special case + abort("HBase is having a problem with its Hadoop jars. You may need to recompile " + + "HBase against Hadoop version " + org.apache.hadoop.util.VersionInfo.getVersion() + + " or change your hadoop jars to start properly", t); + } else { + abort("Unhandled exception. Starting shutdown.", t); + } + } finally { + status.cleanup(); + } } private void checkCompression(final TableDescriptor htd) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 53390bd870c..3a52a165ed1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -243,7 +243,6 @@ public class HRegionServer extends HasThread implements protected MemStoreFlusher cacheFlusher; protected HeapMemoryManager hMemManager; - protected CountDownLatch initLatch = null; /** * Cluster connection to be shared by services. @@ -696,10 +695,6 @@ public class HRegionServer extends HasThread implements return null; } - protected void setInitLatch(CountDownLatch latch) { - this.initLatch = latch; - } - /* * Returns true if configured hostname should be used */ @@ -854,8 +849,6 @@ public class HRegionServer extends HasThread implements // when ready. blockAndCheckIfStopped(this.clusterStatusTracker); - doLatch(this.initLatch); - // Retrieve clusterId // Since cluster status is now up // ID should have already been set by HMaster diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestTableStateManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestTableStateManager.java index 1f61ee7d341..81c1dfcaa8d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestTableStateManager.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestTableStateManager.java @@ -58,7 +58,7 @@ public class TestTableStateManager { @Test(timeout = 60000) public void testUpgradeFromZk() throws Exception { final TableName tableName = TableName.valueOf(name.getMethodName()); - TEST_UTIL.startMiniCluster(2, 1); + TEST_UTIL.startMiniCluster(1, 1); TEST_UTIL.shutdownMiniHBaseCluster(); ZKWatcher watcher = TEST_UTIL.getZooKeeperWatcher(); setTableStateInZK(watcher, tableName, ZooKeeperProtos.DeprecatedTableState.State.DISABLED); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestPerColumnFamilyFlush.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestPerColumnFamilyFlush.java index 2e3ff7ec08a..3b5609e53be 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestPerColumnFamilyFlush.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestPerColumnFamilyFlush.java @@ -236,7 +236,6 @@ public class TestPerColumnFamilyFlush { // CF3 shouldn't have been touched. assertEquals(cf3MemstoreSize, oldCF3MemstoreSize); assertEquals(totalMemstoreSize, cf3MemstoreSize.getDataSize()); - assertEquals(smallestSeqInRegionCurrentMemstore, smallestSeqCF3); // What happens when we hit the memstore limit, but we are not able to find // any Column Family above the threshold?