diff --git a/CHANGES.txt b/CHANGES.txt index 7508cfe041a..7e29d004c91 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -87,6 +87,7 @@ Release 0.91.0 - Unreleased (Prakash Khemani) HBASE-3819 TestSplitLogWorker has too many SLWs running -- makes for contention and occasional failures + HBASE-3210 HBASE-1921 for the new master IMPROVEMENTS HBASE-3290 Max Compaction Size (Nicolas Spiegelberg via Stack) diff --git a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 212860426b7..eed62ee1f6b 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -222,6 +222,10 @@ public class AssignmentManager extends ZooKeeperListener { // Process list of dead servers processDeadServers(deadServers); // Check existing regions in transition + processRegionsInTransition(); + } + + public void processRegionsInTransition() throws KeeperException, IOException { List nodes = ZKUtil.listChildrenAndWatchForNewChildren(watcher, watcher.assignmentZNode); if (nodes.isEmpty()) { diff --git a/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index e06af959fc0..250a8cf2357 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -276,10 +276,8 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server { * now wait until it dies to try and become the next active master. If we * do not succeed on our first attempt, this is no longer a cluster startup. 
*/ - this.activeMasterManager = new ActiveMasterManager(zooKeeper, address, this); - this.zooKeeper.registerListener(activeMasterManager); - stallIfBackupMaster(this.conf, this.activeMasterManager); - this.activeMasterManager.blockUntilBecomingActiveMaster(); + becomeActiveMaster(); + // We are either the active master or we were asked to shutdown if (!this.stopped) { finishInitialization(); @@ -308,6 +306,52 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server { LOG.info("HMaster main thread exiting"); } + /** + * Try becoming active master. + * @return True if we could successfully become the active master. + * @throws InterruptedException + */ + private boolean becomeActiveMaster() throws InterruptedException { + this.activeMasterManager = new ActiveMasterManager(zooKeeper, address, + this); + this.zooKeeper.registerListener(activeMasterManager); + stallIfBackupMaster(this.conf, this.activeMasterManager); + return this.activeMasterManager.blockUntilBecomingActiveMaster(); + } + + /** + * Initialize all ZK based system trackers. + * @throws IOException + * @throws InterruptedException + */ + private void initializeZKBasedSystemTrackers() throws IOException, + InterruptedException, KeeperException { + this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection, + this, conf.getInt("hbase.master.catalog.timeout", Integer.MAX_VALUE)); + this.catalogTracker.start(); + + this.assignmentManager = new AssignmentManager(this, serverManager, + this.catalogTracker, this.executorService); + this.balancer = new LoadBalancer(conf); + zooKeeper.registerListenerFirst(assignmentManager); + + this.regionServerTracker = new RegionServerTracker(zooKeeper, this, + this.serverManager); + this.regionServerTracker.start(); + + // Set the cluster as up. If new RSs, they'll be waiting on this before + // going ahead with their startup. 
+ this.clusterStatusTracker = new ClusterStatusTracker(getZooKeeper(), this); + this.clusterStatusTracker.start(); + boolean wasUp = this.clusterStatusTracker.isClusterUp(); + if (!wasUp) this.clusterStatusTracker.setClusterUp(); + + LOG.info("Server active/primary master; " + this.address + + ", sessionid=0x" + + Long.toHexString(this.zooKeeper.getZooKeeper().getSessionId()) + + ", cluster-up flag was=" + wasUp); + } + private void loop() { // Check if we should stop every second. Sleeper sleeper = new Sleeper(1000, this); @@ -357,30 +401,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server { this.serverManager = new ServerManager(this, this, metrics); - this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection, - this, conf.getInt("hbase.master.catalog.timeout", Integer.MAX_VALUE)); - this.catalogTracker.start(); - - this.assignmentManager = new AssignmentManager(this, serverManager, - this.catalogTracker, this.executorService); - this.balancer = new LoadBalancer(conf); - zooKeeper.registerListenerFirst(assignmentManager); - - this.regionServerTracker = new RegionServerTracker(zooKeeper, this, - this.serverManager); - this.regionServerTracker.start(); - - // Set the cluster as up. If new RSs, they'll be waiting on this before - // going ahead with their startup. 
- this.clusterStatusTracker = new ClusterStatusTracker(getZooKeeper(), this); - this.clusterStatusTracker.start(); - boolean wasUp = this.clusterStatusTracker.isClusterUp(); - if (!wasUp) this.clusterStatusTracker.setClusterUp(); - - LOG.info("Server active/primary master; " + this.address + - ", sessionid=0x" + - Long.toHexString(this.zooKeeper.getZooKeeper().getSessionId()) + - ", cluster-up flag was=" + wasUp); + initializeZKBasedSystemTrackers(); // initialize master side coprocessors before we start handling requests this.cpHost = new MasterCoprocessorHost(this, this.conf); @@ -1089,10 +1110,67 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server { @Override public void abort(final String msg, final Throwable t) { - if (t != null) LOG.fatal(msg, t); - else LOG.fatal(msg); - this.abort = true; - stop("Aborting"); + if (abortNow(msg, t)) { + if (t != null) LOG.fatal(msg, t); + else LOG.fatal(msg); + this.abort = true; + stop("Aborting"); + } + } + + /** + * We do the following. + * 1. Create a new ZK session. (since our current one is expired) + * 2. Try to become a primary master again + * 3. Initialize all ZK based system trackers. + * 4. Assign root and meta. (they are already assigned, but we need to update our + * internal memory state to reflect it) + * 5. Process any RIT if any during the process of our recovery. + * + * @return True if we could successfully recover from ZK session expiry. + * @throws InterruptedException + * @throws IOException + */ + private boolean tryRecoveringExpiredZKSession() throws InterruptedException, + IOException, KeeperException { + this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":" + + address.getPort(), this); + + if (!becomeActiveMaster()) { + return false; + } + initializeZKBasedSystemTrackers(); + // Update in-memory structures to reflect our earlier Root/Meta assignment. 
+ assignRootAndMeta(); + // process RIT if any + this.assignmentManager.processRegionsInTransition(); + return true; + } + + /** + * Check to see if the current trigger for abort is due to ZooKeeper session + * expiry, and if so, whether we can recover from ZK session expiry. + * + * @param msg Original abort message + * @param t The cause for current abort request + * @return true if we should proceed with abort operation, false otherwise. + */ + private boolean abortNow(final String msg, final Throwable t) { + if (!this.isActiveMaster) { + return true; + } + if (t != null && t instanceof KeeperException.SessionExpiredException) { + try { + LOG.info("Primary Master trying to recover from ZooKeeper session " + + "expiry."); + return !tryRecoveringExpiredZKSession(); + } catch (Throwable newT) { + LOG.error("Primary master encountered unexpected exception while " + + "trying to recover from ZooKeeper session" + + " expiry. Proceeding with server abort.", newT); + } + } + return true; } @Override diff --git a/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java b/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java index 7b19356741a..ffa1710083d 100644 --- a/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java +++ b/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java @@ -131,8 +131,8 @@ public class TestZooKeeper { testSanity(); } - //@Test - public void disabledTestMasterSessionExpired() throws Exception { + @Test + public void testMasterSessionExpired() throws Exception { LOG.info("Starting testMasterSessionExpired"); TEST_UTIL.expireMasterSession(); testSanity();