From 23532b3538c2161f9594442ba5001dd1414d4ed7 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Tue, 22 Nov 2011 01:04:40 +0000 Subject: [PATCH] HBASE-4798 Sleeps and synchronisation improvements for tests git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1204769 13f79535-47bb-0310-9956-ffa450edef68 --- .../hbase/master/AssignmentManager.java | 8 +- .../apache/hadoop/hbase/master/HMaster.java | 8 +- .../hbase/regionserver/HRegionServer.java | 39 +++++++-- .../hadoop/hbase/util/JVMClusterUtil.java | 86 ++++++++++--------- .../org/apache/hadoop/hbase/util/Sleeper.java | 2 +- .../hbase/zookeeper/ZooKeeperNodeTracker.java | 8 +- .../hadoop/hbase/HBaseTestingUtility.java | 4 +- .../apache/hadoop/hbase/MiniHBaseCluster.java | 3 +- .../hadoop/hbase/TestRegionRebalancing.java | 2 +- .../catalog/TestCatalogTrackerOnCluster.java | 11 ++- .../hbase/master/TestRestartCluster.java | 3 + .../hbase/regionserver/wal/TestHLog.java | 2 +- .../hbase/util/TestFSTableDescriptors.java | 2 +- 13 files changed, 111 insertions(+), 67 deletions(-) diff --git a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 328f07c7c58..9251905ffec 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -409,7 +409,8 @@ public class AssignmentManager extends ZooKeeperListener { synchronized(this.regionsInTransition) { while (!this.master.isStopped() && this.regionsInTransition.containsKey(hri.getEncodedName())) { - this.regionsInTransition.wait(); + // We expect a notify, but by security we set a timout + this.regionsInTransition.wait(100); } } return intransistion; @@ -1824,7 +1825,10 @@ public class AssignmentManager extends ZooKeeperListener { throws InterruptedException { synchronized(regions) { while(!regions.containsKey(regionInfo)) { - regions.wait(); + // We should receive a notification, but it's + // better to have a timeout to recheck the condition here: + // it lowers the impact of a race condition if any + regions.wait(100); } } } diff --git a/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index ef904b810ad..20d00cb4543 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -420,11 +420,11 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server { ", cluster-up flag was=" + wasUp); } + // Check if we should stop every second. + private Sleeper stopSleeper = new Sleeper(1000, this); private void loop() { - // Check if we should stop every second. - Sleeper sleeper = new Sleeper(1000, this); while (!this.stopped) { - sleeper.sleep(); + stopSleeper.sleep(); } } @@ -1504,6 +1504,8 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server { public void stop(final String why) { LOG.info(why); this.stopped = true; + // We wake up the stopSleeper to stop immediately + stopSleeper.skipSleepCycle(); // If we are a backup master, we need to interrupt wait if (this.activeMasterManager != null) { synchronized (this.activeMasterManager.clusterHasActiveMaster) { diff --git a/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 8ee6782a5c2..6f37b84bc63 100644 --- a/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -967,6 +967,8 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, stop("Failed initialization"); throw convertThrowableToIOE(cleanup(e, "Failed init"), "Region server startup failed"); + } finally { + sleeper.skipSleepCycle(); } } @@ -1552,9 +1554,14 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, public void stop(final String msg) { this.stopped = true; LOG.info("STOPPED: " + msg); - synchronized (this) { - // Wakes run() if it is sleeping - notifyAll(); // FindBugs NN_NAKED_NOTIFY + // Wakes run() if it is sleeping + //sleeper.skipSleepCycle(); + //will be uncommented later, see discussion in jira 4798 + } + + public void waitForServerOnline(){ + while (!isOnline() && !isStopped()){ + sleeper.sleep(); } } @@ -1722,10 +1729,17 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, */ private ServerName getMaster() { ServerName masterServerName = null; + long previousLogTime = 0; while ((masterServerName = this.masterAddressManager.getMasterAddress()) == null) { if (!keepLooping()) return null; - LOG.debug("No master found; retry"); - sleeper.sleep(); + if (System.currentTimeMillis() > (previousLogTime+1000)){ + LOG.debug("No master found; retry"); + previousLogTime = System.currentTimeMillis(); + } + try { + Thread.sleep(100); + } catch (InterruptedException ignored) { + } } InetSocketAddress isa = new InetSocketAddress(masterServerName.getHostname(), masterServerName.getPort()); @@ -1744,11 +1758,20 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, e = e instanceof RemoteException ? ((RemoteException)e).unwrapRemoteException() : e; if (e instanceof ServerNotRunningYetException) { - LOG.info("Master isn't available yet, retrying"); + if (System.currentTimeMillis() > (previousLogTime+1000)){ + LOG.info("Master isn't available yet, retrying"); + previousLogTime = System.currentTimeMillis(); + } } else { - LOG.warn("Unable to connect to master. Retrying. Error was:", e); + if (System.currentTimeMillis() > (previousLogTime + 1000)) { + LOG.warn("Unable to connect to master. Retrying. Error was:", e); + previousLogTime = System.currentTimeMillis(); + } + } + try { + Thread.sleep(200); + } catch (InterruptedException ignored) { } - sleeper.sleep(); } } LOG.info("Connected to master at " + isa); diff --git a/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java b/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java index c7d4c6befc8..6056f73fee4 100644 --- a/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java +++ b/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java @@ -62,14 +62,7 @@ public class JVMClusterUtil { // the HRS#run method. HRS#init can fail for whatever region. In those // cases, we'll jump out of the run without setting online flag. Check // stopRequested so we don't wait here a flag that will never be flipped. - while (!this.regionServer.isOnline() && - !this.regionServer.isStopped()) { - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - // continue waiting - } - } + regionServer.waitForServerOnline(); } } @@ -118,22 +111,6 @@ public class JVMClusterUtil { public HMaster getMaster() { return this.master; } - - /** - * Block until the master has come online, indicating it is ready - * to be used. - */ - public void waitForServerOnline() { - // The server is marked online after init begins but before race to become - // the active master. - while (!this.master.isMasterRunning() && !this.master.isStopped()) { - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - // continue waiting - } - } - } } /** @@ -165,20 +142,49 @@ public class JVMClusterUtil { return new JVMClusterUtil.MasterThread(server, index); } + private static JVMClusterUtil.MasterThread findActiveMaster( + List masters) { + for (JVMClusterUtil.MasterThread t : masters) { + if (t.master.isActiveMaster()) { + return t; + } + } + + return null; + } + /** - * Start the cluster. Waits until there is a primary master and returns its - * address. + * Start the cluster. Waits until there is a primary master initialized + * and returns its address. * @param masters * @param regionservers * @return Address to use contacting primary master. */ public static String startup(final List masters, final List regionservers) throws IOException { - if (masters != null) { - for (JVMClusterUtil.MasterThread t : masters) { - t.start(); + + if (masters == null || masters.isEmpty()) { + return null; + } + + for (JVMClusterUtil.MasterThread t : masters) { + t.start(); + } + + // Wait for an active master + // having an active master before starting the region threads allows + // then to succeed on their connection to master + long startTime = System.currentTimeMillis(); + while (findActiveMaster(masters) == null) { + try { + Thread.sleep(100); + } catch (InterruptedException ignored) { + } + if (System.currentTimeMillis() > startTime + 30000) { + throw new RuntimeException("Master not active after 30 seconds"); } } + if (regionservers != null) { for (JVMClusterUtil.RegionServerThread t: regionservers) { HRegionServer hrs = t.getRegionServer(); @@ -187,19 +193,21 @@ public class JVMClusterUtil { t.start(); } } - if (masters == null || masters.isEmpty()) { - return null; - } - // Wait for an active master + + // Wait for an active master to be initialized (implies being master) + // with this, when we return the cluster is complete + startTime = System.currentTimeMillis(); while (true) { - for (JVMClusterUtil.MasterThread t : masters) { - if (t.master.isActiveMaster()) { - return t.master.getServerName().toString(); - } + JVMClusterUtil.MasterThread t = findActiveMaster(masters); + if (t != null && t.master.isInitialized()) { + return t.master.getServerName().toString(); + } + if (System.currentTimeMillis() > startTime + 200000) { + throw new RuntimeException("Master not initialized after 200 seconds"); } try { - Thread.sleep(1000); - } catch(InterruptedException e) { + Thread.sleep(100); + } catch (InterruptedException ignored) { // Keep waiting } } diff --git a/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java b/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java index 011dcbee686..f153f7f34d4 100644 --- a/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java +++ b/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java @@ -62,7 +62,7 @@ public class Sleeper { public void skipSleepCycle() { synchronized (sleepLock) { triggerWake = true; - sleepLock.notify(); + sleepLock.notifyAll(); } } diff --git a/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java b/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java index 19dcffe5b73..c6e607ef133 100644 --- a/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java +++ b/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java @@ -127,11 +127,9 @@ public abstract class ZooKeeperNodeTracker extends ZooKeeperListener { } } while (!this.stopped && (notimeout || remaining > 0) && this.data == null) { - if (notimeout) { - wait(); - continue; - } - wait(remaining); + // We expect a notification; but we wait with a + // a timeout to lower the impact of a race condition if any + wait(100); remaining = timeout - (System.currentTimeMillis() - startTime); } return this.data; diff --git a/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java b/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java index 38f312e4cab..04d1c551944 100644 --- a/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java +++ b/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java @@ -1245,7 +1245,7 @@ public class HBaseTestingUtility { boolean checkStatus) throws Exception { Configuration c = new Configuration(this.conf); String quorumServers = ZKConfig.getZKQuorumServersString(c); - int sessionTimeout = 5 * 1000; // 5 seconds + int sessionTimeout = 500; ZooKeeper zk = nodeZK.getRecoverableZooKeeper().getZooKeeper(); byte[] password = zk.getSessionPasswd(); long sessionID = zk.getSessionId(); @@ -1253,7 +1253,7 @@ public class HBaseTestingUtility { ZooKeeper newZK = new ZooKeeper(quorumServers, sessionTimeout, EmptyWatcher.instance, sessionID, password); newZK.close(); - final long sleep = sessionTimeout * 5L; + final long sleep = 7000; // 7s seems enough to manage the timeout LOG.info("ZK Closed Session 0x" + Long.toHexString(sessionID) + "; sleeping=" + sleep); diff --git a/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java index 56b181869a5..b36d12ca283 100644 --- a/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java +++ b/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java @@ -293,7 +293,6 @@ public class MiniHBaseCluster { try { t = hbaseCluster.addMaster(c, hbaseCluster.getMasters().size(), user); t.start(); - t.waitForServerOnline(); } catch (InterruptedException ie) { throw new IOException("Interrupted adding master to cluster", ie); } @@ -382,7 +381,7 @@ public class MiniHBaseCluster { return true; } } - Thread.sleep(200); + Thread.sleep(100); } return false; } diff --git a/src/test/java/org/apache/hadoop/hbase/TestRegionRebalancing.java b/src/test/java/org/apache/hadoop/hbase/TestRegionRebalancing.java index 317336f08b1..7f7d20de33d 100644 --- a/src/test/java/org/apache/hadoop/hbase/TestRegionRebalancing.java +++ b/src/test/java/org/apache/hadoop/hbase/TestRegionRebalancing.java @@ -212,7 +212,7 @@ public class TestRegionRebalancing { // while (!cluster.getMaster().allRegionsAssigned()) { LOG.debug("Waiting for there to be 22 regions, but there are " + getRegionCount() + " right now."); try { - Thread.sleep(1000); + Thread.sleep(200); } catch (InterruptedException e) {} } } diff --git a/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTrackerOnCluster.java b/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTrackerOnCluster.java index 31e8fdead33..4952519fc55 100644 --- a/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTrackerOnCluster.java +++ b/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTrackerOnCluster.java @@ -23,6 +23,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.*; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; +import org.apache.zookeeper.KeeperException; import org.junit.Test; import org.junit.experimental.categories.Category; @@ -37,7 +38,7 @@ public class TestCatalogTrackerOnCluster { /** * @throws Exception - * @see https://issues.apache.org/jira/browse/HBASE-3445 + * @see {https://issues.apache.org/jira/browse/HBASE-3445} */ @Test public void testBadOriginalRootLocation() throws Exception { UTIL.getConfiguration().setInt("ipc.socket.timeout", 3000); @@ -61,9 +62,15 @@ public class TestCatalogTrackerOnCluster { ServerName nonsense = new ServerName("example.org", 1234, System.currentTimeMillis()); RootLocationEditor.setRootLocation(zookeeper, nonsense); + // Bring back up the hbase cluster. See if it can deal with nonsense root - // location. + // location. The cluster should start and be fully available. UTIL.startMiniHBaseCluster(1, 1); + + // if we can create a table, it's a good sign that it's working + UTIL.createTable( + getClass().getSimpleName().getBytes(), "family".getBytes()); + UTIL.shutdownMiniCluster(); } } diff --git a/src/test/java/org/apache/hadoop/hbase/master/TestRestartCluster.java b/src/test/java/org/apache/hadoop/hbase/master/TestRestartCluster.java index 30e59b6c2fa..9185c1e6eec 100644 --- a/src/test/java/org/apache/hadoop/hbase/master/TestRestartCluster.java +++ b/src/test/java/org/apache/hadoop/hbase/master/TestRestartCluster.java @@ -94,8 +94,11 @@ public class TestRestartCluster { LOG.info("\n\nCreating tables"); for(byte [] TABLE : TABLES) { UTIL.createTable(TABLE, FAMILY); + } + for(byte [] TABLE : TABLES) { UTIL.waitTableAvailable(TABLE, 30000); } + List allRegions = MetaScanner.listAllRegions(UTIL.getConfiguration()); assertEquals(3, allRegions.size()); diff --git a/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestHLog.java b/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestHLog.java index 09d1b0bf036..d6b88d4da34 100644 --- a/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestHLog.java +++ b/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestHLog.java @@ -127,7 +127,7 @@ public class TestHLog { } @AfterClass public static void tearDownAfterClass() throws Exception { - TEST_UTIL.shutdownMiniDFSCluster(); + TEST_UTIL.shutdownMiniCluster(); } private static String getName() { diff --git a/src/test/java/org/apache/hadoop/hbase/util/TestFSTableDescriptors.java b/src/test/java/org/apache/hadoop/hbase/util/TestFSTableDescriptors.java index f7e44313ecd..caf3d19b28d 100644 --- a/src/test/java/org/apache/hadoop/hbase/util/TestFSTableDescriptors.java +++ b/src/test/java/org/apache/hadoop/hbase/util/TestFSTableDescriptors.java @@ -59,7 +59,7 @@ public class TestFSTableDescriptors { assertTrue(FSTableDescriptors.createTableDescriptor(fs, testdir, htd)); assertFalse(FSTableDescriptors.createTableDescriptor(fs, testdir, htd)); FileStatus [] statuses = fs.listStatus(testdir); - assertTrue(statuses.length == 1); + assertTrue("statuses.length="+statuses.length, statuses.length == 1); for (int i = 0; i < 10; i++) { FSTableDescriptors.updateHTableDescriptor(fs, testdir, htd); }