From 47f4477f79aa9efee585813dab4f53ea6d08d5c2 Mon Sep 17 00:00:00 2001 From: jxiang Date: Tue, 14 May 2013 22:55:11 +0000 Subject: [PATCH] HBASE-8537 Dead region server pulled in from ZK git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1482635 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/hadoop/hbase/ServerName.java | 13 ---- .../apache/hadoop/hbase/master/HMaster.java | 18 ++--- .../hadoop/hbase/master/ServerManager.java | 67 ++++++++++++------- .../hbase/master/TestMasterNoCluster.java | 62 +++++++++++++++++ 4 files changed, 112 insertions(+), 48 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/ServerName.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/ServerName.java index ce48f9590fb..91e4c9d0b5b 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/ServerName.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/ServerName.java @@ -28,7 +28,6 @@ import org.apache.hadoop.hbase.util.Addressing; import org.apache.hadoop.hbase.util.Bytes; import java.util.ArrayList; -import java.util.Collection; import java.util.List; import java.util.regex.Pattern; @@ -248,18 +247,6 @@ public class ServerName implements Comparable { return this.compareTo((ServerName)o) == 0; } - - /** - * @return ServerName with matching hostname and port. - */ - public static ServerName findServerWithSameHostnamePort(final Collection names, - final ServerName serverName) { - for (ServerName sn: names) { - if (isSameHostnameAndPort(serverName, sn)) return sn; - } - return null; - } - /** * @param left * @param right diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index bd594521ae8..565d9fb9000 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -252,7 +252,7 @@ MasterServices, Server { // Manager and zk listener for master election private ActiveMasterManager activeMasterManager; // Region server tracker - private RegionServerTracker regionServerTracker; + RegionServerTracker regionServerTracker; // Draining region server tracker private DrainingServerTracker drainingServerTracker; // Tracker for load balancer state @@ -275,7 +275,7 @@ MasterServices, Server { private MasterFileSystem fileSystemManager; // server manager to deal with region server info - private ServerManager serverManager; + ServerManager serverManager; // manager of assignment nodes in zookeeper AssignmentManager assignmentManager; @@ -599,7 +599,7 @@ MasterServices, Server { * @throws IOException * @throws InterruptedException */ - private void initializeZKBasedSystemTrackers() throws IOException, + void initializeZKBasedSystemTrackers() throws IOException, InterruptedException, KeeperException { this.catalogTracker = createCatalogTracker(this.zooKeeper, this.conf, this); this.catalogTracker.start(); @@ -757,11 +757,11 @@ MasterServices, Server { this.serverManager.waitForRegionServers(status); // Check zk for region servers that are up but didn't register for (ServerName sn: this.regionServerTracker.getOnlineServers()) { - if (!this.serverManager.isServerOnline(sn)) { - // Not registered; add it. - LOG.info("Registering server found up in zk but who has not yet " + - "reported in: " + sn); - this.serverManager.recordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD); + if (!this.serverManager.isServerOnline(sn) + && serverManager.checkAlreadySameHostPortAndRecordNewServer( + sn, ServerLoad.EMPTY_SERVERLOAD)) { + LOG.info("Registered server found up in zk but who has not yet " + + "reported in: " + sn); } } @@ -2666,7 +2666,7 @@ MasterServices, Server { try { SnapshotDescription snapshot = request.getSnapshot(); IsRestoreSnapshotDoneResponse.Builder builder = IsRestoreSnapshotDoneResponse.newBuilder(); - boolean done = snapshotManager.isRestoreDone(request.getSnapshot()); + boolean done = snapshotManager.isRestoreDone(snapshot); builder.setDone(done); return builder.build(); } catch (IOException e) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 042c3a2dfc2..da54a4293ed 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -60,7 +60,6 @@ import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest; import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse; import org.apache.hadoop.hbase.regionserver.RegionOpeningState; import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.Triple; import com.google.protobuf.ServiceException; @@ -214,8 +213,11 @@ public class ServerManager { ServerName sn = new ServerName(ia.getHostName(), port, serverStartcode); checkClockSkew(sn, serverCurrentTime); checkIsDead(sn, "STARTUP"); - checkAlreadySameHostPort(sn); - recordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD); + if (!checkAlreadySameHostPortAndRecordNewServer( + sn, ServerLoad.EMPTY_SERVERLOAD)) { + LOG.warn("THIS SHOULD NOT HAPPEN, RegionServerStartup" + + " could not record the server: " + sn); + } return sn; } @@ -246,18 +248,20 @@ public class ServerManager { } } - void regionServerReport(ServerName sn, ServerLoad sl) - throws YouAreDeadException, PleaseHoldException { + void regionServerReport(ServerName sn, + ServerLoad sl) throws YouAreDeadException { checkIsDead(sn, "REPORT"); if (!this.onlineServers.containsKey(sn)) { // Already have this host+port combo and its just different start code? - checkAlreadySameHostPort(sn); // Just let the server in. Presume master joining a running cluster. // recordNewServer is what happens at the end of reportServerStartup. // The only thing we are skipping is passing back to the regionserver // the ServerName to use. Here we presume a master has already done // that so we'll press on with whatever it gave us for ServerName. - recordNewServer(sn, sl); + if (!checkAlreadySameHostPortAndRecordNewServer(sn, sl)) { + LOG.info("RegionServerReport ignored, could not record the sever: " + sn); + return; // Not recorded, so no need to move on + } } else { this.onlineServers.put(sn, sl); } @@ -265,29 +269,29 @@ public class ServerManager { } /** - * Test to see if we have a server of same host and port already. - * @param serverName - * @throws PleaseHoldException + * Check is a server of same host and port already exists, + * if not, or the existed one got a smaller start code, record it. + * + * @param sn the server to check and record + * @param sl the server load on the server + * @return true if the server is recorded, otherwise, false */ - void checkAlreadySameHostPort(final ServerName serverName) - throws PleaseHoldException { - ServerName existingServer = - ServerName.findServerWithSameHostnamePort(getOnlineServersList(), serverName); + boolean checkAlreadySameHostPortAndRecordNewServer( + final ServerName serverName, final ServerLoad sl) { + ServerName existingServer = findServerWithSameHostnamePort(serverName); if (existingServer != null) { - String message = "Server serverName=" + serverName + - " rejected; we already have " + existingServer.toString() + - " registered with same hostname and port"; - LOG.info(message); - if (existingServer.getStartcode() < serverName.getStartcode()) { - LOG.info("Triggering server recovery; existingServer " + - existingServer + " looks stale, new server:" + serverName); - expireServer(existingServer); - } - if (services.isServerShutdownHandlerEnabled()) { - // master has completed the initialization - throw new PleaseHoldException(message); + if (existingServer.getStartcode() > serverName.getStartcode()) { + LOG.info("Server serverName=" + serverName + + " rejected; we already have " + existingServer.toString() + + " registered with same hostname and port"); + return false; } + LOG.info("Triggering server recovery; existingServer " + + existingServer + " looks stale, new server:" + serverName); + expireServer(existingServer); } + recordNewServer(serverName, sl); + return true; } /** @@ -344,6 +348,17 @@ public class ServerManager { } } + /** + * @return ServerName with matching hostname and port. + */ + private ServerName findServerWithSameHostnamePort( + final ServerName serverName) { + for (ServerName sn: getOnlineServersList()) { + if (ServerName.isSameHostnameAndPort(serverName, sn)) return sn; + } + return null; + } + /** * Adds the onlineServers list. * @param serverName The remote servers name. diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java index 30e68222209..e237b167a5d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java @@ -19,11 +19,14 @@ package org.apache.hadoop.hbase.master; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.Abortable; @@ -49,6 +52,7 @@ import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; import org.apache.hadoop.hbase.MediumTests; import org.apache.zookeeper.KeeperException; +import org.apache.hadoop.hbase.monitoring.MonitoredTask; import org.apache.hadoop.hbase.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.NameStringPair; import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerReportRequest; @@ -341,4 +345,62 @@ public class TestMasterNoCluster { } } + @Test + public void testNotPullingDeadRegionServerFromZK() + throws IOException, KeeperException, InterruptedException { + final Configuration conf = TESTUTIL.getConfiguration(); + final ServerName newServer = new ServerName("test.sample", 1, 101); + final ServerName deadServer = new ServerName("test.sample", 1, 100); + final MockRegionServer rs0 = new MockRegionServer(conf, newServer); + + HMaster master = new HMaster(conf) { + @Override + boolean assignMeta(MonitoredTask status) { + return true; + } + + @Override + void initializeZKBasedSystemTrackers() throws IOException, + InterruptedException, KeeperException { + super.initializeZKBasedSystemTrackers(); + // Record a newer server in server manager at first + serverManager.recordNewServer(newServer, ServerLoad.EMPTY_SERVERLOAD); + + List onlineServers = new ArrayList(); + onlineServers.add(deadServer); + onlineServers.add(newServer); + // Mock the region server tracker to pull the dead server from zk + regionServerTracker = Mockito.spy(regionServerTracker); + Mockito.doReturn(onlineServers).when( + regionServerTracker).getOnlineServers(); + } + + @Override + CatalogTracker createCatalogTracker(ZooKeeperWatcher zk, + Configuration conf, Abortable abortable) + throws IOException { + // Insert a mock for the connection used by the CatalogTracker. Any + // regionserver should do. Use TESTUTIL.getConfiguration rather than + // the conf from the master; the conf will already have an HConnection + // associate so the below mocking of a connection will fail. + HConnection connection = + HConnectionTestingUtility.getMockedConnectionAndDecorate(TESTUTIL.getConfiguration(), + rs0, rs0, rs0.getServerName(), HRegionInfo.ROOT_REGIONINFO); + return new CatalogTracker(zk, conf, connection, abortable); + } + }; + master.start(); + + try { + // Wait till master is initialized. + while (!master.initialized) Threads.sleep(10); + LOG.info("Master is initialized"); + + assertFalse("The dead server should not be pulled in", + master.serverManager.isServerOnline(deadServer)); + } finally { + master.stopMaster(); + master.join(); + } + } }