HBASE-8537 Dead region server pulled in from ZK

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1482635 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
jxiang 2013-05-14 22:55:11 +00:00
parent 267e55da40
commit 47f4477f79
4 changed files with 112 additions and 48 deletions

View File

@ -28,7 +28,6 @@ import org.apache.hadoop.hbase.util.Addressing;
import org.apache.hadoop.hbase.util.Bytes;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.regex.Pattern;
@ -248,18 +247,6 @@ public class ServerName implements Comparable<ServerName> {
return this.compareTo((ServerName)o) == 0;
}
/**
* @return ServerName with matching hostname and port.
*/
public static ServerName findServerWithSameHostnamePort(final Collection<ServerName> names,
final ServerName serverName) {
for (ServerName sn: names) {
if (isSameHostnameAndPort(serverName, sn)) return sn;
}
return null;
}
/**
* @param left
* @param right

View File

@ -252,7 +252,7 @@ MasterServices, Server {
// Manager and zk listener for master election
private ActiveMasterManager activeMasterManager;
// Region server tracker
private RegionServerTracker regionServerTracker;
RegionServerTracker regionServerTracker;
// Draining region server tracker
private DrainingServerTracker drainingServerTracker;
// Tracker for load balancer state
@ -275,7 +275,7 @@ MasterServices, Server {
private MasterFileSystem fileSystemManager;
// server manager to deal with region server info
private ServerManager serverManager;
ServerManager serverManager;
// manager of assignment nodes in zookeeper
AssignmentManager assignmentManager;
@ -599,7 +599,7 @@ MasterServices, Server {
* @throws IOException
* @throws InterruptedException
*/
private void initializeZKBasedSystemTrackers() throws IOException,
void initializeZKBasedSystemTrackers() throws IOException,
InterruptedException, KeeperException {
this.catalogTracker = createCatalogTracker(this.zooKeeper, this.conf, this);
this.catalogTracker.start();
@ -757,11 +757,11 @@ MasterServices, Server {
this.serverManager.waitForRegionServers(status);
// Check zk for region servers that are up but didn't register
for (ServerName sn: this.regionServerTracker.getOnlineServers()) {
if (!this.serverManager.isServerOnline(sn)) {
// Not registered; add it.
LOG.info("Registering server found up in zk but who has not yet " +
"reported in: " + sn);
this.serverManager.recordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD);
if (!this.serverManager.isServerOnline(sn)
&& serverManager.checkAlreadySameHostPortAndRecordNewServer(
sn, ServerLoad.EMPTY_SERVERLOAD)) {
LOG.info("Registered server found up in zk but who has not yet "
+ "reported in: " + sn);
}
}
@ -2666,7 +2666,7 @@ MasterServices, Server {
try {
SnapshotDescription snapshot = request.getSnapshot();
IsRestoreSnapshotDoneResponse.Builder builder = IsRestoreSnapshotDoneResponse.newBuilder();
boolean done = snapshotManager.isRestoreDone(request.getSnapshot());
boolean done = snapshotManager.isRestoreDone(snapshot);
builder.setDone(done);
return builder.build();
} catch (IOException e) {

View File

@ -60,7 +60,6 @@ import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse;
import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Triple;
import com.google.protobuf.ServiceException;
@ -214,8 +213,11 @@ public class ServerManager {
ServerName sn = new ServerName(ia.getHostName(), port, serverStartcode);
checkClockSkew(sn, serverCurrentTime);
checkIsDead(sn, "STARTUP");
checkAlreadySameHostPort(sn);
recordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD);
if (!checkAlreadySameHostPortAndRecordNewServer(
sn, ServerLoad.EMPTY_SERVERLOAD)) {
LOG.warn("THIS SHOULD NOT HAPPEN, RegionServerStartup"
+ " could not record the server: " + sn);
}
return sn;
}
@ -246,18 +248,20 @@ public class ServerManager {
}
}
void regionServerReport(ServerName sn, ServerLoad sl)
throws YouAreDeadException, PleaseHoldException {
void regionServerReport(ServerName sn,
ServerLoad sl) throws YouAreDeadException {
checkIsDead(sn, "REPORT");
if (!this.onlineServers.containsKey(sn)) {
// Already have this host+port combo and its just different start code?
checkAlreadySameHostPort(sn);
// Just let the server in. Presume master joining a running cluster.
// recordNewServer is what happens at the end of reportServerStartup.
// The only thing we are skipping is passing back to the regionserver
// the ServerName to use. Here we presume a master has already done
// that so we'll press on with whatever it gave us for ServerName.
recordNewServer(sn, sl);
if (!checkAlreadySameHostPortAndRecordNewServer(sn, sl)) {
LOG.info("RegionServerReport ignored, could not record the sever: " + sn);
return; // Not recorded, so no need to move on
}
} else {
this.onlineServers.put(sn, sl);
}
@ -265,29 +269,29 @@ public class ServerManager {
}
/**
* Test to see if we have a server of same host and port already.
* @param serverName
* @throws PleaseHoldException
* Check is a server of same host and port already exists,
* if not, or the existed one got a smaller start code, record it.
*
* @param sn the server to check and record
* @param sl the server load on the server
* @return true if the server is recorded, otherwise, false
*/
void checkAlreadySameHostPort(final ServerName serverName)
throws PleaseHoldException {
ServerName existingServer =
ServerName.findServerWithSameHostnamePort(getOnlineServersList(), serverName);
boolean checkAlreadySameHostPortAndRecordNewServer(
final ServerName serverName, final ServerLoad sl) {
ServerName existingServer = findServerWithSameHostnamePort(serverName);
if (existingServer != null) {
String message = "Server serverName=" + serverName +
" rejected; we already have " + existingServer.toString() +
" registered with same hostname and port";
LOG.info(message);
if (existingServer.getStartcode() < serverName.getStartcode()) {
LOG.info("Triggering server recovery; existingServer " +
existingServer + " looks stale, new server:" + serverName);
expireServer(existingServer);
}
if (services.isServerShutdownHandlerEnabled()) {
// master has completed the initialization
throw new PleaseHoldException(message);
if (existingServer.getStartcode() > serverName.getStartcode()) {
LOG.info("Server serverName=" + serverName +
" rejected; we already have " + existingServer.toString() +
" registered with same hostname and port");
return false;
}
LOG.info("Triggering server recovery; existingServer " +
existingServer + " looks stale, new server:" + serverName);
expireServer(existingServer);
}
recordNewServer(serverName, sl);
return true;
}
/**
@ -344,6 +348,17 @@ public class ServerManager {
}
}
/**
* @return ServerName with matching hostname and port.
*/
private ServerName findServerWithSameHostnamePort(
final ServerName serverName) {
for (ServerName sn: getOnlineServersList()) {
if (ServerName.isSameHostnameAndPort(serverName, sn)) return sn;
}
return null;
}
/**
* Adds the onlineServers list.
* @param serverName The remote servers name.

View File

@ -19,11 +19,14 @@ package org.apache.hadoop.hbase.master;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Abortable;
@ -49,6 +52,7 @@ import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.hbase.MediumTests;
import org.apache.zookeeper.KeeperException;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.NameStringPair;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerReportRequest;
@ -341,4 +345,62 @@ public class TestMasterNoCluster {
}
}
@Test
public void testNotPullingDeadRegionServerFromZK()
throws IOException, KeeperException, InterruptedException {
final Configuration conf = TESTUTIL.getConfiguration();
final ServerName newServer = new ServerName("test.sample", 1, 101);
final ServerName deadServer = new ServerName("test.sample", 1, 100);
final MockRegionServer rs0 = new MockRegionServer(conf, newServer);
HMaster master = new HMaster(conf) {
@Override
boolean assignMeta(MonitoredTask status) {
return true;
}
@Override
void initializeZKBasedSystemTrackers() throws IOException,
InterruptedException, KeeperException {
super.initializeZKBasedSystemTrackers();
// Record a newer server in server manager at first
serverManager.recordNewServer(newServer, ServerLoad.EMPTY_SERVERLOAD);
List<ServerName> onlineServers = new ArrayList<ServerName>();
onlineServers.add(deadServer);
onlineServers.add(newServer);
// Mock the region server tracker to pull the dead server from zk
regionServerTracker = Mockito.spy(regionServerTracker);
Mockito.doReturn(onlineServers).when(
regionServerTracker).getOnlineServers();
}
@Override
CatalogTracker createCatalogTracker(ZooKeeperWatcher zk,
Configuration conf, Abortable abortable)
throws IOException {
// Insert a mock for the connection used by the CatalogTracker. Any
// regionserver should do. Use TESTUTIL.getConfiguration rather than
// the conf from the master; the conf will already have an HConnection
// associate so the below mocking of a connection will fail.
HConnection connection =
HConnectionTestingUtility.getMockedConnectionAndDecorate(TESTUTIL.getConfiguration(),
rs0, rs0, rs0.getServerName(), HRegionInfo.ROOT_REGIONINFO);
return new CatalogTracker(zk, conf, connection, abortable);
}
};
master.start();
try {
// Wait till master is initialized.
while (!master.initialized) Threads.sleep(10);
LOG.info("Master is initialized");
assertFalse("The dead server should not be pulled in",
master.serverManager.isServerOnline(deadServer));
} finally {
master.stopMaster();
master.join();
}
}
}