HBASE-8537 Dead region server pulled in from ZK
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1482635 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
267e55da40
commit
47f4477f79
|
@ -28,7 +28,6 @@ import org.apache.hadoop.hbase.util.Addressing;
|
|||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
@ -248,18 +247,6 @@ public class ServerName implements Comparable<ServerName> {
|
|||
return this.compareTo((ServerName)o) == 0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return ServerName with matching hostname and port.
|
||||
*/
|
||||
public static ServerName findServerWithSameHostnamePort(final Collection<ServerName> names,
|
||||
final ServerName serverName) {
|
||||
for (ServerName sn: names) {
|
||||
if (isSameHostnameAndPort(serverName, sn)) return sn;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param left
|
||||
* @param right
|
||||
|
|
|
@ -252,7 +252,7 @@ MasterServices, Server {
|
|||
// Manager and zk listener for master election
|
||||
private ActiveMasterManager activeMasterManager;
|
||||
// Region server tracker
|
||||
private RegionServerTracker regionServerTracker;
|
||||
RegionServerTracker regionServerTracker;
|
||||
// Draining region server tracker
|
||||
private DrainingServerTracker drainingServerTracker;
|
||||
// Tracker for load balancer state
|
||||
|
@ -275,7 +275,7 @@ MasterServices, Server {
|
|||
private MasterFileSystem fileSystemManager;
|
||||
|
||||
// server manager to deal with region server info
|
||||
private ServerManager serverManager;
|
||||
ServerManager serverManager;
|
||||
|
||||
// manager of assignment nodes in zookeeper
|
||||
AssignmentManager assignmentManager;
|
||||
|
@ -599,7 +599,7 @@ MasterServices, Server {
|
|||
* @throws IOException
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
private void initializeZKBasedSystemTrackers() throws IOException,
|
||||
void initializeZKBasedSystemTrackers() throws IOException,
|
||||
InterruptedException, KeeperException {
|
||||
this.catalogTracker = createCatalogTracker(this.zooKeeper, this.conf, this);
|
||||
this.catalogTracker.start();
|
||||
|
@ -757,11 +757,11 @@ MasterServices, Server {
|
|||
this.serverManager.waitForRegionServers(status);
|
||||
// Check zk for region servers that are up but didn't register
|
||||
for (ServerName sn: this.regionServerTracker.getOnlineServers()) {
|
||||
if (!this.serverManager.isServerOnline(sn)) {
|
||||
// Not registered; add it.
|
||||
LOG.info("Registering server found up in zk but who has not yet " +
|
||||
"reported in: " + sn);
|
||||
this.serverManager.recordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD);
|
||||
if (!this.serverManager.isServerOnline(sn)
|
||||
&& serverManager.checkAlreadySameHostPortAndRecordNewServer(
|
||||
sn, ServerLoad.EMPTY_SERVERLOAD)) {
|
||||
LOG.info("Registered server found up in zk but who has not yet "
|
||||
+ "reported in: " + sn);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2666,7 +2666,7 @@ MasterServices, Server {
|
|||
try {
|
||||
SnapshotDescription snapshot = request.getSnapshot();
|
||||
IsRestoreSnapshotDoneResponse.Builder builder = IsRestoreSnapshotDoneResponse.newBuilder();
|
||||
boolean done = snapshotManager.isRestoreDone(request.getSnapshot());
|
||||
boolean done = snapshotManager.isRestoreDone(snapshot);
|
||||
builder.setDone(done);
|
||||
return builder.build();
|
||||
} catch (IOException e) {
|
||||
|
|
|
@ -60,7 +60,6 @@ import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest;
|
|||
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse;
|
||||
import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hadoop.hbase.util.Triple;
|
||||
|
||||
import com.google.protobuf.ServiceException;
|
||||
|
@ -214,8 +213,11 @@ public class ServerManager {
|
|||
ServerName sn = new ServerName(ia.getHostName(), port, serverStartcode);
|
||||
checkClockSkew(sn, serverCurrentTime);
|
||||
checkIsDead(sn, "STARTUP");
|
||||
checkAlreadySameHostPort(sn);
|
||||
recordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD);
|
||||
if (!checkAlreadySameHostPortAndRecordNewServer(
|
||||
sn, ServerLoad.EMPTY_SERVERLOAD)) {
|
||||
LOG.warn("THIS SHOULD NOT HAPPEN, RegionServerStartup"
|
||||
+ " could not record the server: " + sn);
|
||||
}
|
||||
return sn;
|
||||
}
|
||||
|
||||
|
@ -246,18 +248,20 @@ public class ServerManager {
|
|||
}
|
||||
}
|
||||
|
||||
void regionServerReport(ServerName sn, ServerLoad sl)
|
||||
throws YouAreDeadException, PleaseHoldException {
|
||||
void regionServerReport(ServerName sn,
|
||||
ServerLoad sl) throws YouAreDeadException {
|
||||
checkIsDead(sn, "REPORT");
|
||||
if (!this.onlineServers.containsKey(sn)) {
|
||||
// Already have this host+port combo and its just different start code?
|
||||
checkAlreadySameHostPort(sn);
|
||||
// Just let the server in. Presume master joining a running cluster.
|
||||
// recordNewServer is what happens at the end of reportServerStartup.
|
||||
// The only thing we are skipping is passing back to the regionserver
|
||||
// the ServerName to use. Here we presume a master has already done
|
||||
// that so we'll press on with whatever it gave us for ServerName.
|
||||
recordNewServer(sn, sl);
|
||||
if (!checkAlreadySameHostPortAndRecordNewServer(sn, sl)) {
|
||||
LOG.info("RegionServerReport ignored, could not record the sever: " + sn);
|
||||
return; // Not recorded, so no need to move on
|
||||
}
|
||||
} else {
|
||||
this.onlineServers.put(sn, sl);
|
||||
}
|
||||
|
@ -265,29 +269,29 @@ public class ServerManager {
|
|||
}
|
||||
|
||||
/**
|
||||
* Test to see if we have a server of same host and port already.
|
||||
* @param serverName
|
||||
* @throws PleaseHoldException
|
||||
* Check is a server of same host and port already exists,
|
||||
* if not, or the existed one got a smaller start code, record it.
|
||||
*
|
||||
* @param sn the server to check and record
|
||||
* @param sl the server load on the server
|
||||
* @return true if the server is recorded, otherwise, false
|
||||
*/
|
||||
void checkAlreadySameHostPort(final ServerName serverName)
|
||||
throws PleaseHoldException {
|
||||
ServerName existingServer =
|
||||
ServerName.findServerWithSameHostnamePort(getOnlineServersList(), serverName);
|
||||
boolean checkAlreadySameHostPortAndRecordNewServer(
|
||||
final ServerName serverName, final ServerLoad sl) {
|
||||
ServerName existingServer = findServerWithSameHostnamePort(serverName);
|
||||
if (existingServer != null) {
|
||||
String message = "Server serverName=" + serverName +
|
||||
" rejected; we already have " + existingServer.toString() +
|
||||
" registered with same hostname and port";
|
||||
LOG.info(message);
|
||||
if (existingServer.getStartcode() < serverName.getStartcode()) {
|
||||
LOG.info("Triggering server recovery; existingServer " +
|
||||
existingServer + " looks stale, new server:" + serverName);
|
||||
expireServer(existingServer);
|
||||
}
|
||||
if (services.isServerShutdownHandlerEnabled()) {
|
||||
// master has completed the initialization
|
||||
throw new PleaseHoldException(message);
|
||||
if (existingServer.getStartcode() > serverName.getStartcode()) {
|
||||
LOG.info("Server serverName=" + serverName +
|
||||
" rejected; we already have " + existingServer.toString() +
|
||||
" registered with same hostname and port");
|
||||
return false;
|
||||
}
|
||||
LOG.info("Triggering server recovery; existingServer " +
|
||||
existingServer + " looks stale, new server:" + serverName);
|
||||
expireServer(existingServer);
|
||||
}
|
||||
recordNewServer(serverName, sl);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -344,6 +348,17 @@ public class ServerManager {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return ServerName with matching hostname and port.
|
||||
*/
|
||||
private ServerName findServerWithSameHostnamePort(
|
||||
final ServerName serverName) {
|
||||
for (ServerName sn: getOnlineServersList()) {
|
||||
if (ServerName.isSameHostnameAndPort(serverName, sn)) return sn;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the onlineServers list.
|
||||
* @param serverName The remote servers name.
|
||||
|
|
|
@ -19,11 +19,14 @@ package org.apache.hadoop.hbase.master;
|
|||
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.InetAddress;
|
||||
import java.net.UnknownHostException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.Abortable;
|
||||
|
@ -49,6 +52,7 @@ import org.apache.hadoop.hbase.zookeeper.ZKUtil;
|
|||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||
import org.apache.hadoop.hbase.MediumTests;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
|
||||
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.NameStringPair;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerReportRequest;
|
||||
|
@ -341,4 +345,62 @@ public class TestMasterNoCluster {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNotPullingDeadRegionServerFromZK()
|
||||
throws IOException, KeeperException, InterruptedException {
|
||||
final Configuration conf = TESTUTIL.getConfiguration();
|
||||
final ServerName newServer = new ServerName("test.sample", 1, 101);
|
||||
final ServerName deadServer = new ServerName("test.sample", 1, 100);
|
||||
final MockRegionServer rs0 = new MockRegionServer(conf, newServer);
|
||||
|
||||
HMaster master = new HMaster(conf) {
|
||||
@Override
|
||||
boolean assignMeta(MonitoredTask status) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
void initializeZKBasedSystemTrackers() throws IOException,
|
||||
InterruptedException, KeeperException {
|
||||
super.initializeZKBasedSystemTrackers();
|
||||
// Record a newer server in server manager at first
|
||||
serverManager.recordNewServer(newServer, ServerLoad.EMPTY_SERVERLOAD);
|
||||
|
||||
List<ServerName> onlineServers = new ArrayList<ServerName>();
|
||||
onlineServers.add(deadServer);
|
||||
onlineServers.add(newServer);
|
||||
// Mock the region server tracker to pull the dead server from zk
|
||||
regionServerTracker = Mockito.spy(regionServerTracker);
|
||||
Mockito.doReturn(onlineServers).when(
|
||||
regionServerTracker).getOnlineServers();
|
||||
}
|
||||
|
||||
@Override
|
||||
CatalogTracker createCatalogTracker(ZooKeeperWatcher zk,
|
||||
Configuration conf, Abortable abortable)
|
||||
throws IOException {
|
||||
// Insert a mock for the connection used by the CatalogTracker. Any
|
||||
// regionserver should do. Use TESTUTIL.getConfiguration rather than
|
||||
// the conf from the master; the conf will already have an HConnection
|
||||
// associate so the below mocking of a connection will fail.
|
||||
HConnection connection =
|
||||
HConnectionTestingUtility.getMockedConnectionAndDecorate(TESTUTIL.getConfiguration(),
|
||||
rs0, rs0, rs0.getServerName(), HRegionInfo.ROOT_REGIONINFO);
|
||||
return new CatalogTracker(zk, conf, connection, abortable);
|
||||
}
|
||||
};
|
||||
master.start();
|
||||
|
||||
try {
|
||||
// Wait till master is initialized.
|
||||
while (!master.initialized) Threads.sleep(10);
|
||||
LOG.info("Master is initialized");
|
||||
|
||||
assertFalse("The dead server should not be pulled in",
|
||||
master.serverManager.isServerOnline(deadServer));
|
||||
} finally {
|
||||
master.stopMaster();
|
||||
master.join();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue