HBASE-14536 Balancer & SSH interfering with each other leading to unavailability

This commit is contained in:
Stephen Yuan Jiang 2015-10-16 22:38:28 -07:00
parent e8c69a5921
commit 9bdb88a572
4 changed files with 69 additions and 18 deletions

View File

@ -259,6 +259,10 @@ public class AssignmentManager extends ZooKeeperListener {
private RegionStateListener regionStateListener;
/**
 * Outcome of checking whether a given server is hosting a given region.
 * A plain boolean cannot express the case where no location can be determined,
 * so callers get an explicit third state and decide for themselves how to treat it.
 */
public enum ServerHostRegion {
// NOT_HOSTING_REGION: a location was found for the region and it is a different server.
// HOSTING_REGION: a location was found for the region and it is the server being checked.
// UNKNOWN: none of the consulted sources yielded a location for the region.
NOT_HOSTING_REGION, HOSTING_REGION, UNKNOWN,
}
/**
* Constructs a new assignment manager.
*
@ -3371,16 +3375,16 @@ public class AssignmentManager extends ZooKeeperListener {
threadPoolExecutorService.submit(new UnAssignCallable(this, regionInfo));
}
public boolean isCarryingMeta(ServerName serverName) {
public ServerHostRegion isCarryingMeta(ServerName serverName) {
return isCarryingRegion(serverName, HRegionInfo.FIRST_META_REGIONINFO);
}
public boolean isCarryingMetaReplica(ServerName serverName, int replicaId) {
public ServerHostRegion isCarryingMetaReplica(ServerName serverName, int replicaId) {
return isCarryingRegion(serverName,
RegionReplicaUtil.getRegionInfoForReplica(HRegionInfo.FIRST_META_REGIONINFO, replicaId));
}
public boolean isCarryingMetaReplica(ServerName serverName, HRegionInfo metaHri) {
public ServerHostRegion isCarryingMetaReplica(ServerName serverName, HRegionInfo metaHri) {
return isCarryingRegion(serverName, metaHri);
}
@ -3394,7 +3398,7 @@ public class AssignmentManager extends ZooKeeperListener {
* processing hasn't finished yet when server shutdown occurs.
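* @return {@link ServerHostRegion#HOSTING_REGION} if the serverName currently hosts the region,
* {@link ServerHostRegion#NOT_HOSTING_REGION} if the region is located on a different server, or
* {@link ServerHostRegion#UNKNOWN} if the region's location could not be determined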
*/
private boolean isCarryingRegion(ServerName serverName, HRegionInfo hri) {
private ServerHostRegion isCarryingRegion(ServerName serverName, HRegionInfo hri) {
RegionTransition rt = null;
try {
byte [] data = ZKAssign.getData(watcher, hri.getEncodedName());
@ -3412,17 +3416,37 @@ public class AssignmentManager extends ZooKeeperListener {
boolean matchZK = addressFromZK.equals(serverName);
LOG.debug("Checking region=" + hri.getRegionNameAsString() + ", zk server=" + addressFromZK +
" current=" + serverName + ", matches=" + matchZK);
return matchZK;
return matchZK ? ServerHostRegion.HOSTING_REGION : ServerHostRegion.NOT_HOSTING_REGION;
}
ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
boolean matchAM = (addressFromAM != null &&
addressFromAM.equals(serverName));
if (LOG.isDebugEnabled()) {
LOG.debug("based on AM, current region=" + hri.getRegionNameAsString() +
" is on server=" + (addressFromAM != null ? addressFromAM : "null") +
" server being checked: " + serverName);
}
if (addressFromAM != null) {
return addressFromAM.equals(serverName) ?
ServerHostRegion.HOSTING_REGION : ServerHostRegion.NOT_HOSTING_REGION;
}
return matchAM;
if (hri.isMetaRegion() && RegionReplicaUtil.isDefaultReplica(hri)) {
// For the Meta region (default replica), we can do one more check on MetaTableLocator
final ServerName serverNameInZK =
server.getMetaTableLocator().getMetaRegionLocation(this.server.getZooKeeper());
if (LOG.isDebugEnabled()) {
LOG.debug("Based on MetaTableLocator, the META region is on server=" +
(serverNameInZK == null ? "null" : serverNameInZK) +
" server being checked: " + serverName);
}
if (serverNameInZK != null) {
return serverNameInZK.equals(serverName) ?
ServerHostRegion.HOSTING_REGION : ServerHostRegion.NOT_HOSTING_REGION;
}
}
// Checked everywhere, if reaching here, we are unsure whether the server is carrying region.
return ServerHostRegion.UNKNOWN;
}
/**

View File

@ -617,7 +617,8 @@ public class ServerManager {
return;
}
boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName);
boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName) ==
AssignmentManager.ServerHostRegion.HOSTING_REGION;
this.services.getMasterProcedureExecutor().
submitProcedure(new ServerCrashProcedure(serverName, true, carryingMeta));
LOG.debug("Added=" + serverName +

View File

@ -313,8 +313,9 @@ implements ServerProcedureInterface {
private boolean processMeta(final MasterProcedureEnv env)
throws IOException {
if (LOG.isDebugEnabled()) LOG.debug("Processing hbase:meta that was on " + this.serverName);
MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem();
AssignmentManager am = env.getMasterServices().getAssignmentManager();
MasterServices services = env.getMasterServices();
MasterFileSystem mfs = services.getMasterFileSystem();
AssignmentManager am = services.getAssignmentManager();
HRegionInfo metaHRI = HRegionInfo.FIRST_META_REGIONINFO;
if (this.shouldSplitWal) {
if (this.distributedLogReplay) {
@ -328,9 +329,31 @@ implements ServerProcedureInterface {
// Assign meta if still carrying it. Check again: region may be assigned because of RIT timeout
boolean processed = true;
if (am.isCarryingMeta(serverName)) {
// TODO: May block here if hard time figuring state of meta.
boolean shouldAssignMeta = false;
AssignmentManager.ServerHostRegion rsCarryingMetaRegion = am.isCarryingMeta(serverName);
switch (rsCarryingMetaRegion) {
case HOSTING_REGION:
LOG.info("Server " + serverName + " was carrying META. Trying to assign.");
am.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
shouldAssignMeta = true;
break;
case UNKNOWN:
if (!services.getMetaTableLocator().isLocationAvailable(services.getZooKeeper())) {
// the meta location as per master is null. This could happen in case when meta
// assignment in previous run failed, while meta znode has been updated to null.
// We should try to assign the meta again.
shouldAssignMeta = true;
break;
}
// fall through
case NOT_HOSTING_REGION:
LOG.info("META has been assigned to otherwhere, skip assigning.");
break;
default:
throw new IOException("Unsupported action in MetaServerShutdownHandler");
}
if (shouldAssignMeta) {
// TODO: May block here if hard time figuring state of meta.
verifyAndAssignMetaWithRetries(env);
if (this.shouldSplitWal && distributedLogReplay) {
int timeout = env.getMasterConfiguration().getInt(KEY_WAIT_ON_RIT, DEFAULT_WAIT_ON_RIT);
@ -409,7 +432,8 @@ implements ServerProcedureInterface {
for (int i = 1; i < replicaCount; i++) {
HRegionInfo metaHri =
RegionReplicaUtil.getRegionInfoForReplica(HRegionInfo.FIRST_META_REGIONINFO, i);
if (am.isCarryingMetaReplica(this.serverName, metaHri)) {
if (am.isCarryingMetaReplica(this.serverName, metaHri) ==
AssignmentManager.ServerHostRegion.HOSTING_REGION) {
if (LOG.isDebugEnabled()) {
LOG.debug("Reassigning meta replica" + metaHri + " that was on " + this.serverName);
}

View File

@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.AssignmentManager;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
@ -103,7 +104,8 @@ public class TestServerCrashProcedure {
master.setServerCrashProcessingEnabled(false);
// Kill a server. Master will notice but do nothing other than add it to list of dead servers.
HRegionServer hrs = this.util.getHBaseCluster().getRegionServer(0);
boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(hrs.getServerName());
boolean carryingMeta = (master.getAssignmentManager().isCarryingMeta(hrs.getServerName()) ==
AssignmentManager.ServerHostRegion.HOSTING_REGION);
this.util.getHBaseCluster().killRegionServer(hrs.getServerName());
hrs.join();
// Wait until the expiration of the server has arrived at the master. We won't process it