HBASE-15251 During a cluster restart, Hmaster thinks it is a failover by mistake (Clara Xiong)

This commit is contained in:
tedyu 2016-02-18 23:46:54 -08:00
parent f352f3c371
commit 8eedc96751
1 changed files with 61 additions and 19 deletions

View File

@ -18,6 +18,8 @@
*/
package org.apache.hadoop.hbase.master;
import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
@ -44,6 +46,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.CoordinatedStateException;
@ -92,8 +95,6 @@ import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.util.StringUtils;
import org.apache.zookeeper.KeeperException;
import com.google.common.annotations.VisibleForTesting;
/**
* Manages and performs region assignment.
* Related communications with regionserver are all done over RPC.
@ -443,31 +444,43 @@ public class AssignmentManager {
if (LOG.isDebugEnabled()) {
LOG.debug("Found dead servers out on cluster " + serverManager.getDeadServers());
}
} else {
// Check if there are any regions on these servers
failover = false;
for (ServerName serverName : serverManager.getDeadServers().copyServerNames()) {
if (regionStates.getRegionAssignments().values().contains(serverName)) {
LOG.debug("Found regions on dead server: " + serverName);
failover = true;
break;
}
}
}
Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
if (!failover) {
// If any one region except meta is assigned, it's a failover.
Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
for (Map.Entry<HRegionInfo, ServerName> en:
regionStates.getRegionAssignments().entrySet()) {
HRegionInfo hri = en.getKey();
if (!hri.isMetaTable()
&& onlineServers.contains(en.getValue())) {
LOG.debug("Found " + hri + " out on cluster");
LOG.debug("Found region " + hri + " out on cluster");
failover = true;
break;
}
}
if (!failover) {
// If any region except meta is in transition on a live server, it's a failover.
Map<String, RegionState> regionsInTransition = regionStates.getRegionsInTransition();
if (!regionsInTransition.isEmpty()) {
for (RegionState regionState: regionsInTransition.values()) {
ServerName serverName = regionState.getServerName();
if (!regionState.getRegion().isMetaRegion()
&& serverName != null && onlineServers.contains(serverName)) {
LOG.debug("Found " + regionState + " in RITs");
failover = true;
break;
}
}
if (!failover) {
// If any region except meta is in transition on a live server, it's a failover.
Map<String, RegionState> regionsInTransition = regionStates.getRegionsInTransition();
if (!regionsInTransition.isEmpty()) {
for (RegionState regionState: regionsInTransition.values()) {
ServerName serverName = regionState.getServerName();
if (!regionState.getRegion().isMetaRegion()
&& serverName != null && onlineServers.contains(serverName)) {
LOG.debug("Found " + regionState + " for region " +
regionState.getRegion().getRegionNameAsString() + " for server " +
serverName + "in RITs");
failover = true;
break;
}
}
}
@ -488,7 +501,7 @@ public class AssignmentManager {
Path logDir = new Path(rootdir,
DefaultWALProvider.getWALDirectoryName(serverName.toString()));
Path splitDir = logDir.suffix(DefaultWALProvider.SPLITTING_EXT);
if (fs.exists(logDir) || fs.exists(splitDir)) {
if (checkWals(fs, logDir) || checkWals(fs, splitDir)) {
LOG.debug("Found queued dead server " + serverName);
failover = true;
break;
@ -538,8 +551,10 @@ public class AssignmentManager {
failoverCleanupDone();
if (!failover) {
// Fresh cluster startup.
LOG.info("Clean cluster startup. Assigning user regions");
LOG.info("Clean cluster startup. Don't reassign user regions");
assignAllUserRegions(allRegions);
} else {
LOG.info("Failover! Reassign user regions");
}
// unassign replicas of the split parents and the merged regions
// the daughter replicas are opened in assignAllUserRegions if it was
@ -551,6 +566,33 @@ public class AssignmentManager {
return failover;
}
/**
 * Checks whether a WAL directory contains at least one non-empty file,
 * descending into nested sub-directories.
 *
 * @param fs the file system holding the WAL directory
 * @param dir the directory to inspect
 * @return {@code true} if a file with a length greater than zero is found
 *         directly under {@code dir} or under any of its sub-directories;
 *         {@code false} if {@code dir} does not exist, is not a directory,
 *         has an empty (or null) listing, or holds only empty files
 * @throws IOException if the file system cannot be queried
 */
private boolean checkWals(FileSystem fs, Path dir) throws IOException {
  if (!fs.exists(dir)) {
    LOG.debug(dir + " doesn't exist");
    return false;
  }
  if (!fs.getFileStatus(dir).isDirectory()) {
    LOG.warn(dir + " is not a directory");
    return false;
  }
  FileStatus[] files = FSUtils.listStatus(fs, dir);
  if (files == null || files.length == 0) {
    LOG.debug(dir + " has no files");
    return false;
  }
  for (FileStatus file : files) {
    if (file.isFile() && file.getLen() > 0) {
      LOG.debug(dir + " has a non-empty file: " + file.getPath());
      return true;
    } else if (file.isDirectory() && checkWals(fs, file.getPath())) {
      // Recurse into the child directory itself. Recursing on 'dir' again would
      // re-list the same directory forever and end in a StackOverflowError.
      LOG.debug(dir + " is a directory and has a non-empty file: " + file.getPath());
      return true;
    }
  }
  LOG.debug("Found 0 non-empty wal files for :" + dir);
  return false;
}
/**
* When a region is closed, it should be removed from the regionsToReopen
* @param hri HRegionInfo of the region which was closed