HBASE-8666: META region isn't fully recovered during master initialization when META region recovery had chained failures
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1489606 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
aefb339ce4
commit
ccb9fd364d
|
@ -795,6 +795,7 @@ MasterServices, Server {
|
|||
// Note: we can't remove oldMetaServerLocation from previousFailedServers list because it
|
||||
// may also host user regions
|
||||
}
|
||||
Set<ServerName> previouslyFailedMetaRSs = getPreviouselyFailedMetaServersFromZK();
|
||||
|
||||
this.initializationBeforeMetaAssignment = true;
|
||||
// Make sure meta assigned before proceeding.
|
||||
|
@ -804,11 +805,19 @@ MasterServices, Server {
|
|||
// assigned when master is shutting down
|
||||
if(this.stopped) return;
|
||||
|
||||
if (this.distributedLogReplay && oldMetaServerLocation != null
|
||||
&& previouslyFailedServers.contains(oldMetaServerLocation)) {
|
||||
if (this.distributedLogReplay && (!previouslyFailedMetaRSs.isEmpty())) {
|
||||
// replay-WAL-edits mode requires a new .META. RS to be assigned first
|
||||
status.setStatus("replaying log for Meta Region");
|
||||
this.fileSystemManager.splitMetaLog(oldMetaServerLocation);
|
||||
// need to use union of previouslyFailedMetaRSs recorded in ZK and previouslyFailedServers
|
||||
// instead of oldMetaServerLocation to address the following two situations:
|
||||
// 1) the chained failure situation(recovery failed multiple times in a row).
|
||||
// 2) the master gets killed right before it could delete the recovering META from ZK while the
|
||||
// same server still has non-meta wals to be replayed so that
|
||||
// removeStaleRecoveringRegionsFromZK can't delete the stale META region
|
||||
// Passing more servers into splitMetaLog is all right. If a server doesn't have .META. wal,
|
||||
// there is no op for the server.
|
||||
previouslyFailedMetaRSs.addAll(previouslyFailedServers);
|
||||
this.fileSystemManager.splitMetaLog(previouslyFailedMetaRSs);
|
||||
}
|
||||
|
||||
enableServerShutdownHandler();
|
||||
|
@ -992,6 +1001,25 @@ MasterServices, Server {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* This function returns a set of region server names under .META. recovering region ZK node
|
||||
* @return Set of meta server names which were recorded in ZK
|
||||
* @throws KeeperException
|
||||
*/
|
||||
private Set<ServerName> getPreviouselyFailedMetaServersFromZK() throws KeeperException {
|
||||
Set<ServerName> result = new HashSet<ServerName>();
|
||||
String metaRecoveringZNode = ZKUtil.joinZNode(zooKeeper.recoveringRegionsZNode,
|
||||
HRegionInfo.FIRST_META_REGIONINFO.getEncodedName());
|
||||
List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(zooKeeper, metaRecoveringZNode);
|
||||
if (regionFailedServers == null) return result;
|
||||
|
||||
for(String failedServer : regionFailedServers) {
|
||||
ServerName server = ServerName.parseServerName(failedServer);
|
||||
result.add(server);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TableDescriptors getTableDescriptors() {
|
||||
return this.tableDescriptors;
|
||||
|
|
|
@ -294,9 +294,18 @@ public class MasterFileSystem {
|
|||
* @throws IOException
|
||||
*/
|
||||
public void splitMetaLog(final ServerName serverName) throws IOException {
|
||||
long splitTime = 0, splitLogSize = 0;
|
||||
Set<ServerName> serverNames = new HashSet<ServerName>();
|
||||
serverNames.add(serverName);
|
||||
splitMetaLog(serverNames);
|
||||
}
|
||||
|
||||
/**
|
||||
* Specialized method to handle the splitting for meta HLog
|
||||
* @param serverNames
|
||||
* @throws IOException
|
||||
*/
|
||||
public void splitMetaLog(final Set<ServerName> serverNames) throws IOException {
|
||||
long splitTime = 0, splitLogSize = 0;
|
||||
List<Path> logDirs = getLogDirs(serverNames);
|
||||
|
||||
splitLogManager.handleDeadWorkers(serverNames);
|
||||
|
|
|
@ -124,7 +124,7 @@ public class SplitLogManager extends ZooKeeperListener {
|
|||
private long unassignedTimeout;
|
||||
private long lastNodeCreateTime = Long.MAX_VALUE;
|
||||
public boolean ignoreZKDeleteForTesting = false;
|
||||
private volatile long lastRecoveringNodeCreationTime = Long.MAX_VALUE;
|
||||
private volatile long lastRecoveringNodeCreationTime = 0;
|
||||
// When lastRecoveringNodeCreationTime is older than the following threshold, we'll check
|
||||
// whether to GC stale recovering znodes
|
||||
private long checkRecoveringTimeThreshold = 15000; // 15 seconds
|
||||
|
|
Loading…
Reference in New Issue