HBASE-5916 RS restart just before master initialization we make the cluster non operative (Rajesh)
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1343324 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9aa80d2696
commit
3ee5280b6b
|
@ -343,7 +343,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
* @throws KeeperException
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
void joinCluster(final Set<ServerName> onlineServers) throws IOException,
|
||||
void joinCluster() throws IOException,
|
||||
KeeperException, InterruptedException {
|
||||
// Concurrency note: In the below the accesses on regionsInTransition are
|
||||
// outside of a synchronization block where usually all accesses to RIT are
|
||||
|
@ -355,7 +355,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
|
||||
// Scan META to build list of existing regions, servers, and assignment
|
||||
// Returns servers who have not checked in (assumed dead) and their regions
|
||||
Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions(onlineServers);
|
||||
Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions();
|
||||
|
||||
// This method will assign all user regions if a clean server startup or
|
||||
// it will reconstitute master state and cleanup any leftovers from
|
||||
|
@ -368,16 +368,6 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
recoverTableInEnablingState(this.enablingTables, isWatcherCreated);
|
||||
}
|
||||
|
||||
/**
|
||||
* Only used for tests
|
||||
* @throws IOException
|
||||
* @throws KeeperException
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
void joinCluster() throws IOException, KeeperException, InterruptedException {
|
||||
joinCluster(serverManager.getOnlineServers().keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Process all regions that are in transition up in zookeeper. Used by
|
||||
* master joining an already running cluster.
|
||||
|
@ -2509,11 +2499,12 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
* in META
|
||||
* @throws IOException
|
||||
*/
|
||||
Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions(
|
||||
final Set<ServerName> onlineServers)
|
||||
Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions()
|
||||
throws IOException, KeeperException {
|
||||
// Region assignment from META
|
||||
List<Result> results = MetaReader.fullScan(this.catalogTracker);
|
||||
// Get any new but slow to checkin region server that joined the cluster
|
||||
Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
|
||||
// Map of offline servers and their regions to be returned
|
||||
Map<ServerName, List<Pair<HRegionInfo,Result>>> offlineServers =
|
||||
new TreeMap<ServerName, List<Pair<HRegionInfo, Result>>>();
|
||||
|
@ -2722,7 +2713,13 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
final List<String> nodes)
|
||||
throws KeeperException, IOException {
|
||||
if (deadServers == null) return;
|
||||
Set<ServerName> actualDeadServers = this.serverManager.getDeadServers();
|
||||
for (Map.Entry<ServerName, List<Pair<HRegionInfo, Result>>> deadServer: deadServers.entrySet()) {
|
||||
// skip regions of dead servers because SSH will process regions during rs expiration.
|
||||
// see HBASE-5916
|
||||
if (actualDeadServers.contains(deadServer.getKey())) {
|
||||
continue;
|
||||
}
|
||||
List<Pair<HRegionInfo, Result>> regions = deadServer.getValue();
|
||||
for (Pair<HRegionInfo, Result> region : regions) {
|
||||
HRegionInfo regionInfo = region.getFirst();
|
||||
|
|
|
@ -597,11 +597,9 @@ Server {
|
|||
}
|
||||
|
||||
this.assignmentManager.startTimeOutMonitor();
|
||||
Set<ServerName> onlineServers = new HashSet<ServerName>(serverManager
|
||||
.getOnlineServers().keySet());
|
||||
// TODO: Should do this in background rather than block master startup
|
||||
status.setStatus("Splitting logs after master startup");
|
||||
splitLogAfterStartup(this.fileSystemManager, onlineServers);
|
||||
splitLogAfterStartup(this.fileSystemManager);
|
||||
|
||||
// Make sure root and meta assigned before proceeding.
|
||||
if (!assignRootAndMeta(status)) return;
|
||||
|
@ -618,7 +616,7 @@ Server {
|
|||
this.balancer.setMasterServices(this);
|
||||
// Fixup assignment manager status
|
||||
status.setStatus("Starting assignment manager");
|
||||
this.assignmentManager.joinCluster(onlineServers);
|
||||
this.assignmentManager.joinCluster();
|
||||
|
||||
this.balancer.setClusterStatus(getClusterStatus());
|
||||
|
||||
|
@ -638,7 +636,11 @@ Server {
|
|||
status.markComplete("Initialization successful");
|
||||
LOG.info("Master has completed initialization");
|
||||
initialized = true;
|
||||
|
||||
// clear the dead servers with same host name and port of online server because we are not
|
||||
// removing dead server with same hostname and port of rs which is trying to check in before
|
||||
// master initialization. See HBASE-5916.
|
||||
this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();
|
||||
|
||||
if (this.cpHost != null) {
|
||||
// don't let cp initialization errors kill the master
|
||||
try {
|
||||
|
@ -662,9 +664,8 @@ Server {
|
|||
* @param mfs
|
||||
* @param onlineServers
|
||||
*/
|
||||
protected void splitLogAfterStartup(final MasterFileSystem mfs,
|
||||
Set<ServerName> onlineServers) {
|
||||
mfs.splitLogAfterStartup(onlineServers);
|
||||
protected void splitLogAfterStartup(final MasterFileSystem mfs) {
|
||||
mfs.splitLogAfterStartup();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -190,7 +190,7 @@ public class MasterFileSystem {
|
|||
* @param onlineServers Set of online servers keyed by
|
||||
* {@link ServerName}
|
||||
*/
|
||||
void splitLogAfterStartup(final Set<ServerName> onlineServers) {
|
||||
void splitLogAfterStartup() {
|
||||
boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
|
||||
HLog.SPLIT_SKIP_ERRORS_DEFAULT);
|
||||
Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME);
|
||||
|
@ -199,6 +199,10 @@ public class MasterFileSystem {
|
|||
try {
|
||||
if (!this.fs.exists(logsDirPath)) return;
|
||||
FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
|
||||
// Get online servers after getting log folders to avoid log folder deletion of newly
|
||||
// checked in region servers. See HBASE-5916
|
||||
Set<ServerName> onlineServers = ((HMaster) master).getServerManager().getOnlineServers()
|
||||
.keySet();
|
||||
|
||||
if (logFolders == null || logFolders.length == 0) {
|
||||
LOG.debug("No log files to split, proceeding...");
|
||||
|
|
|
@ -200,7 +200,10 @@ public class ServerManager {
|
|||
existingServer + " looks stale, new server:" + serverName);
|
||||
expireServer(existingServer);
|
||||
}
|
||||
throw new PleaseHoldException(message);
|
||||
if (services.isServerShutdownHandlerEnabled()) {
|
||||
// master has completed the initialization
|
||||
throw new PleaseHoldException(message);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -247,8 +250,10 @@ public class ServerManager {
|
|||
LOG.debug(message);
|
||||
throw new YouAreDeadException(message);
|
||||
}
|
||||
|
||||
if (this.deadservers.cleanPreviousInstance(serverName)) {
|
||||
// remove dead server with same hostname and port of newly checking in rs after master
|
||||
// initialization. See HBASE-5916 for more information.
|
||||
if ((this.services == null || ((HMaster) this.services).isInitialized())
|
||||
&& this.deadservers.cleanPreviousInstance(serverName)) {
|
||||
// This server has now become alive after we marked it as dead.
|
||||
// We removed its previous entry from the dead list to reflect it.
|
||||
LOG.debug(what + ":" + " Server " + serverName + " came back up," +
|
||||
|
@ -737,5 +742,18 @@ public class ServerManager {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* To clear any dead server with same host name and port of any online server
|
||||
*/
|
||||
void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
|
||||
ServerName sn = null;
|
||||
for (ServerName serverName : getOnlineServersList()) {
|
||||
while ((sn = ServerName.
|
||||
findServerWithSameHostnamePort(this.deadservers, serverName)) != null) {
|
||||
this.deadservers.remove(sn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -49,6 +49,8 @@ import org.apache.hadoop.hbase.master.TestMasterFailover;
|
|||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||
import org.apache.hadoop.hbase.LargeTests;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.junit.AfterClass;
|
||||
|
@ -98,9 +100,8 @@ public class TestRSKilledWhenMasterInitializing {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void splitLogAfterStartup(MasterFileSystem mfs,
|
||||
Set<ServerName> onlineServers) {
|
||||
super.splitLogAfterStartup(mfs, onlineServers);
|
||||
protected void splitLogAfterStartup(MasterFileSystem mfs) {
|
||||
super.splitLogAfterStartup(mfs);
|
||||
logSplit = true;
|
||||
// If "TestingMaster.sleep" is set, sleep after log split.
|
||||
if (getConfiguration().getBoolean("TestingMaster.sleep", false)) {
|
||||
|
@ -215,6 +216,10 @@ public class TestRSKilledWhenMasterInitializing {
|
|||
while (serverManager.areDeadServersInProgress()) {
|
||||
Thread.sleep(100);
|
||||
}
|
||||
// Create a ZKW to use in the test
|
||||
ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TESTUTIL);
|
||||
ZKAssign.blockUntilNoRIT(zkw);
|
||||
|
||||
table = new HTable(TESTUTIL.getConfiguration(), TABLENAME);
|
||||
resultScanner = table.getScanner(new Scan());
|
||||
count = 0;
|
||||
|
|
Loading…
Reference in New Issue