HBASE-5916 RS restart just before master initialization makes the cluster non-operative (Rajesh)

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1343324 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
ramkrishna 2012-05-28 17:14:56 +00:00
parent 9aa80d2696
commit 3ee5280b6b
5 changed files with 54 additions and 29 deletions

View File

@ -343,7 +343,7 @@ public class AssignmentManager extends ZooKeeperListener {
* @throws KeeperException
* @throws InterruptedException
*/
void joinCluster(final Set<ServerName> onlineServers) throws IOException,
void joinCluster() throws IOException,
KeeperException, InterruptedException {
// Concurrency note: In the below the accesses on regionsInTransition are
// outside of a synchronization block where usually all accesses to RIT are
@ -355,7 +355,7 @@ public class AssignmentManager extends ZooKeeperListener {
// Scan META to build list of existing regions, servers, and assignment
// Returns servers who have not checked in (assumed dead) and their regions
Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions(onlineServers);
Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions();
// This method will assign all user regions if a clean server startup or
// it will reconstitute master state and cleanup any leftovers from
@ -368,16 +368,6 @@ public class AssignmentManager extends ZooKeeperListener {
recoverTableInEnablingState(this.enablingTables, isWatcherCreated);
}
/**
* Only used for tests
* @throws IOException
* @throws KeeperException
* @throws InterruptedException
*/
void joinCluster() throws IOException, KeeperException, InterruptedException {
joinCluster(serverManager.getOnlineServers().keySet());
}
/**
* Process all regions that are in transition up in zookeeper. Used by
* master joining an already running cluster.
@ -2509,11 +2499,12 @@ public class AssignmentManager extends ZooKeeperListener {
* in META
* @throws IOException
*/
Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions(
final Set<ServerName> onlineServers)
Map<ServerName, List<Pair<HRegionInfo, Result>>> rebuildUserRegions()
throws IOException, KeeperException {
// Region assignment from META
List<Result> results = MetaReader.fullScan(this.catalogTracker);
// Get any new but slow to checkin region server that joined the cluster
Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
// Map of offline servers and their regions to be returned
Map<ServerName, List<Pair<HRegionInfo,Result>>> offlineServers =
new TreeMap<ServerName, List<Pair<HRegionInfo, Result>>>();
@ -2722,7 +2713,13 @@ public class AssignmentManager extends ZooKeeperListener {
final List<String> nodes)
throws KeeperException, IOException {
if (deadServers == null) return;
Set<ServerName> actualDeadServers = this.serverManager.getDeadServers();
for (Map.Entry<ServerName, List<Pair<HRegionInfo, Result>>> deadServer: deadServers.entrySet()) {
// skip regions of dead servers because SSH will process regions during rs expiration.
// see HBASE-5916
if (actualDeadServers.contains(deadServer.getKey())) {
continue;
}
List<Pair<HRegionInfo, Result>> regions = deadServer.getValue();
for (Pair<HRegionInfo, Result> region : regions) {
HRegionInfo regionInfo = region.getFirst();

View File

@ -597,11 +597,9 @@ Server {
}
this.assignmentManager.startTimeOutMonitor();
Set<ServerName> onlineServers = new HashSet<ServerName>(serverManager
.getOnlineServers().keySet());
// TODO: Should do this in background rather than block master startup
status.setStatus("Splitting logs after master startup");
splitLogAfterStartup(this.fileSystemManager, onlineServers);
splitLogAfterStartup(this.fileSystemManager);
// Make sure root and meta assigned before proceeding.
if (!assignRootAndMeta(status)) return;
@ -618,7 +616,7 @@ Server {
this.balancer.setMasterServices(this);
// Fixup assignment manager status
status.setStatus("Starting assignment manager");
this.assignmentManager.joinCluster(onlineServers);
this.assignmentManager.joinCluster();
this.balancer.setClusterStatus(getClusterStatus());
@ -638,7 +636,11 @@ Server {
status.markComplete("Initialization successful");
LOG.info("Master has completed initialization");
initialized = true;
// clear the dead servers with same host name and port of online server because we are not
// removing dead server with same hostname and port of rs which is trying to check in before
// master initialization. See HBASE-5916.
this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();
if (this.cpHost != null) {
// don't let cp initialization errors kill the master
try {
@ -662,9 +664,8 @@ Server {
* @param mfs
* @param onlineServers
*/
protected void splitLogAfterStartup(final MasterFileSystem mfs,
Set<ServerName> onlineServers) {
mfs.splitLogAfterStartup(onlineServers);
protected void splitLogAfterStartup(final MasterFileSystem mfs) {
mfs.splitLogAfterStartup();
}
/**

View File

@ -190,7 +190,7 @@ public class MasterFileSystem {
* @param onlineServers Set of online servers keyed by
* {@link ServerName}
*/
void splitLogAfterStartup(final Set<ServerName> onlineServers) {
void splitLogAfterStartup() {
boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
HLog.SPLIT_SKIP_ERRORS_DEFAULT);
Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME);
@ -199,6 +199,10 @@ public class MasterFileSystem {
try {
if (!this.fs.exists(logsDirPath)) return;
FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null);
// Get online servers after getting log folders to avoid log folder deletion of newly
// checked in region servers. See HBASE-5916.
Set<ServerName> onlineServers = ((HMaster) master).getServerManager().getOnlineServers()
.keySet();
if (logFolders == null || logFolders.length == 0) {
LOG.debug("No log files to split, proceeding...");

View File

@ -200,7 +200,10 @@ public class ServerManager {
existingServer + " looks stale, new server:" + serverName);
expireServer(existingServer);
}
throw new PleaseHoldException(message);
if (services.isServerShutdownHandlerEnabled()) {
// master has completed the initialization
throw new PleaseHoldException(message);
}
}
}
@ -247,8 +250,10 @@ public class ServerManager {
LOG.debug(message);
throw new YouAreDeadException(message);
}
if (this.deadservers.cleanPreviousInstance(serverName)) {
// remove dead server with same hostname and port of newly checking in rs after master
// initialization. See HBASE-5916 for more information.
if ((this.services == null || ((HMaster) this.services).isInitialized())
&& this.deadservers.cleanPreviousInstance(serverName)) {
// This server has now become alive after we marked it as dead.
// We removed its previous entry from the dead list to reflect it.
LOG.debug(what + ":" + " Server " + serverName + " came back up," +
@ -737,5 +742,18 @@ public class ServerManager {
}
}
}
/**
 * Removes from the dead-server list every entry whose host name and port match
 * those of a server in the online-servers list.
 * <p>
 * For each online server the dead-server list is searched repeatedly, and each
 * match is removed, until no entry with the same host name and port remains.
 */
void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
  for (ServerName onlineServer : getOnlineServersList()) {
    ServerName staleEntry =
        ServerName.findServerWithSameHostnamePort(this.deadservers, onlineServer);
    // Keep looking after each removal: more than one dead entry may match.
    while (staleEntry != null) {
      this.deadservers.remove(staleEntry);
      staleEntry =
          ServerName.findServerWithSameHostnamePort(this.deadservers, onlineServer);
    }
  }
}
}

View File

@ -49,6 +49,8 @@ import org.apache.hadoop.hbase.master.TestMasterFailover;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.hbase.LargeTests;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
@ -98,9 +100,8 @@ public class TestRSKilledWhenMasterInitializing {
}
@Override
protected void splitLogAfterStartup(MasterFileSystem mfs,
Set<ServerName> onlineServers) {
super.splitLogAfterStartup(mfs, onlineServers);
protected void splitLogAfterStartup(MasterFileSystem mfs) {
super.splitLogAfterStartup(mfs);
logSplit = true;
// If "TestingMaster.sleep" is set, sleep after log split.
if (getConfiguration().getBoolean("TestingMaster.sleep", false)) {
@ -215,6 +216,10 @@ public class TestRSKilledWhenMasterInitializing {
while (serverManager.areDeadServersInProgress()) {
Thread.sleep(100);
}
// Create a ZKW to use in the test
ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TESTUTIL);
ZKAssign.blockUntilNoRIT(zkw);
table = new HTable(TESTUTIL.getConfiguration(), TABLENAME);
resultScanner = table.getScanner(new Scan());
count = 0;