HBASE-3829 TestMasterFailover failures in jenkins

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1097676 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2011-04-29 03:33:27 +00:00
parent d6ed1130d5
commit 7d42c217a9
2 changed files with 53 additions and 63 deletions

View File

@ -91,6 +91,7 @@ Release 0.91.0 - Unreleased
HBASE-3210 HBASE-1921 for the new master
HBASE-3827 hbase-1502, removing heartbeats, broke master joining a running
cluster and was returning master hostname for rs to use
HBASE-3829 TestMasterFailover failures in jenkins
IMPROVEMENTS
HBASE-3290 Max Compaction Size (Nicolas Spiegelberg via Stack)

View File

@ -599,11 +599,17 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
}
try {
// Try and register with the Master; tell it we are here.
while (!this.stopped) {
if (tryReportForDuty()) break;
LOG.warn("No response on reportForDuty. Sleeping and then retrying.");
this.sleeper.sleep();
// Try and register with the Master; tell it we are here. Break if
// server is stopped or the clusterup flag is down or hdfs went wacky.
while (keepLooping()) {
MapWritable w = reportForDuty();
if (w == null) {
LOG.warn("reportForDuty failed; sleeping and then retrying.");
this.sleeper.sleep();
} else {
handleReportForDutyResponse(w);
break;
}
}
// We registered with the Master. Go into run mode.
@ -617,8 +623,10 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
stop("Exiting; cluster shutdown set and not carrying any regions");
} else if (!this.stopping) {
this.stopping = true;
LOG.info("Closing user regions");
closeUserRegions(this.abortRequested);
} else if (this.stopping && LOG.isDebugEnabled()) {
LOG.info("Only meta regions remain open");
if (!onlyMetaRegionsRemaining) {
onlyMetaRegionsRemaining = isOnlyMetaRegionsRemaining();
}
@ -730,22 +738,19 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
HServerLoad hsl = buildServerLoad();
// Why we do this?
this.requestCount.set(0);
while (!this.stopped) {
try {
this.hbaseMaster.regionServerReport(this.serverNameFromMasterPOV.getBytes(), hsl);
break;
} catch (IOException ioe) {
if (ioe instanceof RemoteException) {
ioe = ((RemoteException)ioe).unwrapRemoteException();
}
if (ioe instanceof YouAreDeadException) {
// This will be caught and handled as a fatal error in run()
throw ioe;
}
// Couldn't connect to the master, get location from zk and reconnect
// Method blocks until new master is found or we are stopped
getMaster();
try {
this.hbaseMaster.regionServerReport(this.serverNameFromMasterPOV.getBytes(), hsl);
} catch (IOException ioe) {
if (ioe instanceof RemoteException) {
ioe = ((RemoteException)ioe).unwrapRemoteException();
}
if (ioe instanceof YouAreDeadException) {
// This will be caught and handled as a fatal error in run()
throw ioe;
}
// Couldn't connect to the master, get location from zk and reconnect
// Method blocks until new master is found or we are stopped
getMaster();
}
}
@ -1431,7 +1436,7 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
Threads.shutdown(this.cacheFlusher);
Threads.shutdown(this.compactSplitThread);
Threads.shutdown(this.hlogRoller);
this.service.shutdown();
if (this.service != null) this.service.shutdown();
if (this.replicationHandler != null) {
this.replicationHandler.join();
}
@ -1448,16 +1453,14 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
private ServerName getMaster() {
ServerName masterServerName = null;
while ((masterServerName = this.masterAddressManager.getMasterAddress()) == null) {
if (stopped) {
return null;
}
LOG.debug("No master found, will retry");
if (!keepLooping()) return null;
LOG.debug("No master found; retry");
sleeper.sleep();
}
InetSocketAddress isa =
new InetSocketAddress(masterServerName.getHostname(), masterServerName.getPort());
HMasterRegionInterface master = null;
while (!stopped && master == null) {
while (keepLooping() && master == null) {
LOG.info("Attempting connect to Master server at " +
this.masterAddressManager.getMasterAddress());
try {
@ -1484,16 +1487,11 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
}
/**
* @return True if successfully invoked {@link #reportForDuty()}
* @throws IOException
* @return True if we should keep looping; false if we should break loop
* because cluster is going down or this server has been stopped or hdfs has
* gone bad.
*/
private boolean tryReportForDuty() throws IOException {
MapWritable w = reportForDuty();
if (w != null) {
handleReportForDutyResponse(w);
return true;
}
return false;
private boolean keepLooping() {
return !this.stopped && isClusterUp();
}
/*
@ -1504,36 +1502,27 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
* @throws IOException
*/
private MapWritable reportForDuty() throws IOException {
ServerName masterServerName = null;
while (!stopped && (masterServerName = getMaster()) == null) {
LOG.warn("Unable to get master for initialization -- sleeping");
sleeper.sleep();
}
MapWritable result = null;
long lastMsg = 0;
while (!stopped) {
try {
this.requestCount.set(0);
LOG.info("Telling master at " + masterServerName + " that we are up " +
"with port=" + this.isa.getPort() + ", startcode=" + this.startcode);
lastMsg = EnvironmentEdgeManager.currentTimeMillis();
int port = this.isa.getPort();
result = this.hbaseMaster.regionServerStartup(port, this.startcode, lastMsg);
break;
} catch (RemoteException e) {
IOException ioe = e.unwrapRemoteException();
if (ioe instanceof ClockOutOfSyncException) {
LOG.fatal("Master rejected startup because clock is out of sync",
ioe);
// Re-throw IOE will cause RS to abort
throw ioe;
} else {
LOG.warn("remote error telling master we are up", e);
}
} catch (IOException e) {
LOG.warn("error telling master we are up", e);
ServerName masterServerName = getMaster();
if (masterServerName == null) return result;
try {
this.requestCount.set(0);
LOG.info("Telling master at " + masterServerName + " that we are up " +
"with port=" + this.isa.getPort() + ", startcode=" + this.startcode);
long now = EnvironmentEdgeManager.currentTimeMillis();
int port = this.isa.getPort();
result = this.hbaseMaster.regionServerStartup(port, this.startcode, now);
} catch (RemoteException e) {
IOException ioe = e.unwrapRemoteException();
if (ioe instanceof ClockOutOfSyncException) {
LOG.fatal("Master rejected startup because clock is out of sync", ioe);
// Re-throw IOE will cause RS to abort
throw ioe;
} else {
LOG.warn("remote error telling master we are up", e);
}
sleeper.sleep(lastMsg);
} catch (IOException e) {
LOG.warn("error telling master we are up", e);
}
return result;
}