HBASE-25774 ServerManager.getOnlineServer may miss some region servers when refreshing state in some procedure implementations
Revert "HBASE-25032 Wait for region server to become online before adding it to online servers in Master (#2770)"
This reverts commit 7652bef985
.
This commit is contained in:
parent
e2b5c20dc9
commit
39fdeae838
|
@ -255,8 +255,7 @@ public class ServerManager {
|
|||
}
|
||||
|
||||
/**
|
||||
* Let the server manager know a regionserver is requesting configurations.
|
||||
* Regionserver will not be added here, but in its first report.
|
||||
* Let the server manager know a new regionserver has come online
|
||||
* @param request the startup request
|
||||
* @param ia the InetAddress from which request is received
|
||||
* @return The ServerName we know this server as.
|
||||
|
@ -278,6 +277,10 @@ public class ServerManager {
|
|||
request.getServerStartCode());
|
||||
checkClockSkew(sn, request.getServerCurrentTime());
|
||||
checkIsDead(sn, "STARTUP");
|
||||
if (!checkAndRecordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD)) {
|
||||
LOG.warn("THIS SHOULD NOT HAPPEN, RegionServerStartup"
|
||||
+ " could not record the server: " + sn);
|
||||
}
|
||||
return sn;
|
||||
}
|
||||
|
||||
|
@ -340,6 +343,7 @@ public class ServerManager {
|
|||
if (null == this.onlineServers.replace(sn, sl)) {
|
||||
// Already have this host+port combo and its just different start code?
|
||||
// Just let the server in. Presume master joining a running cluster.
|
||||
// recordNewServer is what happens at the end of reportServerStartup.
|
||||
// The only thing we are skipping is passing back to the regionserver
|
||||
// the ServerName to use. Here we presume a master has already done
|
||||
// that so we'll press on with whatever it gave us for ServerName.
|
||||
|
|
|
@ -1027,9 +1027,9 @@ public class HRegionServer extends HasThread implements
|
|||
this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
|
||||
}
|
||||
|
||||
// Get configurations from the Master. Break if server is stopped or
|
||||
// the clusterup flag is down or hdfs went wacky. Then start up all Services.
|
||||
// Use RetryCounter to get backoff in case Master is struggling to come up.
|
||||
// Try and register with the Master; tell it we are here. Break if server is stopped or the
|
||||
// clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and start
|
||||
// up all Services. Use RetryCounter to get backoff in case Master is struggling to come up.
|
||||
RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE,
|
||||
this.sleeper.getPeriod(), 1000 * 60 * 5);
|
||||
RetryCounter rc = rcf.create();
|
||||
|
@ -1056,7 +1056,7 @@ public class HRegionServer extends HasThread implements
|
|||
rsQuotaManager.start(getRpcServer().getScheduler());
|
||||
}
|
||||
|
||||
// Run mode.
|
||||
// We registered with the Master. Go into run mode.
|
||||
long lastMsg = System.currentTimeMillis();
|
||||
long oldRequestCount = -1;
|
||||
// The main run loop.
|
||||
|
@ -1090,14 +1090,7 @@ public class HRegionServer extends HasThread implements
|
|||
}
|
||||
long now = System.currentTimeMillis();
|
||||
if ((now - lastMsg) >= msgInterval) {
|
||||
// Register with the Master now that our setup is complete.
|
||||
if (tryRegionServerReport(lastMsg, now) && !online.get()) {
|
||||
// Wake up anyone waiting for this server to online
|
||||
synchronized (online) {
|
||||
online.set(true);
|
||||
online.notifyAll();
|
||||
}
|
||||
}
|
||||
tryRegionServerReport(lastMsg, now);
|
||||
lastMsg = System.currentTimeMillis();
|
||||
}
|
||||
if (!isStopped() && !isAborted()) {
|
||||
|
@ -1289,12 +1282,12 @@ public class HRegionServer extends HasThread implements
|
|||
}
|
||||
|
||||
@InterfaceAudience.Private
|
||||
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
|
||||
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
|
||||
throws IOException {
|
||||
RegionServerStatusService.BlockingInterface rss = rssStub;
|
||||
if (rss == null) {
|
||||
// the current server could be stopping.
|
||||
return false;
|
||||
return;
|
||||
}
|
||||
ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime);
|
||||
try {
|
||||
|
@ -1314,9 +1307,7 @@ public class HRegionServer extends HasThread implements
|
|||
// Couldn't connect to the master, get location from zk and reconnect
|
||||
// Method blocks until new master is found or we are stopped
|
||||
createRegionServerStatusStub(true);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
ClusterStatusProtos.ServerLoad buildServerLoad(long reportStartTime, long reportEndTime)
|
||||
|
@ -1580,6 +1571,11 @@ public class HRegionServer extends HasThread implements
|
|||
", sessionid=0x" +
|
||||
Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()));
|
||||
|
||||
// Wake up anyone waiting for this server to online
|
||||
synchronized (online) {
|
||||
online.set(true);
|
||||
online.notifyAll();
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
stop("Failed initialization");
|
||||
throw convertThrowableToIOE(cleanup(e, "Failed init"),
|
||||
|
@ -2558,9 +2554,10 @@ public class HRegionServer extends HasThread implements
|
|||
}
|
||||
|
||||
/*
|
||||
* Run initialization using parameters passed us by the master.
|
||||
* Let the master know we're here Run initialization using parameters passed
|
||||
* us by the master.
|
||||
* @return A Map of key/value configurations we got from the Master else
|
||||
* null if we failed during report.
|
||||
* null if we failed to register.
|
||||
* @throws IOException
|
||||
*/
|
||||
private RegionServerStartupResponse reportForDuty() throws IOException {
|
||||
|
|
|
@ -60,9 +60,8 @@ public class TestGetReplicationLoad {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) {
|
||||
protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
|
||||
// do nothing
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -54,6 +54,12 @@ public class TestMasterMetrics {
|
|||
KeeperException, InterruptedException {
|
||||
super(conf, cp);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void tryRegionServerReport(
|
||||
long reportStartTime, long reportEndTime) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
|
||||
@BeforeClass
|
||||
|
|
|
@ -74,14 +74,13 @@ public class TestCompactionInDeadRegionServer {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
|
||||
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
|
||||
throws IOException {
|
||||
try {
|
||||
super.tryRegionServerReport(reportStartTime, reportEndTime);
|
||||
} catch (YouAreDeadException e) {
|
||||
// ignore, do not abort
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue