HBASE-25774 ServerManager.getOnlineServer may miss some region servers when refreshing state in some procedure implementations

Revert "HBASE-25032 Wait for region server to become online before adding it to online servers in Master (#2770)"

This reverts commit 7652bef985.
This commit is contained in:
Andrew Purtell 2021-05-07 17:56:22 -07:00
parent e2b5c20dc9
commit 39fdeae838
5 changed files with 29 additions and 24 deletions

View File

@ -255,8 +255,7 @@ public class ServerManager {
} }
/** /**
* Let the server manager know a regionserver is requesting configurations. * Let the server manager know a new regionserver has come online
* Regionserver will not be added here, but in its first report.
* @param request the startup request * @param request the startup request
* @param ia the InetAddress from which request is received * @param ia the InetAddress from which request is received
* @return The ServerName we know this server as. * @return The ServerName we know this server as.
@ -278,6 +277,10 @@ public class ServerManager {
request.getServerStartCode()); request.getServerStartCode());
checkClockSkew(sn, request.getServerCurrentTime()); checkClockSkew(sn, request.getServerCurrentTime());
checkIsDead(sn, "STARTUP"); checkIsDead(sn, "STARTUP");
if (!checkAndRecordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD)) {
LOG.warn("THIS SHOULD NOT HAPPEN, RegionServerStartup"
+ " could not record the server: " + sn);
}
return sn; return sn;
} }
@ -340,6 +343,7 @@ public class ServerManager {
if (null == this.onlineServers.replace(sn, sl)) { if (null == this.onlineServers.replace(sn, sl)) {
// Already have this host+port combo and its just different start code? // Already have this host+port combo and its just different start code?
// Just let the server in. Presume master joining a running cluster. // Just let the server in. Presume master joining a running cluster.
// recordNewServer is what happens at the end of reportServerStartup.
// The only thing we are skipping is passing back to the regionserver // The only thing we are skipping is passing back to the regionserver
// the ServerName to use. Here we presume a master has already done // the ServerName to use. Here we presume a master has already done
// that so we'll press on with whatever it gave us for ServerName. // that so we'll press on with whatever it gave us for ServerName.

View File

@ -1027,9 +1027,9 @@ public class HRegionServer extends HasThread implements
this.rsHost = new RegionServerCoprocessorHost(this, this.conf); this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
} }
// Get configurations from the Master. Break if server is stopped or // Try and register with the Master; tell it we are here. Break if server is stopped or the
// the clusterup flag is down or hdfs went wacky. Then start up all Services. // clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and start
// Use RetryCounter to get backoff in case Master is struggling to come up. // up all Services. Use RetryCounter to get backoff in case Master is struggling to come up.
RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE, RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE,
this.sleeper.getPeriod(), 1000 * 60 * 5); this.sleeper.getPeriod(), 1000 * 60 * 5);
RetryCounter rc = rcf.create(); RetryCounter rc = rcf.create();
@ -1056,7 +1056,7 @@ public class HRegionServer extends HasThread implements
rsQuotaManager.start(getRpcServer().getScheduler()); rsQuotaManager.start(getRpcServer().getScheduler());
} }
// Run mode. // We registered with the Master. Go into run mode.
long lastMsg = System.currentTimeMillis(); long lastMsg = System.currentTimeMillis();
long oldRequestCount = -1; long oldRequestCount = -1;
// The main run loop. // The main run loop.
@ -1090,14 +1090,7 @@ public class HRegionServer extends HasThread implements
} }
long now = System.currentTimeMillis(); long now = System.currentTimeMillis();
if ((now - lastMsg) >= msgInterval) { if ((now - lastMsg) >= msgInterval) {
// Register with the Master now that our setup is complete. tryRegionServerReport(lastMsg, now);
if (tryRegionServerReport(lastMsg, now) && !online.get()) {
// Wake up anyone waiting for this server to online
synchronized (online) {
online.set(true);
online.notifyAll();
}
}
lastMsg = System.currentTimeMillis(); lastMsg = System.currentTimeMillis();
} }
if (!isStopped() && !isAborted()) { if (!isStopped() && !isAborted()) {
@ -1289,12 +1282,12 @@ public class HRegionServer extends HasThread implements
} }
@InterfaceAudience.Private @InterfaceAudience.Private
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
throws IOException { throws IOException {
RegionServerStatusService.BlockingInterface rss = rssStub; RegionServerStatusService.BlockingInterface rss = rssStub;
if (rss == null) { if (rss == null) {
// the current server could be stopping. // the current server could be stopping.
return false; return;
} }
ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime); ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime);
try { try {
@ -1314,9 +1307,7 @@ public class HRegionServer extends HasThread implements
// Couldn't connect to the master, get location from zk and reconnect // Couldn't connect to the master, get location from zk and reconnect
// Method blocks until new master is found or we are stopped // Method blocks until new master is found or we are stopped
createRegionServerStatusStub(true); createRegionServerStatusStub(true);
return false;
} }
return true;
} }
ClusterStatusProtos.ServerLoad buildServerLoad(long reportStartTime, long reportEndTime) ClusterStatusProtos.ServerLoad buildServerLoad(long reportStartTime, long reportEndTime)
@ -1580,6 +1571,11 @@ public class HRegionServer extends HasThread implements
", sessionid=0x" + ", sessionid=0x" +
Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId())); Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()));
// Wake up anyone waiting for this server to online
synchronized (online) {
online.set(true);
online.notifyAll();
}
} catch (Throwable e) { } catch (Throwable e) {
stop("Failed initialization"); stop("Failed initialization");
throw convertThrowableToIOE(cleanup(e, "Failed init"), throw convertThrowableToIOE(cleanup(e, "Failed init"),
@ -2558,9 +2554,10 @@ public class HRegionServer extends HasThread implements
} }
/* /*
* Run initialization using parameters passed us by the master. * Let the master know we're here Run initialization using parameters passed
* us by the master.
* @return A Map of key/value configurations we got from the Master else * @return A Map of key/value configurations we got from the Master else
* null if we failed during report. * null if we failed to register.
* @throws IOException * @throws IOException
*/ */
private RegionServerStartupResponse reportForDuty() throws IOException { private RegionServerStartupResponse reportForDuty() throws IOException {

View File

@ -60,9 +60,8 @@ public class TestGetReplicationLoad {
} }
@Override @Override
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) { protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
// do nothing // do nothing
return true;
} }
} }

View File

@ -54,6 +54,12 @@ public class TestMasterMetrics {
KeeperException, InterruptedException { KeeperException, InterruptedException {
super(conf, cp); super(conf, cp);
} }
@Override
protected void tryRegionServerReport(
long reportStartTime, long reportEndTime) {
// do nothing
}
} }
@BeforeClass @BeforeClass

View File

@ -74,14 +74,13 @@ public class TestCompactionInDeadRegionServer {
} }
@Override @Override
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
throws IOException { throws IOException {
try { try {
super.tryRegionServerReport(reportStartTime, reportEndTime); super.tryRegionServerReport(reportStartTime, reportEndTime);
} catch (YouAreDeadException e) { } catch (YouAreDeadException e) {
// ignore, do not abort // ignore, do not abort
} }
return true;
} }
} }