HBASE-19998 Flakey TestVisibilityLabelsWithDefaultVisLabelService

Only call server.checkIfShouldMoveSystemRegionAsync if a node has been
added. Do not call it if only one regionserver in cluster. Make it
so ServerCrashProcedure runs before it. Add logging if
server.checkIfShouldMoveSystemRegionAsync was responsible for
MOVE (Previous was a mystery when it cut in).

Previous we'd call it when there was a nodeChildrenChanged. These
happen before nodeDeleted. If a server crashed,
checkIfShouldMoveSystemRegionAsync could run first, find the
server that had not yet registered as crashed, find system
tables on it and then try to move them. It would fail because
server would not respond to RPC. The region move would then
be waiting on the servercrashprocedure to wake it up when
done processing but this move had locked the region so
SCP couldn't run....
This commit is contained in:
Michael Stack 2018-02-14 22:32:29 -08:00
parent 816d860228
commit 50c705dad9
4 changed files with 39 additions and 6 deletions

View File

@ -486,6 +486,11 @@ public interface MasterServices extends Server {
public String getRegionServerVersion(final ServerName sn);
/**
* Called when a new RegionServer is added to the cluster.
* Checks if new server has a newer version than any existing server and will move system tables
* there if so.
*/
public void checkIfShouldMoveSystemRegionAsync();
/**

View File

@ -104,9 +104,6 @@ public class RegionServerTracker extends ZKListener {
}
}
}
if (server.isInitialized()) {
server.checkIfShouldMoveSystemRegionAsync();
}
}
private void remove(final ServerName sn) {
@ -115,6 +112,19 @@ public class RegionServerTracker extends ZKListener {
}
}
@Override
public void nodeCreated(String path) {
if (path.startsWith(watcher.znodePaths.rsZNode)) {
String serverName = ZKUtil.getNodeName(path);
LOG.info("RegionServer ephemeral node created, adding [" + serverName + "]");
if (server.isInitialized()) {
// Only call the check to move servers if a RegionServer was added to the cluster; in this
// case it could be a server with a new version so it makes sense to run the check.
server.checkIfShouldMoveSystemRegionAsync();
}
}
}
@Override
public void nodeDeleted(String path) {
if (path.startsWith(watcher.znodePaths.rsZNode)) {

View File

@ -455,12 +455,27 @@ public class AssignmentManager implements ServerListener {
* If so, move all system table regions to RS with the highest version to keep compatibility.
* The reason is, RS in new version may not be able to access RS in old version when there are
* some incompatible changes.
* <p>This method is called when a new RegionServer is added to cluster only.</p>
*/
public void checkIfShouldMoveSystemRegionAsync() {
// TODO: Fix this thread. If a server is killed and a new one started, this thread thinks that
// it should 'move' the system tables from the old server to the new server but
// ServerCrashProcedure is on it; and it will take care of the assign without dataloss.
if (this.master.getServerManager().countOfRegionServers() <= 1) {
return;
}
// This thread used to run whenever there was a change in the cluster. The ZooKeeper
// childrenChanged notification came in before the nodeDeleted message and so this method
// cold run before a ServerCrashProcedure could run. That meant that this thread could see
// a Crashed Server before ServerCrashProcedure and it could find system regions on the
// crashed server and go move them before ServerCrashProcedure had a chance; could be
// dataloss too if WALs were not recovered.
new Thread(() -> {
try {
synchronized (checkIfShouldMoveSystemRegionLock) {
List<RegionPlan> plans = new ArrayList<>();
// TODO: I don't think this code does a good job if all servers in cluster have same
// version. It looks like it will schedule unnecessary moves.
for (ServerName server : getExcludedServersForSystemTable()) {
if (master.getServerManager().isServerDead(server)) {
// TODO: See HBASE-18494 and HBASE-18495. Though getExcludedServersForSystemTable()
@ -471,13 +486,15 @@ public class AssignmentManager implements ServerListener {
// handling.
continue;
}
List<RegionInfo> regionsShouldMove = getCarryingSystemTables(server);
List<RegionInfo> regionsShouldMove = getSystemTables(server);
if (!regionsShouldMove.isEmpty()) {
for (RegionInfo regionInfo : regionsShouldMove) {
// null value for dest forces destination server to be selected by balancer
RegionPlan plan = new RegionPlan(regionInfo, server, null);
if (regionInfo.isMetaRegion()) {
// Must move meta region first.
LOG.info("Async MOVE of {} to newer Server={}",
regionInfo.getEncodedName(), server);
moveAsync(plan);
} else {
plans.add(plan);
@ -485,6 +502,8 @@ public class AssignmentManager implements ServerListener {
}
}
for (RegionPlan plan : plans) {
LOG.info("Async MOVE of {} to newer Server={}",
plan.getRegionInfo().getEncodedName(), server);
moveAsync(plan);
}
}
@ -495,7 +514,7 @@ public class AssignmentManager implements ServerListener {
}).start();
}
private List<RegionInfo> getCarryingSystemTables(ServerName serverName) {
private List<RegionInfo> getSystemTables(ServerName serverName) {
Set<RegionStateNode> regions = this.getRegionStates().getServerNode(serverName).getRegions();
if (regions == null) {
return new ArrayList<>();

View File

@ -54,7 +54,6 @@ public class MoveRegionProcedure extends AbstractStateMachineRegionProcedure<Mov
public MoveRegionProcedure(final MasterProcedureEnv env, final RegionPlan plan) {
super(env, plan.getRegionInfo());
this.plan = plan;
LOG.info("REMOVE", new Throwable("REMOVE: Just to see who is calling Move!!!"));
}
@Override