HBASE-24117 Shutdown AssignmentManager before ProcedureExecutor may cause SCP to accidentally skip assigning a region (#1865)

Signed-off-by: Michael Stack <stack@apache.org>
This commit is contained in:
Duo Zhang 2020-06-09 11:07:16 +08:00
parent 6d96694a25
commit c5dacfbbea
3 changed files with 13 additions and 3 deletions

View File

@ -1502,6 +1502,11 @@ public class HMaster extends HRegionServer implements MasterServices {
LOG.debug("Stopping service threads");
// Stop the procedure executor before other services such as the server manager and the
// assignment manager, because running procedures may still depend on those services. See
// HBASE-24117 for an example.
stopProcedureExecutor();
if (this.quotaManager != null) {
this.quotaManager.stop();
}
@ -1516,8 +1521,6 @@ public class HMaster extends HRegionServer implements MasterServices {
this.assignmentManager.stop();
}
stopProcedureExecutor();
if (masterRegion != null) {
masterRegion.close(isAborted());
}

View File

@ -24,6 +24,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
@ -485,6 +486,12 @@ public class ServerCrashProcedure
// UPDATE: HBCKServerCrashProcedure overrides isMatchingRegionLocation; this check can get
// in the way of our clearing out 'Unknown Servers'.
if (!isMatchingRegionLocation(regionNode)) {
// See HBASE-24117. Although we have already changed the shutdown order, it is still worth
// double-checking here to confirm that we do not skip assignment incorrectly.
if (!am.isRunning()) {
throw new DoNotRetryIOException(
"AssignmentManager has been stopped, can not process assignment any more");
}
LOG.info("{} found {} whose regionLocation no longer matches {}, skipping assign...",
this, regionNode, serverName);
continue;

View File

@ -163,7 +163,7 @@ public class TestCloseRegionWhileRSCrash {
UTIL.shutdownMiniCluster();
}
@org.junit.Ignore @Test // Until root-cause of flakeyness, HBASE-24117, is addressed.
@Test
public void testRetryBackoff() throws IOException, InterruptedException {
HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
RegionInfo region = srcRs.getRegions(TABLE_NAME).get(0).getRegionInfo();