diff --git a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
index 6ebb252d07d..765b3f35a84 100644
--- a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
+++ b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
@@ -480,7 +480,6 @@ message RegionStateTransitionStateData {
message RegionRemoteProcedureBaseStateData {
required RegionInfo region = 1;
required ServerName target_server = 2;
- required bool dispatched = 3;
}
message OpenRegionProcedureStateData {
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java
index 0d22a9c0164..fd672fa036d 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java
@@ -83,6 +83,6 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase {
@Override
protected boolean shouldDispatch(RegionStateNode regionNode) {
- return !regionNode.isInState(RegionState.State.CLOSED, RegionState.State.ABNORMALLY_CLOSED);
+ return regionNode.isInState(RegionState.State.CLOSING);
}
}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java
index 125ef115735..ed5836dcfba 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java
@@ -68,6 +68,6 @@ public class OpenRegionProcedure extends RegionRemoteProcedureBase {
@Override
protected boolean shouldDispatch(RegionStateNode regionNode) {
- return !regionNode.isInState(RegionState.State.OPEN);
+ return regionNode.isInState(RegionState.State.OPENING);
}
}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java
index 3e05cde1d21..08097593a43 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java
@@ -107,6 +107,17 @@ public abstract class RegionRemoteProcedureBase extends Procedure
- * Usually this will not happen if we do not allow assigning a already onlined region. But if we
- * have something wrong in the RSProcedureDispatcher, where we have already sent the request to
- * RS, but then we tell the upper layer the remote call is failed due to rpc timeout or connection
- * closed or anything else, then this issue can still happen. So here we add a check to make it
- * more robust.
+ * This could happen when master restarts. Since we do not know whether a request has already been
+ * sent to the region server after we add a remote operation to the dispatcher, so the safe way is
+ * to not persist the dispatched field and try to add the remote operation again. But it is
+ * possible that we do have already sent the request to region server and it has also sent back
+ * the response, so here we need to check the region state, if it is not in the expecting state,
+ * we should give up, otherwise we may hang for ever, as the region server will just ignore
+ * redundant calls.
*/
protected abstract boolean shouldDispatch(RegionStateNode regionNode);
@@ -165,7 +178,7 @@ public abstract class RegionRemoteProcedureBase extends Procedure EXCLUDE_SERVERS = new ArrayList<>();
+
+ private static boolean HALT = false;
+
+ private static boolean KILL = false;
+
+ private static CountDownLatch ARRIVE;
+
+ private static final class ServerManagerForTest extends ServerManager {
+
+ public ServerManagerForTest(MasterServices master) {
+ super(master);
+ }
+
+ @Override
+ public List createDestinationServersList() {
+ return super.createDestinationServersList(EXCLUDE_SERVERS);
+ }
+ }
+
+ private static final class AssignmentManagerForTest extends AssignmentManager {
+
+ public AssignmentManagerForTest(MasterServices master) {
+ super(master);
+ }
+
+ @Override
+ public ReportRegionStateTransitionResponse reportRegionStateTransition(
+ ReportRegionStateTransitionRequest req) throws PleaseHoldException {
+ if (req.getTransition(0).getTransitionCode() == TransitionCode.OPENED) {
+ if (ARRIVE != null) {
+ ARRIVE.countDown();
+ ARRIVE = null;
+ }
+ while (HALT) {
+ try {
+ Thread.sleep(100);
+ } catch (InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+ if (KILL) {
+ throw new PleaseHoldException("Inject error!");
+ }
+ }
+ }
+ return super.reportRegionStateTransition(req);
+ }
+ }
+
+ public static final class HMasterForTest extends HMaster {
+
+ public HMasterForTest(Configuration conf) throws IOException, KeeperException {
+ super(conf);
+ }
+
+ @Override
+ protected AssignmentManager createAssignmentManager(MasterServices master) {
+ return new AssignmentManagerForTest(master);
+ }
+
+ @Override
+ protected ServerManager createServerManager(MasterServices master) throws IOException {
+ setupClusterConnection();
+ return new ServerManagerForTest(master);
+ }
+ }
+
+ private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+
+ private static TableName NAME = TableName.valueOf("Assign");
+
+ private static byte[] CF = Bytes.toBytes("cf");
+
+ @BeforeClass
+ public static void setUp() throws Exception {
+ UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class);
+ UTIL
+ .startMiniCluster(StartMiniClusterOption.builder().numMasters(2).numRegionServers(2).build());
+ UTIL.createTable(NAME, CF);
+ UTIL.waitTableAvailable(NAME);
+ UTIL.getAdmin().balancerSwitch(false, true);
+ }
+
+ @AfterClass
+ public static void tearDown() throws Exception {
+ UTIL.shutdownMiniCluster();
+ }
+
+ @Test
+ public void test() throws Exception {
+ RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo();
+ AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager();
+ RegionStateNode rsn = am.getRegionStates().getRegionStateNode(region);
+
+ ServerName sn = rsn.getRegionLocation();
+ ARRIVE = new CountDownLatch(1);
+ HALT = true;
+ am.moveAsync(new RegionPlan(region, sn, sn));
+ ARRIVE.await();
+
+ // let's restart the master
+ EXCLUDE_SERVERS.add(rsn.getRegionLocation());
+ KILL = true;
+ HMaster activeMaster = UTIL.getMiniHBaseCluster().getMaster();
+ activeMaster.abort("For testing");
+ activeMaster.getThread().join();
+ KILL = false;
+
+ // sleep a while to reproduce the problem, as after the fix in HBASE-21472 the execution logic
+ // is changed so the old code to reproduce the problem can not compile...
+ Thread.sleep(10000);
+ HALT = false;
+ Thread.sleep(5000);
+
+ HRegionServer rs = UTIL.getMiniHBaseCluster().getRegionServer(sn);
+ assertNotNull(rs.getRegion(region.getEncodedName()));
+ assertNull(UTIL.getOtherRegionServer(rs).getRegion(region.getEncodedName()));
+ }
+}