From 17055cdc70fd5b19b8828ff235b8170ab43c6894 Mon Sep 17 00:00:00 2001 From: zhangduo Date: Wed, 14 Nov 2018 21:22:24 +0800 Subject: [PATCH] HBASE-21472 Should not persist the dispatched field for RegionRemoteProcedureBase --- .../src/main/protobuf/MasterProcedure.proto | 1 - .../assignment/CloseRegionProcedure.java | 2 +- .../assignment/OpenRegionProcedure.java | 2 +- .../assignment/RegionRemoteProcedureBase.java | 26 ++- ...RegionAssignedToMultipleRegionServers.java | 180 ++++++++++++++++++ 5 files changed, 201 insertions(+), 10 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionAssignedToMultipleRegionServers.java diff --git a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto index 6ebb252d07d..765b3f35a84 100644 --- a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto +++ b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto @@ -480,7 +480,6 @@ message RegionStateTransitionStateData { message RegionRemoteProcedureBaseStateData { required RegionInfo region = 1; required ServerName target_server = 2; - required bool dispatched = 3; } message OpenRegionProcedureStateData { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java index 0d22a9c0164..fd672fa036d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java @@ -83,6 +83,6 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase { @Override protected boolean shouldDispatch(RegionStateNode regionNode) { - return !regionNode.isInState(RegionState.State.CLOSED, RegionState.State.ABNORMALLY_CLOSED); + return regionNode.isInState(RegionState.State.CLOSING); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java index 125ef115735..ed5836dcfba 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java @@ -68,6 +68,6 @@ public class OpenRegionProcedure extends RegionRemoteProcedureBase { @Override protected boolean shouldDispatch(RegionStateNode regionNode) { - return !regionNode.isInState(RegionState.State.OPEN); + return regionNode.isInState(RegionState.State.OPENING); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java index 3e05cde1d21..08097593a43 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java @@ -107,6 +107,17 @@ public abstract class RegionRemoteProcedureBase extends Procedure - * Usually this will not happen if we do not allow assigning a already onlined region. But if we - * have something wrong in the RSProcedureDispatcher, where we have already sent the request to - * RS, but then we tell the upper layer the remote call is failed due to rpc timeout or connection - * closed or anything else, then this issue can still happen. So here we add a check to make it - * more robust. + * This could happen when master restarts. Since we do not know whether a request has already been + * sent to the region server after we add a remote operation to the dispatcher, so the safe way is + * to not persist the dispatched field and try to add the remote operation again. But it is + * possible that we do have already sent the request to region server and it has also sent back + * the response, so here we need to check the region state, if it is not in the expecting state, + * we should give up, otherwise we may hang for ever, as the region server will just ignore + * redundant calls. */ protected abstract boolean shouldDispatch(RegionStateNode regionNode); @@ -165,7 +178,7 @@ public abstract class RegionRemoteProcedureBase extends Procedure EXCLUDE_SERVERS = new ArrayList<>(); + + private static boolean HALT = false; + + private static boolean KILL = false; + + private static CountDownLatch ARRIVE; + + private static final class ServerManagerForTest extends ServerManager { + + public ServerManagerForTest(MasterServices master) { + super(master); + } + + @Override + public List createDestinationServersList() { + return super.createDestinationServersList(EXCLUDE_SERVERS); + } + } + + private static final class AssignmentManagerForTest extends AssignmentManager { + + public AssignmentManagerForTest(MasterServices master) { + super(master); + } + + @Override + public ReportRegionStateTransitionResponse reportRegionStateTransition( + ReportRegionStateTransitionRequest req) throws PleaseHoldException { + if (req.getTransition(0).getTransitionCode() == TransitionCode.OPENED) { + if (ARRIVE != null) { + ARRIVE.countDown(); + ARRIVE = null; + } + while (HALT) { + try { + Thread.sleep(100); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + if (KILL) { + throw new PleaseHoldException("Inject error!"); + } + } + } + return super.reportRegionStateTransition(req); + } + } + + public static final class HMasterForTest extends HMaster { + + public HMasterForTest(Configuration conf) throws IOException, KeeperException { + super(conf); + } + + @Override + protected AssignmentManager createAssignmentManager(MasterServices master) { + return new AssignmentManagerForTest(master); + } + + @Override + protected ServerManager createServerManager(MasterServices master) throws IOException { + setupClusterConnection(); + return new ServerManagerForTest(master); + } + } + + private static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); + + private static TableName NAME = TableName.valueOf("Assign"); + + private static byte[] CF = Bytes.toBytes("cf"); + + @BeforeClass + public static void setUp() throws Exception { + UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class); + UTIL + .startMiniCluster(StartMiniClusterOption.builder().numMasters(2).numRegionServers(2).build()); + UTIL.createTable(NAME, CF); + UTIL.waitTableAvailable(NAME); + UTIL.getAdmin().balancerSwitch(false, true); + } + + @AfterClass + public static void tearDown() throws Exception { + UTIL.shutdownMiniCluster(); + } + + @Test + public void test() throws Exception { + RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo(); + AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(); + RegionStateNode rsn = am.getRegionStates().getRegionStateNode(region); + + ServerName sn = rsn.getRegionLocation(); + ARRIVE = new CountDownLatch(1); + HALT = true; + am.moveAsync(new RegionPlan(region, sn, sn)); + ARRIVE.await(); + + // let's restart the master + EXCLUDE_SERVERS.add(rsn.getRegionLocation()); + KILL = true; + HMaster activeMaster = UTIL.getMiniHBaseCluster().getMaster(); + activeMaster.abort("For testing"); + activeMaster.getThread().join(); + KILL = false; + + // sleep a while to reproduce the problem, as after the fix in HBASE-21472 the execution logic + // is changed so the old code to reproduce the problem can not compile... + Thread.sleep(10000); + HALT = false; + Thread.sleep(5000); + + HRegionServer rs = UTIL.getMiniHBaseCluster().getRegionServer(sn); + assertNotNull(rs.getRegion(region.getEncodedName())); + assertNull(UTIL.getOtherRegionServer(rs).getRegion(region.getEncodedName())); + } +}