From 8cdb2cca4461d6adad3f44af001055848a205370 Mon Sep 17 00:00:00 2001 From: BukrosSzabolcs Date: Wed, 24 Jun 2020 19:38:36 +0200 Subject: [PATCH] HBASE-24562: Stabilize master startup with meta replicas enabled (#1903) Signed-off-by: Wellington Chevreuil Signed-off-by: Huaxiang Sun --- .../apache/hadoop/hbase/master/HMaster.java | 6 +- .../hbase/master/MasterMetaBootstrap.java | 4 +- .../master/assignment/AssignmentManager.java | 34 ++++++++- .../hbase/client/TestMetaWithReplicas.java | 70 +++++++++++++++++++ 4 files changed, 108 insertions(+), 6 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 9fdb0e52e55..5a9404a7277 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -1166,7 +1166,11 @@ public class HMaster extends HRegionServer implements MasterServices { assignmentManager.checkIfShouldMoveSystemRegionAsync(); status.setStatus("Assign meta replicas"); MasterMetaBootstrap metaBootstrap = createMetaBootstrap(); - metaBootstrap.assignMetaReplicas(); + try { + metaBootstrap.assignMetaReplicas(); + } catch (IOException | KeeperException e){ + LOG.error("Assigning meta replica failed: ", e); + } status.setStatus("Starting quota manager"); initQuotaManager(); if (QuotaUtil.isQuotaEnabled(conf)) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java index 6e38bdd50bc..ef12d074887 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java @@ -81,9 +81,9 @@ class MasterMetaBootstrap { // down hosting server which calls AM#stop. if (metaState != null && metaState.getServerName() != null) { // Try to retain old assignment. - assignmentManager.assign(hri, metaState.getServerName()); + assignmentManager.assignAsync(hri, metaState.getServerName()); } else { - assignmentManager.assign(hri); + assignmentManager.assignAsync(hri); } } unassignExcessMetaReplica(numReplicas); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index ef6e30cc9fd..4c42237801b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -595,9 +595,9 @@ public class AssignmentManager { } } - // TODO: Need an async version of this for hbck2. - public long assign(RegionInfo regionInfo, ServerName sn) throws IOException { - // TODO: should we use getRegionStateNode? + private TransitRegionStateProcedure createAssignProcedure(RegionInfo regionInfo, ServerName sn) + throws IOException { + // TODO: should we use getRegionStateNode? RegionStateNode regionNode = regionStates.getOrCreateRegionStateNode(regionInfo); TransitRegionStateProcedure proc; regionNode.lock(); @@ -608,6 +608,12 @@ public class AssignmentManager { } finally { regionNode.unlock(); } + return proc; + } + + // TODO: Need an async version of this for hbck2. + public long assign(RegionInfo regionInfo, ServerName sn) throws IOException { + TransitRegionStateProcedure proc = createAssignProcedure(regionInfo, sn); ProcedureSyncWait.submitAndWaitProcedure(master.getMasterProcedureExecutor(), proc); return proc.getProcId(); } @@ -616,6 +622,28 @@ public class AssignmentManager { return assign(regionInfo, null); } + /** + * Submits a procedure that assigns a region to a target server without waiting for it to finish + * @param regionInfo the region we would like to assign + * @param sn target server name + * @return + * @throws IOException + */ + public Future assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException { + TransitRegionStateProcedure proc = createAssignProcedure(regionInfo, sn); + return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc); + } + + /** + * Submits a procedure that assigns a region without waiting for it to finish + * @param regionInfo the region we would like to assign + * @return + * @throws IOException + */ + public Future assignAsync(RegionInfo regionInfo) throws IOException { + return assignAsync(regionInfo, null); + } + public long unassign(RegionInfo regionInfo) throws IOException { RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo); if (regionNode == null) { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java index 56e96fbec01..65dc38ebea4 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java @@ -23,12 +23,14 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.EnumSet; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.Abortable; @@ -42,8 +44,12 @@ import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.StartMiniClusterOption; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.TableNotFoundException; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil; +import org.apache.hadoop.hbase.master.assignment.RegionStateNode; +import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure; import org.apache.hadoop.hbase.regionserver.StorefileRefresherChore; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.util.Bytes; @@ -53,6 +59,7 @@ import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.apache.hadoop.hbase.zookeeper.ZNodePaths; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.ClassRule; import org.junit.Rule; @@ -355,4 +362,67 @@ public class TestMetaWithReplicas { assertNotEquals(3, i); } } + + @Test + public void testFailedReplicaAssigment() throws InterruptedException, IOException { + //using our rigged master, to force a failed meta replica assignment + TEST_UTIL.getMiniHBaseCluster().getConfiguration().setClass(HConstants.MASTER_IMPL, BrokenMetaReplicaMaster.class, HMaster.class); + TEST_UTIL.getMiniHBaseCluster().stopMaster(0).join(); + HMaster newMaster = TEST_UTIL.getMiniHBaseCluster().startMaster().getMaster(); + //waiting for master to come up + TEST_UTIL.waitFor(30000, () -> newMaster.isInitialized()); + TEST_UTIL.getMiniHBaseCluster().getConfiguration().unset(HConstants.MASTER_IMPL); + + + AssignmentManager am = newMaster.getAssignmentManager(); + //showing one of the replicas got assigned + RegionInfo metaReplicaHri = RegionReplicaUtil.getRegionInfoForReplica( + RegionInfoBuilder.FIRST_META_REGIONINFO, 1); + RegionStateNode metaReplicaRegionNode = am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri); + Assert.assertNotNull(metaReplicaRegionNode.getRegionLocation()); + //showing one of the replicas failed to be assigned + RegionInfo metaReplicaHri2 = RegionReplicaUtil.getRegionInfoForReplica( + RegionInfoBuilder.FIRST_META_REGIONINFO, 2); + RegionStateNode metaReplicaRegionNode2 = am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri2); + Assert.assertNull(metaReplicaRegionNode2.getRegionLocation()); + + //showing master is active and running + Assert.assertFalse(newMaster.isStopping()); + Assert.assertFalse(newMaster.isStopped()); + Assert.assertTrue(newMaster.isActiveMaster()); + } + + public static class BrokenTransitRegionStateProcedure extends TransitRegionStateProcedure { + protected BrokenTransitRegionStateProcedure() { + //super(env, hri, assignCandidate, forceNewPlan, type); + super(null, null, null, false,TransitionType.ASSIGN); + } + } + + public static class BrokenMetaReplicaMaster extends HMaster{ + public BrokenMetaReplicaMaster(final Configuration conf) throws IOException { + super(conf); + } + + @Override + public AssignmentManager createAssignmentManager(MasterServices master) { + return new BrokenMasterMetaAssignmentManager(master); + } + } + + public static class BrokenMasterMetaAssignmentManager extends AssignmentManager{ + MasterServices master; + public BrokenMasterMetaAssignmentManager(final MasterServices master) { + super(master); + this.master = master; + } + + public Future assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException { + RegionStateNode regionNode = getRegionStates().getOrCreateRegionStateNode(regionInfo); + if (regionNode.getRegionInfo().getReplicaId() == 2) { + regionNode.setProcedure(new BrokenTransitRegionStateProcedure()); + } + return super.assignAsync(regionInfo, sn); + } + } }