From 573b57d43798ba172597721097b9ea98ddac3303 Mon Sep 17 00:00:00 2001 From: zhangduo Date: Mon, 11 Jun 2018 14:57:31 +0800 Subject: [PATCH] HBASE-20700 Move meta region when server crash can cause the procedure to be stuck --- .../hbase/procedure2/LockedResourceType.java | 2 +- .../apache/hadoop/hbase/master/HMaster.java | 23 ++-- .../hbase/master/MasterMetaBootstrap.java | 15 +-- .../master/assignment/AssignmentManager.java | 55 ++++++--- .../hbase/master/assignment/RegionStates.java | 68 +++++++---- .../assignment/RegionTransitionProcedure.java | 2 +- .../master/assignment/UnassignProcedure.java | 27 ++++- .../procedure/MasterProcedureScheduler.java | 112 ++++++++++++++---- .../procedure/MetaProcedureInterface.java | 32 +++++ .../hbase/master/procedure/MetaQueue.java | 36 ++++++ .../procedure/PeerProcedureInterface.java | 2 - .../procedure/RecoverMetaProcedure.java | 26 ++-- .../hbase/master/procedure/SchemaLocking.java | 14 ++- .../procedure/ServerCrashProcedure.java | 16 +-- .../procedure/ServerProcedureInterface.java | 2 - .../procedure/TableProcedureInterface.java | 3 - .../hbase/master/DummyRegionProcedure.java | 82 +++++++++++++ .../master/DummyRegionProcedureState.java | 22 ++++ .../hbase/master/TestMasterNoCluster.java | 5 +- ...ServerCrashProcedureCarryingMetaStuck.java | 95 +++++++++++++++ .../master/TestServerCrashProcedureStuck.java | 66 +---------- .../MasterProcedureTestingUtility.java | 5 +- 22 files changed, 514 insertions(+), 196 deletions(-) create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MetaProcedureInterface.java create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MetaQueue.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/DummyRegionProcedure.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/DummyRegionProcedureState.java create mode 100644 
hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestServerCrashProcedureCarryingMetaStuck.java diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/LockedResourceType.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/LockedResourceType.java index dc9b5d428df..55d195b3920 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/LockedResourceType.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/LockedResourceType.java @@ -22,5 +22,5 @@ import org.apache.yetus.audience.InterfaceAudience; @InterfaceAudience.Private public enum LockedResourceType { - SERVER, NAMESPACE, TABLE, REGION, PEER + SERVER, NAMESPACE, TABLE, REGION, PEER, META } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 0871482a5a0..f20cc61d923 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -910,7 +910,7 @@ public class HMaster extends HRegionServer implements MasterServices { // Bring up hbase:meta. recoverMeta is a blocking call waiting until hbase:meta is deployed. // It also starts the TableStateManager. - MasterMetaBootstrap metaBootstrap = createMetaBootstrap(this, status); + MasterMetaBootstrap metaBootstrap = createMetaBootstrap(); metaBootstrap.recoverMeta(); //Initialize after meta as it scans meta @@ -1055,12 +1055,18 @@ public class HMaster extends HRegionServer implements MasterServices { } /** + *

<p>
* Create a {@link MasterMetaBootstrap} instance. + * </p>
+ * <p>
+ * Will be overridden in tests. + * </p>
*/ - MasterMetaBootstrap createMetaBootstrap(final HMaster master, final MonitoredTask status) { + @VisibleForTesting + protected MasterMetaBootstrap createMetaBootstrap() { // We put this out here in a method so can do a Mockito.spy and stub it out // w/ a mocked up MasterMetaBootstrap. - return new MasterMetaBootstrap(master, status); + return new MasterMetaBootstrap(this); } /** @@ -3161,7 +3167,8 @@ public class HMaster extends HRegionServer implements MasterServices { cpHost.preGetLocks(); } - MasterProcedureScheduler procedureScheduler = procedureExecutor.getEnvironment().getProcedureScheduler(); + MasterProcedureScheduler procedureScheduler = + procedureExecutor.getEnvironment().getProcedureScheduler(); final List lockedResources = procedureScheduler.getLocks(); @@ -3606,11 +3613,13 @@ public class HMaster extends HRegionServer implements MasterServices { @Override public boolean recoverMeta() throws IOException { - ProcedurePrepareLatch latch = ProcedurePrepareLatch.createLatch(2, 0); + // we need to block here so the latch should be greater than the current version to make sure + // that we will block. 
+ ProcedurePrepareLatch latch = ProcedurePrepareLatch.createLatch(Integer.MAX_VALUE, 0); procedureExecutor.submitProcedure(new RecoverMetaProcedure(null, true, latch)); latch.await(); - LOG.info("hbase:meta deployed at=" + - getMetaTableLocator().getMetaRegionLocation(getZooKeeper())); + LOG.info("hbase:meta deployed at={}", + getMetaTableLocator().getMetaRegionLocation(getZooKeeper())); return assignmentManager.isMetaInitialized(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java index 59f123395be..dd46e411ca8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java @@ -21,14 +21,12 @@ package org.apache.hadoop.hbase.master; import java.io.IOException; import java.util.List; import java.util.Set; - import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; -import org.apache.hadoop.hbase.monitoring.MonitoredTask; import org.apache.hadoop.hbase.zookeeper.MetaTableLocator; import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; @@ -44,12 +42,10 @@ import org.slf4j.LoggerFactory; public class MasterMetaBootstrap { private static final Logger LOG = LoggerFactory.getLogger(MasterMetaBootstrap.class); - private final MonitoredTask status; private final HMaster master; - public MasterMetaBootstrap(final HMaster master, final MonitoredTask status) { + public MasterMetaBootstrap(HMaster master) { this.master = master; - this.status = status; } public void recoverMeta() throws InterruptedException, IOException { @@ 
-58,7 +54,7 @@ public class MasterMetaBootstrap { // Now we can start the TableStateManager. It is backed by hbase:meta. master.getTableStateManager().start(); // Enable server crash procedure handling - enableCrashedServerProcessing(false); + enableCrashedServerProcessing(); } public void processDeadServers() { @@ -142,8 +138,7 @@ public class MasterMetaBootstrap { } } - private void enableCrashedServerProcessing(final boolean waitForMeta) - throws InterruptedException { + private void enableCrashedServerProcessing() throws InterruptedException { // If crashed server processing is disabled, we enable it and expire those dead but not expired // servers. This is required so that if meta is assigning to a server which dies after // assignMeta starts assignment, ServerCrashProcedure can re-assign it. Otherwise, we will be @@ -152,9 +147,5 @@ public class MasterMetaBootstrap { master.setServerCrashProcessingEnabled(true); master.getServerManager().processQueuedDeadServers(); } - - if (waitForMeta) { - master.getMetaTableLocator().waitMetaRegionLocation(master.getZooKeeper()); - } } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 1f20e889f92..3412c82f2d6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -35,7 +35,6 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; import java.util.stream.Collectors; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseIOException; import org.apache.hadoop.hbase.HConstants; @@ -79,16 +78,9 @@ import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import 
org.apache.hadoop.hbase.procedure2.ProcedureInMemoryChore; import org.apache.hadoop.hbase.procedure2.util.StringUtils; import org.apache.hadoop.hbase.regionserver.SequenceId; -import org.apache.hadoop.hbase.util.HasThread; -import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting; -import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; -import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState; -import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition; -import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; -import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest; -import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.HasThread; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.util.VersionInfo; @@ -96,6 +88,15 @@ import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting; + +import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest; +import 
org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse; + /** * The AssignmentManager is the coordinator for region assign/unassign operations. *