HBASE-18261 Created RecoverMetaProcedure and used it from ServerCrashProcedure and HMaster.finishActiveMasterInitialization().

This procedure can be used from any code before accessing meta, to initialize/ recover meta Signed-off-by: Michael Stack <stack@apache.org>
2017-06-23 14:44:28 -07:00 · 2017-06-23 14:44:28 -07:00 · 7bdabed275
commit 7bdabed275
parent 5490c558bc
14 changed files with 1403 additions and 308 deletions
--- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/StateMachineProcedure.java
+++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/StateMachineProcedure.java
@ -59,6 +59,20 @@ public abstract class StateMachineProcedure<TEnvironment, TState>
  private List<Procedure<TEnvironment>> subProcList = null;
  protected final int getCycles() {
    return cycles;
  }
  /**
   * Cycles on same state. Good for figuring if we are stuck.
   */
  private int cycles = 0;
  /**
   * Ordinal of the previous state. So we can tell if we are progressing or not.
   */
  private int previousState;
  protected enum Flow {
    HAS_MORE_STATE,
    NO_MORE_STATE,
@ -152,6 +166,18 @@ public abstract class StateMachineProcedure<TEnvironment, TState>
      if (stateCount == 0) {
        setNextState(getStateId(state));
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace(state  + " " + this + "; cycles=" + this.cycles);
      }
      // Keep running count of cycles
      if (getStateId(state) != this.previousState) {
        this.previousState = getStateId(state);
        this.cycles = 0;
      } else {
        this.cycles++;
      }
      stateFlow = executeFromState(env, state);
      if (!hasMoreState()) setNextState(EOF_STATE);
      if (subProcList != null && !subProcList.isEmpty()) {
--- a/hbase-protocol-shaded/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/generated/MasterProcedureProtos.java
+++ b/hbase-protocol-shaded/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/generated/MasterProcedureProtos.java
--- a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
+++ b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto
@ -330,6 +330,12 @@ message ServerCrashStateData {
  optional bool should_split_wal = 6 [default = true];
 }
 message RecoverMetaStateData {
  optional ServerName failed_meta_server = 1;
  optional bool should_split_wal = 2 [default = true];
  optional int32 replica_id = 3 [default = 0];
 }
 enum ServerCrashState {
  SERVER_CRASH_START = 1;
  SERVER_CRASH_PROCESS_META = 2;
@ -343,6 +349,11 @@ enum ServerCrashState {
  SERVER_CRASH_FINISH = 100;
 }
 enum RecoverMetaState {
  RECOVER_META_SPLIT_LOGS = 1;
  RECOVER_META_ASSIGN_REGIONS = 2;
 }
 enum RegionTransitionState {
  REGION_TRANSITION_QUEUE = 1;
  REGION_TRANSITION_DISPATCH = 2;
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@ -123,6 +123,7 @@ import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil;
 import org.apache.hadoop.hbase.master.procedure.ModifyColumnFamilyProcedure;
 import org.apache.hadoop.hbase.master.procedure.ModifyTableProcedure;
 import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch;
 import org.apache.hadoop.hbase.master.procedure.RecoverMetaProcedure;
 import org.apache.hadoop.hbase.master.procedure.TruncateTableProcedure;
 import org.apache.hadoop.hbase.master.replication.ReplicationManager;
 import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
@ -396,9 +397,6 @@ public class HMaster extends HRegionServer implements MasterServices {
  private long splitPlanCount;
  private long mergePlanCount;
  /** flag used in test cases in order to simulate RS failures during master initialization */
  private volatile boolean initializationBeforeMetaAssignment = false;
  /* Handle favored nodes information */
  private FavoredNodesManager favoredNodesManager;
@ -794,14 +792,6 @@ public class HMaster extends HRegionServer implements MasterServices {
    status.setStatus("Wait for region servers to report in");
    waitForRegionServers(status);
    // get a list for previously failed RS which need log splitting work
    // we recover hbase:meta region servers inside master initialization and
    // handle other failed servers in SSH in order to start up master node ASAP
    MasterMetaBootstrap metaBootstrap = createMetaBootstrap(this, status);
    metaBootstrap.splitMetaLogsBeforeAssignment();
    this.initializationBeforeMetaAssignment = true;
    if (this.balancer instanceof FavoredNodesPromoter) {
      favoredNodesManager = new FavoredNodesManager(this);
    }
@ -820,8 +810,12 @@ public class HMaster extends HRegionServer implements MasterServices {
    if (isStopped()) return;
    // Make sure meta assigned before proceeding.
-    status.setStatus("Assigning Meta Region");
+    status.setStatus("Recovering  Meta Region");
-    metaBootstrap.assignMeta();
+
    // we recover hbase:meta region servers inside master initialization and
    // handle other failed servers in SSH in order to start up master node ASAP
    MasterMetaBootstrap metaBootstrap = createMetaBootstrap(this, status);
    metaBootstrap.recoverMeta();
    // check if master is shutting down because above assignMeta could return even hbase:meta isn't
    // assigned when master is shutting down
@ -2709,14 +2703,6 @@ public class HMaster extends HRegionServer implements MasterServices {
    return serverCrashProcessingEnabled;
  }
  /**
   * Report whether this master has started initialization and is about to do meta region assignment
   * @return true if master is in initialization &amp; about to assign hbase:meta regions
   */
  public boolean isInitializationStartsMetaRegionAssignment() {
    return this.initializationBeforeMetaAssignment;
  }
  /**
   * Compute the average load across all region servers.
   * Currently, this uses a very naive computation - just uses the number of
@ -3424,6 +3410,17 @@ public class HMaster extends HRegionServer implements MasterServices {
    return lockManager;
  }
  @Override
  public boolean recoverMeta() throws IOException {
    ProcedurePrepareLatch latch = ProcedurePrepareLatch.createLatch(2, 0);
    long procId = procedureExecutor.submitProcedure(new RecoverMetaProcedure(null, true, latch));
    LOG.info("Waiting on RecoverMetaProcedure submitted with procId=" + procId);
    latch.await();
    LOG.info("Default replica of hbase:meta, location=" +
        getMetaTableLocator().getMetaRegionLocation(getZooKeeper()));
    return assignmentManager.isMetaInitialized();
  }
  public QuotaObserverChore getQuotaObserverChore() {
    return this.quotaObserverChore;
  }
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java
@ -19,7 +19,6 @@
 package org.apache.hadoop.hbase.master;
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
@ -49,45 +48,24 @@ public class MasterMetaBootstrap {
  private final MonitoredTask status;
  private final HMaster master;
  private Set<ServerName> previouslyFailedServers;
  private Set<ServerName> previouslyFailedMetaRSs;
  public MasterMetaBootstrap(final HMaster master, final MonitoredTask status) {
    this.master = master;
    this.status = status;
  }
-  public void splitMetaLogsBeforeAssignment() throws IOException, KeeperException {
+  public void recoverMeta() throws InterruptedException, IOException {
    master.recoverMeta();
    master.getTableStateManager().start();
    enableCrashedServerProcessing(false);
  }
  public void processDeadServers() {
    // get a list for previously failed RS which need log splitting work
    // we recover hbase:meta region servers inside master initialization and
    // handle other failed servers in SSH in order to start up master node ASAP
-    previouslyFailedServers = master.getMasterWalManager().getFailedServersFromLogFolders();
+    Set<ServerName> previouslyFailedServers =
        master.getMasterWalManager().getFailedServersFromLogFolders();
    // log splitting for hbase:meta server
    ServerName oldMetaServerLocation = master.getMetaTableLocator()
        .getMetaRegionLocation(master.getZooKeeper());
    if (oldMetaServerLocation != null && previouslyFailedServers.contains(oldMetaServerLocation)) {
      splitMetaLogBeforeAssignment(oldMetaServerLocation);
      // Note: we can't remove oldMetaServerLocation from previousFailedServers list because it
      // may also host user regions
    }
    previouslyFailedMetaRSs = getPreviouselyFailedMetaServersFromZK();
    // need to use union of previouslyFailedMetaRSs recorded in ZK and previouslyFailedServers
    // instead of previouslyFailedMetaRSs alone to address the following two situations:
    // 1) the chained failure situation(recovery failed multiple times in a row).
    // 2) master get killed right before it could delete the recovering hbase:meta from ZK while the
    // same server still has non-meta wals to be replayed so that
    // removeStaleRecoveringRegionsFromZK can't delete the stale hbase:meta region
    // Passing more servers into splitMetaLog is all right. If a server doesn't have hbase:meta wal,
    // there is no op for the server.
    previouslyFailedMetaRSs.addAll(previouslyFailedServers);
  }
  public void assignMeta() throws InterruptedException, IOException, KeeperException {
    assignMeta(previouslyFailedMetaRSs, HRegionInfo.DEFAULT_REPLICA_ID);
  }
  public void processDeadServers() throws IOException {
    // Master has recovered hbase:meta region server and we put
    // other failed region servers in a queue to be handled later by SSH
    for (ServerName tmpServer : previouslyFailedServers) {
@ -99,17 +77,12 @@ public class MasterMetaBootstrap {
      throws IOException, InterruptedException, KeeperException {
    int numReplicas = master.getConfiguration().getInt(HConstants.META_REPLICAS_NUM,
           HConstants.DEFAULT_META_REPLICA_NUM);
    final Set<ServerName> EMPTY_SET = new HashSet<>();
    for (int i = 1; i < numReplicas; i++) {
-      assignMeta(EMPTY_SET, i);
+      assignMeta(i);
    }
    unassignExcessMetaReplica(numReplicas);
  }
  private void splitMetaLogBeforeAssignment(ServerName currentMetaServer) throws IOException {
    master.getMasterWalManager().splitMetaLog(currentMetaServer);
  }
  private void unassignExcessMetaReplica(int numMetaReplicasConfigured) {
    final ZooKeeperWatcher zooKeeper = master.getZooKeeper();
    // unassign the unneeded replicas (for e.g., if the previous master was configured
@ -137,12 +110,11 @@ public class MasterMetaBootstrap {
  /**
   * Check <code>hbase:meta</code> is assigned. If not, assign it.
   */
-  protected void assignMeta(Set<ServerName> previouslyFailedMetaRSs, int replicaId)
+  protected void assignMeta(int replicaId)
      throws InterruptedException, IOException, KeeperException {
    final AssignmentManager assignmentManager = master.getAssignmentManager();
    // Work on meta region
    int assigned = 0;
    // TODO: Unimplemented
    // long timeout =
    //   master.getConfiguration().getLong("hbase.catalog.verification.timeout", 1000);
@ -172,14 +144,14 @@ public class MasterMetaBootstrap {
    // if the meta region server is died at this time, we need it to be re-assigned
    // by SSH so that system tables can be assigned.
    // No need to wait for meta is assigned = 0 when meta is just verified.
-    if (replicaId == HRegionInfo.DEFAULT_REPLICA_ID) enableCrashedServerProcessing(assigned != 0);
+    if (replicaId == HRegionInfo.DEFAULT_REPLICA_ID) enableCrashedServerProcessing(false);
    LOG.info("hbase:meta with replicaId " + replicaId + ", location="
      + master.getMetaTableLocator().getMetaRegionLocation(master.getZooKeeper(), replicaId));
    status.setStatus("META assigned.");
  }
  private void enableCrashedServerProcessing(final boolean waitForMeta)
-      throws IOException, InterruptedException {
+      throws InterruptedException {
    // If crashed server processing is disabled, we enable it and expire those dead but not expired
    // servers. This is required so that if meta is assigning to a server which dies after
    // assignMeta starts assignment, ServerCrashProcedure can re-assign it. Otherwise, we will be
@ -193,23 +165,4 @@ public class MasterMetaBootstrap {
      master.getMetaTableLocator().waitMetaRegionLocation(master.getZooKeeper());
    }
  }
  /**
   * This function returns a set of region server names under hbase:meta recovering region ZK node
   * @return Set of meta server names which were recorded in ZK
   */
  private Set<ServerName> getPreviouselyFailedMetaServersFromZK() throws KeeperException {
    final ZooKeeperWatcher zooKeeper = master.getZooKeeper();
    Set<ServerName> result = new HashSet<>();
    String metaRecoveringZNode = ZKUtil.joinZNode(zooKeeper.znodePaths.recoveringRegionsZNode,
      HRegionInfo.FIRST_META_REGIONINFO.getEncodedName());
    List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(zooKeeper, metaRecoveringZNode);
    if (regionFailedServers == null) return result;
    for (String failedServer : regionFailedServers) {
      ServerName server = ServerName.parseServerName(failedServer);
      result.add(server);
    }
    return result;
  }
 }
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java
@ -500,4 +500,11 @@ public interface MasterServices extends Server {
  public String getRegionServerVersion(final ServerName sn);
  public void checkIfShouldMoveSystemRegionAsync();
  /**
   * Recover meta table. Will result in no-op is meta is already initialized. Any code that has
   * access to master and requires to access meta during process initialization can call this
   * method to make sure meta is initialized.
   */
  boolean recoverMeta() throws IOException;
 }
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java
@ -151,7 +151,7 @@ public class MasterWalManager {
   * Inspect the log directory to find dead servers which need recovery work
   * @return A set of ServerNames which aren't running but still have WAL files left in file system
   */
-  Set<ServerName> getFailedServersFromLogFolders() {
+  public Set<ServerName> getFailedServersFromLogFolders() {
    boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
        WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@ -1448,6 +1448,8 @@ public class AssignmentManager implements ServerListener {
    synchronized (regionNode) {
      State state = regionNode.transitionState(State.OPEN, RegionStates.STATES_EXPECTED_ON_OPEN);
      if (isMetaRegion(hri)) {
        master.getTableStateManager().setTableState(TableName.META_TABLE_NAME,
            TableState.State.ENABLED);
        setMetaInitialized(hri, true);
      }
      regionStates.addRegionToServer(regionNode.getRegionLocation(), regionNode);
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java
@ -0,0 +1,253 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.hadoop.hbase.master.procedure;
 import com.google.common.base.Preconditions;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
 import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
 import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
 import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
 import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
 import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RecoverMetaState;
 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
 import org.apache.zookeeper.KeeperException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.util.Set;
 /**
 * This procedure recovers meta from prior shutdown/ crash of a server, and brings meta online by
 * assigning meta region/s. Any place where meta is accessed and requires meta to be online, need to
 * submit this procedure instead of duplicating steps to recover meta in the code.
 */
 public class RecoverMetaProcedure
    extends StateMachineProcedure<MasterProcedureEnv, MasterProcedureProtos.RecoverMetaState>
    implements TableProcedureInterface {
  private static final Log LOG = LogFactory.getLog(RecoverMetaProcedure.class);
  private ServerName failedMetaServer;
  private boolean shouldSplitWal;
  private int replicaId;
  private final ProcedurePrepareLatch syncLatch;
  private HMaster master;
  /**
   * Call this constructor to queue up a {@link RecoverMetaProcedure} in response to meta
   * carrying server crash
   * @param failedMetaServer failed/ crashed region server that was carrying meta
   * @param shouldSplitLog split log file of meta region
   */
  public RecoverMetaProcedure(final ServerName failedMetaServer, final boolean shouldSplitLog) {
    this(failedMetaServer, shouldSplitLog, null);
  }
  /**
   * Constructor with latch, for blocking/ sync usage
   */
  public RecoverMetaProcedure(final ServerName failedMetaServer, final boolean shouldSplitLog,
                              final ProcedurePrepareLatch latch) {
    this.failedMetaServer = failedMetaServer;
    this.shouldSplitWal = shouldSplitLog;
    this.replicaId = HRegionInfo.DEFAULT_REPLICA_ID;
    this.syncLatch = latch;
  }
  /**
   * This constructor is also used when deserializing from a procedure store; we'll construct one
   * of these then call {@link #deserializeStateData(InputStream)}. Do not use directly.
   */
  public RecoverMetaProcedure() {
    this(null, false);
  }
  @Override
  protected Flow executeFromState(MasterProcedureEnv env,
      MasterProcedureProtos.RecoverMetaState state)
      throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    prepare(env);
    if (!isRunRequired()) {
      LOG.info(this + "; Meta already initialized. Skipping run");
      return Flow.NO_MORE_STATE;
    }
    try {
      switch (state) {
        case RECOVER_META_SPLIT_LOGS:
          LOG.info("Start " + this);
          if (shouldSplitWal) {
            // TODO: Matteo. We BLOCK here but most important thing to be doing at this moment.
            if (failedMetaServer != null) {
              master.getMasterWalManager().splitMetaLog(failedMetaServer);
            } else {
              ServerName serverName =
                  master.getMetaTableLocator().getMetaRegionLocation(master.getZooKeeper());
              Set<ServerName> previouslyFailedServers =
                  master.getMasterWalManager().getFailedServersFromLogFolders();
              if (serverName != null && previouslyFailedServers.contains(serverName)) {
                master.getMasterWalManager().splitMetaLog(serverName);
              }
            }
          }
          setNextState(RecoverMetaState.RECOVER_META_ASSIGN_REGIONS);
          break;
        case RECOVER_META_ASSIGN_REGIONS:
          HRegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(
              HRegionInfo.FIRST_META_REGIONINFO, this.replicaId);
          AssignProcedure metaAssignProcedure;
          if (failedMetaServer != null) {
            LOG.info(this + "; Assigning meta with new plan. previous meta server=" +
                failedMetaServer);
            metaAssignProcedure = master.getAssignmentManager().createAssignProcedure(hri, true);
          } else {
            // get server carrying meta from zk
            ServerName metaServer =
                MetaTableLocator.getMetaRegionState(master.getZooKeeper()).getServerName();
            LOG.info(this + "; Retaining meta assignment to server=" + metaServer);
            metaAssignProcedure =
                master.getAssignmentManager().createAssignProcedure(hri, metaServer);
          }
          addChildProcedure(metaAssignProcedure);
          return Flow.NO_MORE_STATE;
        default:
          throw new UnsupportedOperationException("unhandled state=" + state);
      }
    } catch (IOException|KeeperException e) {
      LOG.warn(this + "; Failed state=" + state + ", retry " + this + "; cycles=" +
          getCycles(), e);
    }
    return Flow.HAS_MORE_STATE;
  }
  @Override
  protected void rollbackState(MasterProcedureEnv env,
      MasterProcedureProtos.RecoverMetaState recoverMetaState)
      throws IOException, InterruptedException {
    // Can't rollback
    throw new UnsupportedOperationException("unhandled state=" + recoverMetaState);
  }
  @Override
  protected MasterProcedureProtos.RecoverMetaState getState(int stateId) {
    return RecoverMetaState.forNumber(stateId);
  }
  @Override
  protected int getStateId(MasterProcedureProtos.RecoverMetaState recoverMetaState) {
    return recoverMetaState.getNumber();
  }
  @Override
  protected MasterProcedureProtos.RecoverMetaState getInitialState() {
    return RecoverMetaState.RECOVER_META_SPLIT_LOGS;
  }
  @Override
  protected void toStringClassDetails(StringBuilder sb) {
    sb.append(getClass().getSimpleName());
    sb.append(" failedMetaServer=");
    sb.append(failedMetaServer);
    sb.append(", splitWal=");
    sb.append(shouldSplitWal);
  }
  @Override
  protected void serializeStateData(OutputStream stream) throws IOException {
    super.serializeStateData(stream);
    MasterProcedureProtos.RecoverMetaStateData.Builder state =
        MasterProcedureProtos.RecoverMetaStateData.newBuilder().setShouldSplitWal(shouldSplitWal);
    if (failedMetaServer != null) {
      state.setFailedMetaServer(ProtobufUtil.toServerName(failedMetaServer));
    }
    state.setReplicaId(replicaId);
    state.build().writeDelimitedTo(stream);
  }
  @Override
  protected void deserializeStateData(InputStream stream) throws IOException {
    super.deserializeStateData(stream);
    MasterProcedureProtos.RecoverMetaStateData state =
        MasterProcedureProtos.RecoverMetaStateData.parseDelimitedFrom(stream);
    this.shouldSplitWal = state.hasShouldSplitWal() && state.getShouldSplitWal();
    this.failedMetaServer = state.hasFailedMetaServer() ?
        ProtobufUtil.toServerName(state.getFailedMetaServer()) : null;
    this.replicaId = state.hasReplicaId() ? state.getReplicaId() : HRegionInfo.DEFAULT_REPLICA_ID;
  }
  @Override
  protected LockState acquireLock(MasterProcedureEnv env) {
    if (env.getProcedureScheduler().waitTableExclusiveLock(this, TableName.META_TABLE_NAME)) {
      return LockState.LOCK_EVENT_WAIT;
    }
    return LockState.LOCK_ACQUIRED;
  }
  @Override
  protected void releaseLock(MasterProcedureEnv env) {
    env.getProcedureScheduler().wakeTableExclusiveLock(this, TableName.META_TABLE_NAME);
  }
  @Override
  protected void completionCleanup(MasterProcedureEnv env) {
    ProcedurePrepareLatch.releaseLatch(syncLatch, this);
  }
  @Override
  public TableName getTableName() {
    return TableName.META_TABLE_NAME;
  }
  @Override
  public TableOperationType getTableOperationType() {
    return TableOperationType.ENABLE;
  }
  /**
   * @return true if failedMetaServer is not null (meta carrying server crashed) or meta is
   * already initialized
   */
  private boolean isRunRequired() {
    return failedMetaServer != null || !master.getAssignmentManager().isMetaInitialized();
  }
  /**
   * Prepare for execution
   */
  private void prepare(MasterProcedureEnv env) {
    if (master == null) {
      master = (HMaster) env.getMasterServices();
      Preconditions.checkArgument(master != null);
    }
  }
 }
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@ -1,4 +1,4 @@
-/**
+/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
@ -21,7 +21,6 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Iterator;
 import java.util.List;
@ -78,17 +77,6 @@ implements ServerProcedureInterface {
  private boolean carryingMeta = false;
  private boolean shouldSplitWal;
  /**
   * Cycles on same state. Good for figuring if we are stuck.
   */
  private int cycles = 0;
  /**
   * Ordinal of the previous state. So we can tell if we are progressing or not. TODO: if useful,
   * move this back up into StateMachineProcedure
   */
  private int previousState;
  /**
   * Call this constructor queuing up a Procedure.
   * @param serverName Name of the crashed server.
@ -117,16 +105,6 @@ implements ServerProcedureInterface {
  @Override
  protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state)
      throws ProcedureSuspendedException, ProcedureYieldException {
    if (LOG.isTraceEnabled()) {
      LOG.trace(state  + " " + this + "; cycles=" + this.cycles);
    }
    // Keep running count of cycles
    if (state.ordinal() != this.previousState) {
      this.previousState = state.ordinal();
      this.cycles = 0;
    } else {
      this.cycles++;
    }
    final MasterServices services = env.getMasterServices();
    // HBASE-14802
    // If we have not yet notified that we are processing a dead server, we should do now.
@ -182,7 +160,7 @@ implements ServerProcedureInterface {
          if (LOG.isTraceEnabled()) {
            LOG.trace("Assigning regions " +
              HRegionInfo.getShortNameToLog(regionsOnCrashedServer) + ", " + this +
-              "; cycles=" + this.cycles);
+              "; cycles=" + getCycles());
          }
          handleRIT(env, regionsOnCrashedServer);
          AssignmentManager am = env.getAssignmentManager();
@ -200,7 +178,7 @@ implements ServerProcedureInterface {
        throw new UnsupportedOperationException("unhandled state=" + state);
      }
    } catch (IOException e) {
-      LOG.warn("Failed state=" + state + ", retry " + this + "; cycles=" + this.cycles, e);
+      LOG.warn("Failed state=" + state + ", retry " + this + "; cycles=" + getCycles(), e);
    }
    return Flow.HAS_MORE_STATE;
  }
@ -208,15 +186,10 @@ implements ServerProcedureInterface {
  /**
   * @param env
   * @throws IOException
   * @throws InterruptedException
   */
  private void processMeta(final MasterProcedureEnv env) throws IOException {
-    if (LOG.isDebugEnabled()) LOG.debug("Processing hbase:meta that was on " + this.serverName);
+    if (LOG.isDebugEnabled()) LOG.debug(this + "; Processing hbase:meta that was on " +
-
+        this.serverName);
    if (this.shouldSplitWal) {
      // TODO: Matteo. We BLOCK here but most important thing to be doing at this moment.
      env.getMasterServices().getMasterWalManager().splitMetaLog(serverName);
    }
    // Assign meta if still carrying it. Check again: region may be assigned because of RIT timeout
    final AssignmentManager am = env.getMasterServices().getAssignmentManager();
@ -224,19 +197,13 @@ implements ServerProcedureInterface {
      if (!isDefaultMetaRegion(hri)) continue;
      am.offlineRegion(hri);
-      addChildProcedure(am.createAssignProcedure(hri, true));
+      addChildProcedure(new RecoverMetaProcedure(serverName, this.shouldSplitWal));
    }
  }
  private boolean filterDefaultMetaRegions(final List<HRegionInfo> regions) {
    if (regions == null) return false;
-    final Iterator<HRegionInfo> it = regions.iterator();
+    regions.removeIf(this::isDefaultMetaRegion);
    while (it.hasNext()) {
      final HRegionInfo hri = it.next();
      if (isDefaultMetaRegion(hri)) {
        it.remove();
      }
    }
    return !regions.isEmpty();
  }
@ -260,10 +227,6 @@ implements ServerProcedureInterface {
    am.getRegionStates().logSplit(this.serverName);
  }
  static int size(final Collection<HRegionInfo> hris) {
    return hris == null? 0: hris.size();
  }
  @Override
  protected void rollbackState(MasterProcedureEnv env, ServerCrashState state)
  throws IOException {
@ -273,7 +236,7 @@ implements ServerProcedureInterface {
  @Override
  protected ServerCrashState getState(int stateId) {
-    return ServerCrashState.valueOf(stateId);
+    return ServerCrashState.forNumber(stateId);
  }
  @Override
@ -394,9 +357,8 @@ implements ServerProcedureInterface {
   * Notify them of crash. Remove assign entries from the passed in <code>regions</code>
   * otherwise we have two assigns going on and they will fight over who has lock.
   * Notify Unassigns also.
-   * @param crashedServer Server that crashed.
+   * @param env
   * @param regions Regions that were on crashed server
   * @return Subset of <code>regions</code> that were RIT against <code>crashedServer</code>
   */
  private void handleRIT(final MasterProcedureEnv env, final List<HRegionInfo> regions) {
    if (regions == null) return;
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java
@ -52,6 +52,8 @@ import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import com.google.protobuf.Service;
 import static org.mockito.Mockito.mock;
 public class MockNoopMasterServices implements MasterServices, Server {
  private final Configuration conf;
  private final MetricsMaster metricsMaster;
@ -324,7 +326,7 @@ public class MockNoopMasterServices implements MasterServices, Server {
  @Override
  public TableStateManager getTableStateManager() {
-    return null;
+    return mock(TableStateManager.class);
  }
  @Override
@ -451,6 +453,11 @@ public class MockNoopMasterServices implements MasterServices, Server {
  public void checkIfShouldMoveSystemRegionAsync() {
  }
  @Override
  public boolean recoverMeta() throws IOException {
    return false;
  }
  @Override
  public ProcedureEvent getInitializedEvent() {
    // TODO Auto-generated method stub
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterNoCluster.java
@ -272,7 +272,7 @@ public class TestMasterNoCluster {
      MasterMetaBootstrap createMetaBootstrap(final HMaster master, final MonitoredTask status) {
        return new MasterMetaBootstrap(this, status) {
          @Override
-          protected void assignMeta(Set<ServerName> previouslyFailedMeatRSs, int replicaId) { }
+          protected void assignMeta(int replicaId) { }
        };
      }
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java
@ -92,13 +92,10 @@ public class MasterProcedureTestingUtility {
        public Void call() throws Exception {
          final AssignmentManager am = env.getAssignmentManager();
          am.start();
-          if (true) {
+          MasterMetaBootstrap metaBootstrap = new MasterMetaBootstrap(master,
-            MasterMetaBootstrap metaBootstrap = new MasterMetaBootstrap(master,
+              TaskMonitor.get().createStatus("meta"));
-                TaskMonitor.get().createStatus("meta"));
+          metaBootstrap.recoverMeta();
-            metaBootstrap.splitMetaLogsBeforeAssignment();
+          metaBootstrap.processDeadServers();
            metaBootstrap.assignMeta();
            metaBootstrap.processDeadServers();
          }
          am.joinCluster();
          master.setInitialized(true);
          return null;
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerCrashProcedure.java
@ -84,17 +84,18 @@ public class TestServerCrashProcedure {
  @Test(timeout=60000)
  public void testCrashTargetRs() throws Exception {
    testRecoveryAndDoubleExecution(false, false);
  }
  @Ignore  // HBASE-18366... To be enabled again.
  @Test(timeout=60000)
  public void testRecoveryAndDoubleExecutionOnRsWithMeta() throws Exception {
-    testRecoveryAndDoubleExecution(true);
+    testRecoveryAndDoubleExecution(true, true);
  }
  @Test(timeout=60000)
  public void testRecoveryAndDoubleExecutionOnRsWithoutMeta() throws Exception {
-    testRecoveryAndDoubleExecution(false);
+    testRecoveryAndDoubleExecution(false, true);
  }
  /**
@ -102,7 +103,8 @@ public class TestServerCrashProcedure {
   * needed state.
   * @throws Exception
   */
-  private void testRecoveryAndDoubleExecution(final boolean carryingMeta) throws Exception {
+  private void testRecoveryAndDoubleExecution(final boolean carryingMeta,
                                              final boolean doubleExecution) throws Exception {
    final TableName tableName = TableName.valueOf(
      "testRecoveryAndDoubleExecution-carryingMeta-" + carryingMeta);
    final Table t = this.util.createTable(tableName, HBaseTestingUtility.COLUMNS,
@ -120,7 +122,7 @@ public class TestServerCrashProcedure {
      master.setServerCrashProcessingEnabled(false);
      // find the first server that match the request and executes the test
      ServerName rsToKill = null;
-      for (HRegionInfo hri: util.getHBaseAdmin().getTableRegions(tableName)) {
+      for (HRegionInfo hri : util.getHBaseAdmin().getTableRegions(tableName)) {
        final ServerName serverName = AssignmentTestingUtil.getServerHoldingRegion(util, hri);
        if (AssignmentTestingUtil.isServerHoldingMeta(util, serverName) == carryingMeta) {
          rsToKill = serverName;
@ -135,14 +137,22 @@ public class TestServerCrashProcedure {
      master.getServerManager().moveFromOnlineToDeadServers(rsToKill);
      // Enable test flags and then queue the crash procedure.
      ProcedureTestingUtility.waitNoProcedureRunning(procExec);
-      ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
+      ServerCrashProcedure scp = new ServerCrashProcedure(procExec.getEnvironment(), rsToKill,
-      long procId = procExec.submitProcedure(new ServerCrashProcedure(
+          true, carryingMeta);
-          procExec.getEnvironment(), rsToKill, true, carryingMeta));
+      if (doubleExecution) {
-      // Now run through the procedure twice crashing the executor on each step...
+        ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
-      MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId);
+        long procId = procExec.submitProcedure(scp);
        // Now run through the procedure twice crashing the executor on each step...
        MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId);
      } else {
        ProcedureTestingUtility.submitAndWait(procExec, scp);
      }
      // Assert all data came back.
      assertEquals(count, util.countRows(t));
      assertEquals(checksum, util.checksumRows(t));
    } catch(Throwable throwable) {
      LOG.error("Test failed!", throwable);
      throw throwable;
    } finally {
      t.close();
    }