HBASE-18261 Created RecoverMetaProcedure and used it from ServerCrashProcedure and HMaster.finishActiveMasterInitialization().
This procedure can be used by any code before accessing meta, to initialize/recover meta. Signed-off-by: Michael Stack <stack@apache.org>
This commit is contained in:
parent
5490c558bc
commit
7bdabed275
@ -59,6 +59,20 @@ public abstract class StateMachineProcedure<TEnvironment, TState>
|
|||||||
|
|
||||||
private List<Procedure<TEnvironment>> subProcList = null;
|
private List<Procedure<TEnvironment>> subProcList = null;
|
||||||
|
|
||||||
|
protected final int getCycles() {
|
||||||
|
return cycles;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cycles on same state. Good for figuring if we are stuck.
|
||||||
|
*/
|
||||||
|
private int cycles = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ordinal of the previous state. So we can tell if we are progressing or not.
|
||||||
|
*/
|
||||||
|
private int previousState;
|
||||||
|
|
||||||
protected enum Flow {
|
protected enum Flow {
|
||||||
HAS_MORE_STATE,
|
HAS_MORE_STATE,
|
||||||
NO_MORE_STATE,
|
NO_MORE_STATE,
|
||||||
@ -152,6 +166,18 @@ public abstract class StateMachineProcedure<TEnvironment, TState>
|
|||||||
if (stateCount == 0) {
|
if (stateCount == 0) {
|
||||||
setNextState(getStateId(state));
|
setNextState(getStateId(state));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (LOG.isTraceEnabled()) {
|
||||||
|
LOG.trace(state + " " + this + "; cycles=" + this.cycles);
|
||||||
|
}
|
||||||
|
// Keep running count of cycles
|
||||||
|
if (getStateId(state) != this.previousState) {
|
||||||
|
this.previousState = getStateId(state);
|
||||||
|
this.cycles = 0;
|
||||||
|
} else {
|
||||||
|
this.cycles++;
|
||||||
|
}
|
||||||
|
|
||||||
stateFlow = executeFromState(env, state);
|
stateFlow = executeFromState(env, state);
|
||||||
if (!hasMoreState()) setNextState(EOF_STATE);
|
if (!hasMoreState()) setNextState(EOF_STATE);
|
||||||
if (subProcList != null && !subProcList.isEmpty()) {
|
if (subProcList != null && !subProcList.isEmpty()) {
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -330,6 +330,12 @@ message ServerCrashStateData {
|
|||||||
optional bool should_split_wal = 6 [default = true];
|
optional bool should_split_wal = 6 [default = true];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
message RecoverMetaStateData {
|
||||||
|
optional ServerName failed_meta_server = 1;
|
||||||
|
optional bool should_split_wal = 2 [default = true];
|
||||||
|
optional int32 replica_id = 3 [default = 0];
|
||||||
|
}
|
||||||
|
|
||||||
enum ServerCrashState {
|
enum ServerCrashState {
|
||||||
SERVER_CRASH_START = 1;
|
SERVER_CRASH_START = 1;
|
||||||
SERVER_CRASH_PROCESS_META = 2;
|
SERVER_CRASH_PROCESS_META = 2;
|
||||||
@ -343,6 +349,11 @@ enum ServerCrashState {
|
|||||||
SERVER_CRASH_FINISH = 100;
|
SERVER_CRASH_FINISH = 100;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
enum RecoverMetaState {
|
||||||
|
RECOVER_META_SPLIT_LOGS = 1;
|
||||||
|
RECOVER_META_ASSIGN_REGIONS = 2;
|
||||||
|
}
|
||||||
|
|
||||||
enum RegionTransitionState {
|
enum RegionTransitionState {
|
||||||
REGION_TRANSITION_QUEUE = 1;
|
REGION_TRANSITION_QUEUE = 1;
|
||||||
REGION_TRANSITION_DISPATCH = 2;
|
REGION_TRANSITION_DISPATCH = 2;
|
||||||
|
@ -123,6 +123,7 @@ import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil;
|
|||||||
import org.apache.hadoop.hbase.master.procedure.ModifyColumnFamilyProcedure;
|
import org.apache.hadoop.hbase.master.procedure.ModifyColumnFamilyProcedure;
|
||||||
import org.apache.hadoop.hbase.master.procedure.ModifyTableProcedure;
|
import org.apache.hadoop.hbase.master.procedure.ModifyTableProcedure;
|
||||||
import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch;
|
import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch;
|
||||||
|
import org.apache.hadoop.hbase.master.procedure.RecoverMetaProcedure;
|
||||||
import org.apache.hadoop.hbase.master.procedure.TruncateTableProcedure;
|
import org.apache.hadoop.hbase.master.procedure.TruncateTableProcedure;
|
||||||
import org.apache.hadoop.hbase.master.replication.ReplicationManager;
|
import org.apache.hadoop.hbase.master.replication.ReplicationManager;
|
||||||
import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
|
import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
|
||||||
@ -396,9 +397,6 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||||||
private long splitPlanCount;
|
private long splitPlanCount;
|
||||||
private long mergePlanCount;
|
private long mergePlanCount;
|
||||||
|
|
||||||
/** flag used in test cases in order to simulate RS failures during master initialization */
|
|
||||||
private volatile boolean initializationBeforeMetaAssignment = false;
|
|
||||||
|
|
||||||
/* Handle favored nodes information */
|
/* Handle favored nodes information */
|
||||||
private FavoredNodesManager favoredNodesManager;
|
private FavoredNodesManager favoredNodesManager;
|
||||||
|
|
||||||
@ -794,14 +792,6 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||||||
status.setStatus("Wait for region servers to report in");
|
status.setStatus("Wait for region servers to report in");
|
||||||
waitForRegionServers(status);
|
waitForRegionServers(status);
|
||||||
|
|
||||||
// get a list for previously failed RS which need log splitting work
|
|
||||||
// we recover hbase:meta region servers inside master initialization and
|
|
||||||
// handle other failed servers in SSH in order to start up master node ASAP
|
|
||||||
MasterMetaBootstrap metaBootstrap = createMetaBootstrap(this, status);
|
|
||||||
metaBootstrap.splitMetaLogsBeforeAssignment();
|
|
||||||
|
|
||||||
this.initializationBeforeMetaAssignment = true;
|
|
||||||
|
|
||||||
if (this.balancer instanceof FavoredNodesPromoter) {
|
if (this.balancer instanceof FavoredNodesPromoter) {
|
||||||
favoredNodesManager = new FavoredNodesManager(this);
|
favoredNodesManager = new FavoredNodesManager(this);
|
||||||
}
|
}
|
||||||
@ -820,8 +810,12 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||||||
if (isStopped()) return;
|
if (isStopped()) return;
|
||||||
|
|
||||||
// Make sure meta assigned before proceeding.
|
// Make sure meta assigned before proceeding.
|
||||||
status.setStatus("Assigning Meta Region");
|
status.setStatus("Recovering Meta Region");
|
||||||
metaBootstrap.assignMeta();
|
|
||||||
|
// we recover hbase:meta region servers inside master initialization and
|
||||||
|
// handle other failed servers in SSH in order to start up master node ASAP
|
||||||
|
MasterMetaBootstrap metaBootstrap = createMetaBootstrap(this, status);
|
||||||
|
metaBootstrap.recoverMeta();
|
||||||
|
|
||||||
// check if master is shutting down because above assignMeta could return even hbase:meta isn't
|
// check if master is shutting down because above assignMeta could return even hbase:meta isn't
|
||||||
// assigned when master is shutting down
|
// assigned when master is shutting down
|
||||||
@ -2709,14 +2703,6 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||||||
return serverCrashProcessingEnabled;
|
return serverCrashProcessingEnabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Report whether this master has started initialization and is about to do meta region assignment
|
|
||||||
* @return true if master is in initialization & about to assign hbase:meta regions
|
|
||||||
*/
|
|
||||||
public boolean isInitializationStartsMetaRegionAssignment() {
|
|
||||||
return this.initializationBeforeMetaAssignment;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compute the average load across all region servers.
|
* Compute the average load across all region servers.
|
||||||
* Currently, this uses a very naive computation - just uses the number of
|
* Currently, this uses a very naive computation - just uses the number of
|
||||||
@ -3424,6 +3410,17 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||||||
return lockManager;
|
return lockManager;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean recoverMeta() throws IOException {
|
||||||
|
ProcedurePrepareLatch latch = ProcedurePrepareLatch.createLatch(2, 0);
|
||||||
|
long procId = procedureExecutor.submitProcedure(new RecoverMetaProcedure(null, true, latch));
|
||||||
|
LOG.info("Waiting on RecoverMetaProcedure submitted with procId=" + procId);
|
||||||
|
latch.await();
|
||||||
|
LOG.info("Default replica of hbase:meta, location=" +
|
||||||
|
getMetaTableLocator().getMetaRegionLocation(getZooKeeper()));
|
||||||
|
return assignmentManager.isMetaInitialized();
|
||||||
|
}
|
||||||
|
|
||||||
public QuotaObserverChore getQuotaObserverChore() {
|
public QuotaObserverChore getQuotaObserverChore() {
|
||||||
return this.quotaObserverChore;
|
return this.quotaObserverChore;
|
||||||
}
|
}
|
||||||
|
@ -19,7 +19,6 @@
|
|||||||
package org.apache.hadoop.hbase.master;
|
package org.apache.hadoop.hbase.master;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
@ -49,45 +48,24 @@ public class MasterMetaBootstrap {
|
|||||||
private final MonitoredTask status;
|
private final MonitoredTask status;
|
||||||
private final HMaster master;
|
private final HMaster master;
|
||||||
|
|
||||||
private Set<ServerName> previouslyFailedServers;
|
|
||||||
private Set<ServerName> previouslyFailedMetaRSs;
|
|
||||||
|
|
||||||
public MasterMetaBootstrap(final HMaster master, final MonitoredTask status) {
|
public MasterMetaBootstrap(final HMaster master, final MonitoredTask status) {
|
||||||
this.master = master;
|
this.master = master;
|
||||||
this.status = status;
|
this.status = status;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void splitMetaLogsBeforeAssignment() throws IOException, KeeperException {
|
public void recoverMeta() throws InterruptedException, IOException {
|
||||||
|
master.recoverMeta();
|
||||||
|
master.getTableStateManager().start();
|
||||||
|
enableCrashedServerProcessing(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void processDeadServers() {
|
||||||
// get a list for previously failed RS which need log splitting work
|
// get a list for previously failed RS which need log splitting work
|
||||||
// we recover hbase:meta region servers inside master initialization and
|
// we recover hbase:meta region servers inside master initialization and
|
||||||
// handle other failed servers in SSH in order to start up master node ASAP
|
// handle other failed servers in SSH in order to start up master node ASAP
|
||||||
previouslyFailedServers = master.getMasterWalManager().getFailedServersFromLogFolders();
|
Set<ServerName> previouslyFailedServers =
|
||||||
|
master.getMasterWalManager().getFailedServersFromLogFolders();
|
||||||
|
|
||||||
// log splitting for hbase:meta server
|
|
||||||
ServerName oldMetaServerLocation = master.getMetaTableLocator()
|
|
||||||
.getMetaRegionLocation(master.getZooKeeper());
|
|
||||||
if (oldMetaServerLocation != null && previouslyFailedServers.contains(oldMetaServerLocation)) {
|
|
||||||
splitMetaLogBeforeAssignment(oldMetaServerLocation);
|
|
||||||
// Note: we can't remove oldMetaServerLocation from previousFailedServers list because it
|
|
||||||
// may also host user regions
|
|
||||||
}
|
|
||||||
previouslyFailedMetaRSs = getPreviouselyFailedMetaServersFromZK();
|
|
||||||
// need to use union of previouslyFailedMetaRSs recorded in ZK and previouslyFailedServers
|
|
||||||
// instead of previouslyFailedMetaRSs alone to address the following two situations:
|
|
||||||
// 1) the chained failure situation(recovery failed multiple times in a row).
|
|
||||||
// 2) master get killed right before it could delete the recovering hbase:meta from ZK while the
|
|
||||||
// same server still has non-meta wals to be replayed so that
|
|
||||||
// removeStaleRecoveringRegionsFromZK can't delete the stale hbase:meta region
|
|
||||||
// Passing more servers into splitMetaLog is all right. If a server doesn't have hbase:meta wal,
|
|
||||||
// there is no op for the server.
|
|
||||||
previouslyFailedMetaRSs.addAll(previouslyFailedServers);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void assignMeta() throws InterruptedException, IOException, KeeperException {
|
|
||||||
assignMeta(previouslyFailedMetaRSs, HRegionInfo.DEFAULT_REPLICA_ID);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void processDeadServers() throws IOException {
|
|
||||||
// Master has recovered hbase:meta region server and we put
|
// Master has recovered hbase:meta region server and we put
|
||||||
// other failed region servers in a queue to be handled later by SSH
|
// other failed region servers in a queue to be handled later by SSH
|
||||||
for (ServerName tmpServer : previouslyFailedServers) {
|
for (ServerName tmpServer : previouslyFailedServers) {
|
||||||
@ -99,17 +77,12 @@ public class MasterMetaBootstrap {
|
|||||||
throws IOException, InterruptedException, KeeperException {
|
throws IOException, InterruptedException, KeeperException {
|
||||||
int numReplicas = master.getConfiguration().getInt(HConstants.META_REPLICAS_NUM,
|
int numReplicas = master.getConfiguration().getInt(HConstants.META_REPLICAS_NUM,
|
||||||
HConstants.DEFAULT_META_REPLICA_NUM);
|
HConstants.DEFAULT_META_REPLICA_NUM);
|
||||||
final Set<ServerName> EMPTY_SET = new HashSet<>();
|
|
||||||
for (int i = 1; i < numReplicas; i++) {
|
for (int i = 1; i < numReplicas; i++) {
|
||||||
assignMeta(EMPTY_SET, i);
|
assignMeta(i);
|
||||||
}
|
}
|
||||||
unassignExcessMetaReplica(numReplicas);
|
unassignExcessMetaReplica(numReplicas);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void splitMetaLogBeforeAssignment(ServerName currentMetaServer) throws IOException {
|
|
||||||
master.getMasterWalManager().splitMetaLog(currentMetaServer);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void unassignExcessMetaReplica(int numMetaReplicasConfigured) {
|
private void unassignExcessMetaReplica(int numMetaReplicasConfigured) {
|
||||||
final ZooKeeperWatcher zooKeeper = master.getZooKeeper();
|
final ZooKeeperWatcher zooKeeper = master.getZooKeeper();
|
||||||
// unassign the unneeded replicas (for e.g., if the previous master was configured
|
// unassign the unneeded replicas (for e.g., if the previous master was configured
|
||||||
@ -137,12 +110,11 @@ public class MasterMetaBootstrap {
|
|||||||
/**
|
/**
|
||||||
* Check <code>hbase:meta</code> is assigned. If not, assign it.
|
* Check <code>hbase:meta</code> is assigned. If not, assign it.
|
||||||
*/
|
*/
|
||||||
protected void assignMeta(Set<ServerName> previouslyFailedMetaRSs, int replicaId)
|
protected void assignMeta(int replicaId)
|
||||||
throws InterruptedException, IOException, KeeperException {
|
throws InterruptedException, IOException, KeeperException {
|
||||||
final AssignmentManager assignmentManager = master.getAssignmentManager();
|
final AssignmentManager assignmentManager = master.getAssignmentManager();
|
||||||
|
|
||||||
// Work on meta region
|
// Work on meta region
|
||||||
int assigned = 0;
|
|
||||||
// TODO: Unimplemented
|
// TODO: Unimplemented
|
||||||
// long timeout =
|
// long timeout =
|
||||||
// master.getConfiguration().getLong("hbase.catalog.verification.timeout", 1000);
|
// master.getConfiguration().getLong("hbase.catalog.verification.timeout", 1000);
|
||||||
@ -172,14 +144,14 @@ public class MasterMetaBootstrap {
|
|||||||
// if the meta region server is died at this time, we need it to be re-assigned
|
// if the meta region server is died at this time, we need it to be re-assigned
|
||||||
// by SSH so that system tables can be assigned.
|
// by SSH so that system tables can be assigned.
|
||||||
// No need to wait for meta is assigned = 0 when meta is just verified.
|
// No need to wait for meta is assigned = 0 when meta is just verified.
|
||||||
if (replicaId == HRegionInfo.DEFAULT_REPLICA_ID) enableCrashedServerProcessing(assigned != 0);
|
if (replicaId == HRegionInfo.DEFAULT_REPLICA_ID) enableCrashedServerProcessing(false);
|
||||||
LOG.info("hbase:meta with replicaId " + replicaId + ", location="
|
LOG.info("hbase:meta with replicaId " + replicaId + ", location="
|
||||||
+ master.getMetaTableLocator().getMetaRegionLocation(master.getZooKeeper(), replicaId));
|
+ master.getMetaTableLocator().getMetaRegionLocation(master.getZooKeeper(), replicaId));
|
||||||
status.setStatus("META assigned.");
|
status.setStatus("META assigned.");
|
||||||
}
|
}
|
||||||
|
|
||||||
private void enableCrashedServerProcessing(final boolean waitForMeta)
|
private void enableCrashedServerProcessing(final boolean waitForMeta)
|
||||||
throws IOException, InterruptedException {
|
throws InterruptedException {
|
||||||
// If crashed server processing is disabled, we enable it and expire those dead but not expired
|
// If crashed server processing is disabled, we enable it and expire those dead but not expired
|
||||||
// servers. This is required so that if meta is assigning to a server which dies after
|
// servers. This is required so that if meta is assigning to a server which dies after
|
||||||
// assignMeta starts assignment, ServerCrashProcedure can re-assign it. Otherwise, we will be
|
// assignMeta starts assignment, ServerCrashProcedure can re-assign it. Otherwise, we will be
|
||||||
@ -193,23 +165,4 @@ public class MasterMetaBootstrap {
|
|||||||
master.getMetaTableLocator().waitMetaRegionLocation(master.getZooKeeper());
|
master.getMetaTableLocator().waitMetaRegionLocation(master.getZooKeeper());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* This function returns a set of region server names under hbase:meta recovering region ZK node
|
|
||||||
* @return Set of meta server names which were recorded in ZK
|
|
||||||
*/
|
|
||||||
private Set<ServerName> getPreviouselyFailedMetaServersFromZK() throws KeeperException {
|
|
||||||
final ZooKeeperWatcher zooKeeper = master.getZooKeeper();
|
|
||||||
Set<ServerName> result = new HashSet<>();
|
|
||||||
String metaRecoveringZNode = ZKUtil.joinZNode(zooKeeper.znodePaths.recoveringRegionsZNode,
|
|
||||||
HRegionInfo.FIRST_META_REGIONINFO.getEncodedName());
|
|
||||||
List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(zooKeeper, metaRecoveringZNode);
|
|
||||||
if (regionFailedServers == null) return result;
|
|
||||||
|
|
||||||
for (String failedServer : regionFailedServers) {
|
|
||||||
ServerName server = ServerName.parseServerName(failedServer);
|
|
||||||
result.add(server);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -500,4 +500,11 @@ public interface MasterServices extends Server {
|
|||||||
public String getRegionServerVersion(final ServerName sn);
|
public String getRegionServerVersion(final ServerName sn);
|
||||||
|
|
||||||
public void checkIfShouldMoveSystemRegionAsync();
|
public void checkIfShouldMoveSystemRegionAsync();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Recover meta table. Will result in a no-op if meta is already initialized. Any code that has
|
||||||
|
* access to master and needs to access meta during process initialization can call this
|
||||||
|
* method to make sure meta is initialized.
|
||||||
|
*/
|
||||||
|
boolean recoverMeta() throws IOException;
|
||||||
}
|
}
|
||||||
|
@ -151,7 +151,7 @@ public class MasterWalManager {
|
|||||||
* Inspect the log directory to find dead servers which need recovery work
|
* Inspect the log directory to find dead servers which need recovery work
|
||||||
* @return A set of ServerNames which aren't running but still have WAL files left in file system
|
* @return A set of ServerNames which aren't running but still have WAL files left in file system
|
||||||
*/
|
*/
|
||||||
Set<ServerName> getFailedServersFromLogFolders() {
|
public Set<ServerName> getFailedServersFromLogFolders() {
|
||||||
boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
|
boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors",
|
||||||
WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
|
WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
|
||||||
|
|
||||||
|
@ -1448,6 +1448,8 @@ public class AssignmentManager implements ServerListener {
|
|||||||
synchronized (regionNode) {
|
synchronized (regionNode) {
|
||||||
State state = regionNode.transitionState(State.OPEN, RegionStates.STATES_EXPECTED_ON_OPEN);
|
State state = regionNode.transitionState(State.OPEN, RegionStates.STATES_EXPECTED_ON_OPEN);
|
||||||
if (isMetaRegion(hri)) {
|
if (isMetaRegion(hri)) {
|
||||||
|
master.getTableStateManager().setTableState(TableName.META_TABLE_NAME,
|
||||||
|
TableState.State.ENABLED);
|
||||||
setMetaInitialized(hri, true);
|
setMetaInitialized(hri, true);
|
||||||
}
|
}
|
||||||
regionStates.addRegionToServer(regionNode.getRegionLocation(), regionNode);
|
regionStates.addRegionToServer(regionNode.getRegionLocation(), regionNode);
|
||||||
|
@ -0,0 +1,253 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.hbase.master.procedure;
|
||||||
|
|
||||||
|
import com.google.common.base.Preconditions;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.hadoop.hbase.HRegionInfo;
|
||||||
|
import org.apache.hadoop.hbase.ServerName;
|
||||||
|
import org.apache.hadoop.hbase.TableName;
|
||||||
|
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||||
|
import org.apache.hadoop.hbase.master.HMaster;
|
||||||
|
import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
|
||||||
|
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||||
|
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
|
||||||
|
import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
|
||||||
|
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||||
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
|
||||||
|
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RecoverMetaState;
|
||||||
|
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
|
||||||
|
import org.apache.zookeeper.KeeperException;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This procedure recovers meta from prior shutdown/ crash of a server, and brings meta online by
|
||||||
|
* assigning meta region/s. Any code that accesses meta and requires it to be online needs to
|
||||||
|
* submit this procedure instead of duplicating steps to recover meta in the code.
|
||||||
|
*/
|
||||||
|
public class RecoverMetaProcedure
|
||||||
|
extends StateMachineProcedure<MasterProcedureEnv, MasterProcedureProtos.RecoverMetaState>
|
||||||
|
implements TableProcedureInterface {
|
||||||
|
private static final Log LOG = LogFactory.getLog(RecoverMetaProcedure.class);
|
||||||
|
|
||||||
|
private ServerName failedMetaServer;
|
||||||
|
private boolean shouldSplitWal;
|
||||||
|
private int replicaId;
|
||||||
|
|
||||||
|
private final ProcedurePrepareLatch syncLatch;
|
||||||
|
private HMaster master;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call this constructor to queue up a {@link RecoverMetaProcedure} in response to meta
|
||||||
|
* carrying server crash
|
||||||
|
* @param failedMetaServer failed/ crashed region server that was carrying meta
|
||||||
|
* @param shouldSplitLog split log file of meta region
|
||||||
|
*/
|
||||||
|
public RecoverMetaProcedure(final ServerName failedMetaServer, final boolean shouldSplitLog) {
|
||||||
|
this(failedMetaServer, shouldSplitLog, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor with latch, for blocking/ sync usage
|
||||||
|
*/
|
||||||
|
public RecoverMetaProcedure(final ServerName failedMetaServer, final boolean shouldSplitLog,
|
||||||
|
final ProcedurePrepareLatch latch) {
|
||||||
|
this.failedMetaServer = failedMetaServer;
|
||||||
|
this.shouldSplitWal = shouldSplitLog;
|
||||||
|
this.replicaId = HRegionInfo.DEFAULT_REPLICA_ID;
|
||||||
|
this.syncLatch = latch;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This constructor is also used when deserializing from a procedure store; we'll construct one
|
||||||
|
* of these then call {@link #deserializeStateData(InputStream)}. Do not use directly.
|
||||||
|
*/
|
||||||
|
public RecoverMetaProcedure() {
|
||||||
|
this(null, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Flow executeFromState(MasterProcedureEnv env,
|
||||||
|
MasterProcedureProtos.RecoverMetaState state)
|
||||||
|
throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
|
||||||
|
prepare(env);
|
||||||
|
|
||||||
|
if (!isRunRequired()) {
|
||||||
|
LOG.info(this + "; Meta already initialized. Skipping run");
|
||||||
|
return Flow.NO_MORE_STATE;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
switch (state) {
|
||||||
|
case RECOVER_META_SPLIT_LOGS:
|
||||||
|
LOG.info("Start " + this);
|
||||||
|
if (shouldSplitWal) {
|
||||||
|
// TODO: Matteo. We BLOCK here but most important thing to be doing at this moment.
|
||||||
|
if (failedMetaServer != null) {
|
||||||
|
master.getMasterWalManager().splitMetaLog(failedMetaServer);
|
||||||
|
} else {
|
||||||
|
ServerName serverName =
|
||||||
|
master.getMetaTableLocator().getMetaRegionLocation(master.getZooKeeper());
|
||||||
|
Set<ServerName> previouslyFailedServers =
|
||||||
|
master.getMasterWalManager().getFailedServersFromLogFolders();
|
||||||
|
if (serverName != null && previouslyFailedServers.contains(serverName)) {
|
||||||
|
master.getMasterWalManager().splitMetaLog(serverName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
setNextState(RecoverMetaState.RECOVER_META_ASSIGN_REGIONS);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case RECOVER_META_ASSIGN_REGIONS:
|
||||||
|
HRegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(
|
||||||
|
HRegionInfo.FIRST_META_REGIONINFO, this.replicaId);
|
||||||
|
|
||||||
|
AssignProcedure metaAssignProcedure;
|
||||||
|
if (failedMetaServer != null) {
|
||||||
|
LOG.info(this + "; Assigning meta with new plan. previous meta server=" +
|
||||||
|
failedMetaServer);
|
||||||
|
metaAssignProcedure = master.getAssignmentManager().createAssignProcedure(hri, true);
|
||||||
|
} else {
|
||||||
|
// get server carrying meta from zk
|
||||||
|
ServerName metaServer =
|
||||||
|
MetaTableLocator.getMetaRegionState(master.getZooKeeper()).getServerName();
|
||||||
|
LOG.info(this + "; Retaining meta assignment to server=" + metaServer);
|
||||||
|
metaAssignProcedure =
|
||||||
|
master.getAssignmentManager().createAssignProcedure(hri, metaServer);
|
||||||
|
}
|
||||||
|
|
||||||
|
addChildProcedure(metaAssignProcedure);
|
||||||
|
return Flow.NO_MORE_STATE;
|
||||||
|
|
||||||
|
default:
|
||||||
|
throw new UnsupportedOperationException("unhandled state=" + state);
|
||||||
|
}
|
||||||
|
} catch (IOException|KeeperException e) {
|
||||||
|
LOG.warn(this + "; Failed state=" + state + ", retry " + this + "; cycles=" +
|
||||||
|
getCycles(), e);
|
||||||
|
}
|
||||||
|
return Flow.HAS_MORE_STATE;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void rollbackState(MasterProcedureEnv env,
|
||||||
|
MasterProcedureProtos.RecoverMetaState recoverMetaState)
|
||||||
|
throws IOException, InterruptedException {
|
||||||
|
// Can't rollback
|
||||||
|
throw new UnsupportedOperationException("unhandled state=" + recoverMetaState);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected MasterProcedureProtos.RecoverMetaState getState(int stateId) {
|
||||||
|
return RecoverMetaState.forNumber(stateId);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected int getStateId(MasterProcedureProtos.RecoverMetaState recoverMetaState) {
|
||||||
|
return recoverMetaState.getNumber();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected MasterProcedureProtos.RecoverMetaState getInitialState() {
|
||||||
|
return RecoverMetaState.RECOVER_META_SPLIT_LOGS;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void toStringClassDetails(StringBuilder sb) {
|
||||||
|
sb.append(getClass().getSimpleName());
|
||||||
|
sb.append(" failedMetaServer=");
|
||||||
|
sb.append(failedMetaServer);
|
||||||
|
sb.append(", splitWal=");
|
||||||
|
sb.append(shouldSplitWal);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void serializeStateData(OutputStream stream) throws IOException {
|
||||||
|
super.serializeStateData(stream);
|
||||||
|
MasterProcedureProtos.RecoverMetaStateData.Builder state =
|
||||||
|
MasterProcedureProtos.RecoverMetaStateData.newBuilder().setShouldSplitWal(shouldSplitWal);
|
||||||
|
if (failedMetaServer != null) {
|
||||||
|
state.setFailedMetaServer(ProtobufUtil.toServerName(failedMetaServer));
|
||||||
|
}
|
||||||
|
state.setReplicaId(replicaId);
|
||||||
|
state.build().writeDelimitedTo(stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void deserializeStateData(InputStream stream) throws IOException {
|
||||||
|
super.deserializeStateData(stream);
|
||||||
|
MasterProcedureProtos.RecoverMetaStateData state =
|
||||||
|
MasterProcedureProtos.RecoverMetaStateData.parseDelimitedFrom(stream);
|
||||||
|
this.shouldSplitWal = state.hasShouldSplitWal() && state.getShouldSplitWal();
|
||||||
|
this.failedMetaServer = state.hasFailedMetaServer() ?
|
||||||
|
ProtobufUtil.toServerName(state.getFailedMetaServer()) : null;
|
||||||
|
this.replicaId = state.hasReplicaId() ? state.getReplicaId() : HRegionInfo.DEFAULT_REPLICA_ID;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected LockState acquireLock(MasterProcedureEnv env) {
|
||||||
|
if (env.getProcedureScheduler().waitTableExclusiveLock(this, TableName.META_TABLE_NAME)) {
|
||||||
|
return LockState.LOCK_EVENT_WAIT;
|
||||||
|
}
|
||||||
|
return LockState.LOCK_ACQUIRED;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void releaseLock(MasterProcedureEnv env) {
|
||||||
|
env.getProcedureScheduler().wakeTableExclusiveLock(this, TableName.META_TABLE_NAME);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void completionCleanup(MasterProcedureEnv env) {
|
||||||
|
ProcedurePrepareLatch.releaseLatch(syncLatch, this);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TableName getTableName() {
|
||||||
|
return TableName.META_TABLE_NAME;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TableOperationType getTableOperationType() {
|
||||||
|
return TableOperationType.ENABLE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return true if failedMetaServer is not null (meta carrying server crashed) or meta is
|
||||||
|
* not yet initialized; false only when no failed server was given and meta is initialized
|
||||||
|
*/
|
||||||
|
private boolean isRunRequired() {
|
||||||
|
return failedMetaServer != null || !master.getAssignmentManager().isMetaInitialized();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepare for execution
|
||||||
|
*/
|
||||||
|
private void prepare(MasterProcedureEnv env) {
|
||||||
|
if (master == null) {
|
||||||
|
master = (HMaster) env.getMasterServices();
|
||||||
|
Preconditions.checkArgument(master != null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -1,4 +1,4 @@
|
|||||||
/**
|
/*
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
* or more contributor license agreements. See the NOTICE file
|
* or more contributor license agreements. See the NOTICE file
|
||||||
* distributed with this work for additional information
|
* distributed with this work for additional information
|
||||||
@ -21,7 +21,6 @@ import java.io.IOException;
|
|||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@ -78,17 +77,6 @@ implements ServerProcedureInterface {
|
|||||||
private boolean carryingMeta = false;
|
private boolean carryingMeta = false;
|
||||||
private boolean shouldSplitWal;
|
private boolean shouldSplitWal;
|
||||||
|
|
||||||
/**
|
|
||||||
* Cycles on same state. Good for figuring if we are stuck.
|
|
||||||
*/
|
|
||||||
private int cycles = 0;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Ordinal of the previous state. So we can tell if we are progressing or not. TODO: if useful,
|
|
||||||
* move this back up into StateMachineProcedure
|
|
||||||
*/
|
|
||||||
private int previousState;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Call this constructor queuing up a Procedure.
|
* Call this constructor queuing up a Procedure.
|
||||||
* @param serverName Name of the crashed server.
|
* @param serverName Name of the crashed server.
|
||||||
@ -117,16 +105,6 @@ implements ServerProcedureInterface {
|
|||||||
@Override
|
@Override
|
||||||
protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state)
|
protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state)
|
||||||
throws ProcedureSuspendedException, ProcedureYieldException {
|
throws ProcedureSuspendedException, ProcedureYieldException {
|
||||||
if (LOG.isTraceEnabled()) {
|
|
||||||
LOG.trace(state + " " + this + "; cycles=" + this.cycles);
|
|
||||||
}
|
|
||||||
// Keep running count of cycles
|
|
||||||
if (state.ordinal() != this.previousState) {
|
|
||||||
this.previousState = state.ordinal();
|
|
||||||
this.cycles = 0;
|
|
||||||
} else {
|
|
||||||
this.cycles++;
|
|
||||||
}
|
|
||||||
final MasterServices services = env.getMasterServices();
|
final MasterServices services = env.getMasterServices();
|
||||||
// HBASE-14802
|
// HBASE-14802
|
||||||
// If we have not yet notified that we are processing a dead server, we should do now.
|
// If we have not yet notified that we are processing a dead server, we should do now.
|
||||||
@ -182,7 +160,7 @@ implements ServerProcedureInterface {
|
|||||||
if (LOG.isTraceEnabled()) {
|
if (LOG.isTraceEnabled()) {
|
||||||
LOG.trace("Assigning regions " +
|
LOG.trace("Assigning regions " +
|
||||||
HRegionInfo.getShortNameToLog(regionsOnCrashedServer) + ", " + this +
|
HRegionInfo.getShortNameToLog(regionsOnCrashedServer) + ", " + this +
|
||||||
"; cycles=" + this.cycles);
|
"; cycles=" + getCycles());
|
||||||
}
|
}
|
||||||
handleRIT(env, regionsOnCrashedServer);
|
handleRIT(env, regionsOnCrashedServer);
|
||||||
AssignmentManager am = env.getAssignmentManager();
|
AssignmentManager am = env.getAssignmentManager();
|
||||||
@ -200,7 +178,7 @@ implements ServerProcedureInterface {
|
|||||||
throw new UnsupportedOperationException("unhandled state=" + state);
|
throw new UnsupportedOperationException("unhandled state=" + state);
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
LOG.warn("Failed state=" + state + ", retry " + this + "; cycles=" + this.cycles, e);
|
LOG.warn("Failed state=" + state + ", retry " + this + "; cycles=" + getCycles(), e);
|
||||||
}
|
}
|
||||||
return Flow.HAS_MORE_STATE;
|
return Flow.HAS_MORE_STATE;
|
||||||
}
|
}
|
||||||
@ -208,15 +186,10 @@ implements ServerProcedureInterface {
|
|||||||
/**
|
/**
|
||||||
* @param env
|
* @param env
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
* @throws InterruptedException
|
|
||||||
*/
|
*/
|
||||||
private void processMeta(final MasterProcedureEnv env) throws IOException {
|
private void processMeta(final MasterProcedureEnv env) throws IOException {
|
||||||
if (LOG.isDebugEnabled()) LOG.debug("Processing hbase:meta that was on " + this.serverName);
|
if (LOG.isDebugEnabled()) LOG.debug(this + "; Processing hbase:meta that was on " +
|
||||||
|
this.serverName);
|
||||||
if (this.shouldSplitWal) {
|
|
||||||
// TODO: Matteo. We BLOCK here but most important thing to be doing at this moment.
|
|
||||||
env.getMasterServices().getMasterWalManager().splitMetaLog(serverName);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Assign meta if still carrying it. Check again: region may be assigned because of RIT timeout
|
// Assign meta if still carrying it. Check again: region may be assigned because of RIT timeout
|
||||||
final AssignmentManager am = env.getMasterServices().getAssignmentManager();
|
final AssignmentManager am = env.getMasterServices().getAssignmentManager();
|
||||||
@ -224,19 +197,13 @@ implements ServerProcedureInterface {
|
|||||||
if (!isDefaultMetaRegion(hri)) continue;
|
if (!isDefaultMetaRegion(hri)) continue;
|
||||||
|
|
||||||
am.offlineRegion(hri);
|
am.offlineRegion(hri);
|
||||||
addChildProcedure(am.createAssignProcedure(hri, true));
|
addChildProcedure(new RecoverMetaProcedure(serverName, this.shouldSplitWal));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean filterDefaultMetaRegions(final List<HRegionInfo> regions) {
|
private boolean filterDefaultMetaRegions(final List<HRegionInfo> regions) {
|
||||||
if (regions == null) return false;
|
if (regions == null) return false;
|
||||||
final Iterator<HRegionInfo> it = regions.iterator();
|
regions.removeIf(this::isDefaultMetaRegion);
|
||||||
while (it.hasNext()) {
|
|
||||||
final HRegionInfo hri = it.next();
|
|
||||||
if (isDefaultMetaRegion(hri)) {
|
|
||||||
it.remove();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return !regions.isEmpty();
|
return !regions.isEmpty();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -260,10 +227,6 @@ implements ServerProcedureInterface {
|
|||||||
am.getRegionStates().logSplit(this.serverName);
|
am.getRegionStates().logSplit(this.serverName);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int size(final Collection<HRegionInfo> hris) {
|
|
||||||
return hris == null? 0: hris.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void rollbackState(MasterProcedureEnv env, ServerCrashState state)
|
protected void rollbackState(MasterProcedureEnv env, ServerCrashState state)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
@ -273,7 +236,7 @@ implements ServerProcedureInterface {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected ServerCrashState getState(int stateId) {
|
protected ServerCrashState getState(int stateId) {
|
||||||
return ServerCrashState.valueOf(stateId);
|
return ServerCrashState.forNumber(stateId);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -394,9 +357,8 @@ implements ServerProcedureInterface {
|
|||||||
* Notify them of crash. Remove assign entries from the passed in <code>regions</code>
|
* Notify them of crash. Remove assign entries from the passed in <code>regions</code>
|
||||||
* otherwise we have two assigns going on and they will fight over who has lock.
|
* otherwise we have two assigns going on and they will fight over who has lock.
|
||||||
* Notify Unassigns also.
|
* Notify Unassigns also.
|
||||||
* @param crashedServer Server that crashed.
|
* @param env
|
||||||
* @param regions Regions that were on crashed server
|
* @param regions Regions that were on crashed server
|
||||||
* @return Subset of <code>regions</code> that were RIT against <code>crashedServer</code>
|
|
||||||
*/
|
*/
|
||||||
private void handleRIT(final MasterProcedureEnv env, final List<HRegionInfo> regions) {
|
private void handleRIT(final MasterProcedureEnv env, final List<HRegionInfo> regions) {
|
||||||
if (regions == null) return;
|
if (regions == null) return;
|
||||||
|
@ -52,6 +52,8 @@ import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
|
|||||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||||
import com.google.protobuf.Service;
|
import com.google.protobuf.Service;
|
||||||
|
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
|
||||||
public class MockNoopMasterServices implements MasterServices, Server {
|
public class MockNoopMasterServices implements MasterServices, Server {
|
||||||
private final Configuration conf;
|
private final Configuration conf;
|
||||||
private final MetricsMaster metricsMaster;
|
private final MetricsMaster metricsMaster;
|
||||||
@ -324,7 +326,7 @@ public class MockNoopMasterServices implements MasterServices, Server {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TableStateManager getTableStateManager() {
|
public TableStateManager getTableStateManager() {
|
||||||
return null;
|
return mock(TableStateManager.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -451,6 +453,11 @@ public class MockNoopMasterServices implements MasterServices, Server {
|
|||||||
public void checkIfShouldMoveSystemRegionAsync() {
|
public void checkIfShouldMoveSystemRegionAsync() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean recoverMeta() throws IOException {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ProcedureEvent getInitializedEvent() {
|
public ProcedureEvent getInitializedEvent() {
|
||||||
// TODO Auto-generated method stub
|
// TODO Auto-generated method stub
|
||||||
|
@ -272,7 +272,7 @@ public class TestMasterNoCluster {
|
|||||||
MasterMetaBootstrap createMetaBootstrap(final HMaster master, final MonitoredTask status) {
|
MasterMetaBootstrap createMetaBootstrap(final HMaster master, final MonitoredTask status) {
|
||||||
return new MasterMetaBootstrap(this, status) {
|
return new MasterMetaBootstrap(this, status) {
|
||||||
@Override
|
@Override
|
||||||
protected void assignMeta(Set<ServerName> previouslyFailedMeatRSs, int replicaId) { }
|
protected void assignMeta(int replicaId) { }
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -92,13 +92,10 @@ public class MasterProcedureTestingUtility {
|
|||||||
public Void call() throws Exception {
|
public Void call() throws Exception {
|
||||||
final AssignmentManager am = env.getAssignmentManager();
|
final AssignmentManager am = env.getAssignmentManager();
|
||||||
am.start();
|
am.start();
|
||||||
if (true) {
|
MasterMetaBootstrap metaBootstrap = new MasterMetaBootstrap(master,
|
||||||
MasterMetaBootstrap metaBootstrap = new MasterMetaBootstrap(master,
|
TaskMonitor.get().createStatus("meta"));
|
||||||
TaskMonitor.get().createStatus("meta"));
|
metaBootstrap.recoverMeta();
|
||||||
metaBootstrap.splitMetaLogsBeforeAssignment();
|
metaBootstrap.processDeadServers();
|
||||||
metaBootstrap.assignMeta();
|
|
||||||
metaBootstrap.processDeadServers();
|
|
||||||
}
|
|
||||||
am.joinCluster();
|
am.joinCluster();
|
||||||
master.setInitialized(true);
|
master.setInitialized(true);
|
||||||
return null;
|
return null;
|
||||||
|
@ -84,17 +84,18 @@ public class TestServerCrashProcedure {
|
|||||||
|
|
||||||
@Test(timeout=60000)
|
@Test(timeout=60000)
|
||||||
public void testCrashTargetRs() throws Exception {
|
public void testCrashTargetRs() throws Exception {
|
||||||
|
testRecoveryAndDoubleExecution(false, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Ignore // HBASE-18366... To be enabled again.
|
@Ignore // HBASE-18366... To be enabled again.
|
||||||
@Test(timeout=60000)
|
@Test(timeout=60000)
|
||||||
public void testRecoveryAndDoubleExecutionOnRsWithMeta() throws Exception {
|
public void testRecoveryAndDoubleExecutionOnRsWithMeta() throws Exception {
|
||||||
testRecoveryAndDoubleExecution(true);
|
testRecoveryAndDoubleExecution(true, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(timeout=60000)
|
@Test(timeout=60000)
|
||||||
public void testRecoveryAndDoubleExecutionOnRsWithoutMeta() throws Exception {
|
public void testRecoveryAndDoubleExecutionOnRsWithoutMeta() throws Exception {
|
||||||
testRecoveryAndDoubleExecution(false);
|
testRecoveryAndDoubleExecution(false, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -102,7 +103,8 @@ public class TestServerCrashProcedure {
|
|||||||
* needed state.
|
* needed state.
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
private void testRecoveryAndDoubleExecution(final boolean carryingMeta) throws Exception {
|
private void testRecoveryAndDoubleExecution(final boolean carryingMeta,
|
||||||
|
final boolean doubleExecution) throws Exception {
|
||||||
final TableName tableName = TableName.valueOf(
|
final TableName tableName = TableName.valueOf(
|
||||||
"testRecoveryAndDoubleExecution-carryingMeta-" + carryingMeta);
|
"testRecoveryAndDoubleExecution-carryingMeta-" + carryingMeta);
|
||||||
final Table t = this.util.createTable(tableName, HBaseTestingUtility.COLUMNS,
|
final Table t = this.util.createTable(tableName, HBaseTestingUtility.COLUMNS,
|
||||||
@ -120,7 +122,7 @@ public class TestServerCrashProcedure {
|
|||||||
master.setServerCrashProcessingEnabled(false);
|
master.setServerCrashProcessingEnabled(false);
|
||||||
// find the first server that match the request and executes the test
|
// find the first server that match the request and executes the test
|
||||||
ServerName rsToKill = null;
|
ServerName rsToKill = null;
|
||||||
for (HRegionInfo hri: util.getHBaseAdmin().getTableRegions(tableName)) {
|
for (HRegionInfo hri : util.getHBaseAdmin().getTableRegions(tableName)) {
|
||||||
final ServerName serverName = AssignmentTestingUtil.getServerHoldingRegion(util, hri);
|
final ServerName serverName = AssignmentTestingUtil.getServerHoldingRegion(util, hri);
|
||||||
if (AssignmentTestingUtil.isServerHoldingMeta(util, serverName) == carryingMeta) {
|
if (AssignmentTestingUtil.isServerHoldingMeta(util, serverName) == carryingMeta) {
|
||||||
rsToKill = serverName;
|
rsToKill = serverName;
|
||||||
@ -135,14 +137,22 @@ public class TestServerCrashProcedure {
|
|||||||
master.getServerManager().moveFromOnlineToDeadServers(rsToKill);
|
master.getServerManager().moveFromOnlineToDeadServers(rsToKill);
|
||||||
// Enable test flags and then queue the crash procedure.
|
// Enable test flags and then queue the crash procedure.
|
||||||
ProcedureTestingUtility.waitNoProcedureRunning(procExec);
|
ProcedureTestingUtility.waitNoProcedureRunning(procExec);
|
||||||
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
|
ServerCrashProcedure scp = new ServerCrashProcedure(procExec.getEnvironment(), rsToKill,
|
||||||
long procId = procExec.submitProcedure(new ServerCrashProcedure(
|
true, carryingMeta);
|
||||||
procExec.getEnvironment(), rsToKill, true, carryingMeta));
|
if (doubleExecution) {
|
||||||
// Now run through the procedure twice crashing the executor on each step...
|
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
|
||||||
MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId);
|
long procId = procExec.submitProcedure(scp);
|
||||||
|
// Now run through the procedure twice crashing the executor on each step...
|
||||||
|
MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId);
|
||||||
|
} else {
|
||||||
|
ProcedureTestingUtility.submitAndWait(procExec, scp);
|
||||||
|
}
|
||||||
// Assert all data came back.
|
// Assert all data came back.
|
||||||
assertEquals(count, util.countRows(t));
|
assertEquals(count, util.countRows(t));
|
||||||
assertEquals(checksum, util.checksumRows(t));
|
assertEquals(checksum, util.checksumRows(t));
|
||||||
|
} catch(Throwable throwable) {
|
||||||
|
LOG.error("Test failed!", throwable);
|
||||||
|
throw throwable;
|
||||||
} finally {
|
} finally {
|
||||||
t.close();
|
t.close();
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user