HBASE-20881 Introduce a region transition procedure to handle all the state transition for a region
This commit is contained in:
parent
2b7ef2e623
commit
f533f01a3a
|
@ -51,9 +51,13 @@ public class RegionState {
|
|||
SPLITTING_NEW, // new region to be created when RS splits a parent
|
||||
// region but hasn't be created yet, or master doesn't
|
||||
// know it's already created
|
||||
MERGING_NEW; // new region to be created when RS merges two
|
||||
MERGING_NEW, // new region to be created when RS merges two
|
||||
// daughter regions but hasn't be created yet, or
|
||||
// master doesn't know it's already created
|
||||
ABNORMALLY_CLOSED; // the region is CLOSED because of a RS crashes. Usually it is the same
|
||||
// with CLOSED, but for some operations such as merge/split, we can not
|
||||
// apply it to a region in this state, as it may lead to data loss as we
|
||||
// may have some data in recovered edits.
|
||||
|
||||
/**
|
||||
* Convert to protobuf ClusterStatusProtos.RegionState.State
|
||||
|
@ -61,47 +65,50 @@ public class RegionState {
|
|||
public ClusterStatusProtos.RegionState.State convert() {
|
||||
ClusterStatusProtos.RegionState.State rs;
|
||||
switch (this) {
|
||||
case OFFLINE:
|
||||
rs = ClusterStatusProtos.RegionState.State.OFFLINE;
|
||||
break;
|
||||
case OPENING:
|
||||
rs = ClusterStatusProtos.RegionState.State.OPENING;
|
||||
break;
|
||||
case OPEN:
|
||||
rs = ClusterStatusProtos.RegionState.State.OPEN;
|
||||
break;
|
||||
case CLOSING:
|
||||
rs = ClusterStatusProtos.RegionState.State.CLOSING;
|
||||
break;
|
||||
case CLOSED:
|
||||
rs = ClusterStatusProtos.RegionState.State.CLOSED;
|
||||
break;
|
||||
case SPLITTING:
|
||||
rs = ClusterStatusProtos.RegionState.State.SPLITTING;
|
||||
break;
|
||||
case SPLIT:
|
||||
rs = ClusterStatusProtos.RegionState.State.SPLIT;
|
||||
break;
|
||||
case FAILED_OPEN:
|
||||
rs = ClusterStatusProtos.RegionState.State.FAILED_OPEN;
|
||||
break;
|
||||
case FAILED_CLOSE:
|
||||
rs = ClusterStatusProtos.RegionState.State.FAILED_CLOSE;
|
||||
break;
|
||||
case MERGING:
|
||||
rs = ClusterStatusProtos.RegionState.State.MERGING;
|
||||
break;
|
||||
case MERGED:
|
||||
rs = ClusterStatusProtos.RegionState.State.MERGED;
|
||||
break;
|
||||
case SPLITTING_NEW:
|
||||
rs = ClusterStatusProtos.RegionState.State.SPLITTING_NEW;
|
||||
break;
|
||||
case MERGING_NEW:
|
||||
rs = ClusterStatusProtos.RegionState.State.MERGING_NEW;
|
||||
break;
|
||||
default:
|
||||
throw new IllegalStateException("");
|
||||
case OFFLINE:
|
||||
rs = ClusterStatusProtos.RegionState.State.OFFLINE;
|
||||
break;
|
||||
case OPENING:
|
||||
rs = ClusterStatusProtos.RegionState.State.OPENING;
|
||||
break;
|
||||
case OPEN:
|
||||
rs = ClusterStatusProtos.RegionState.State.OPEN;
|
||||
break;
|
||||
case CLOSING:
|
||||
rs = ClusterStatusProtos.RegionState.State.CLOSING;
|
||||
break;
|
||||
case CLOSED:
|
||||
rs = ClusterStatusProtos.RegionState.State.CLOSED;
|
||||
break;
|
||||
case SPLITTING:
|
||||
rs = ClusterStatusProtos.RegionState.State.SPLITTING;
|
||||
break;
|
||||
case SPLIT:
|
||||
rs = ClusterStatusProtos.RegionState.State.SPLIT;
|
||||
break;
|
||||
case FAILED_OPEN:
|
||||
rs = ClusterStatusProtos.RegionState.State.FAILED_OPEN;
|
||||
break;
|
||||
case FAILED_CLOSE:
|
||||
rs = ClusterStatusProtos.RegionState.State.FAILED_CLOSE;
|
||||
break;
|
||||
case MERGING:
|
||||
rs = ClusterStatusProtos.RegionState.State.MERGING;
|
||||
break;
|
||||
case MERGED:
|
||||
rs = ClusterStatusProtos.RegionState.State.MERGED;
|
||||
break;
|
||||
case SPLITTING_NEW:
|
||||
rs = ClusterStatusProtos.RegionState.State.SPLITTING_NEW;
|
||||
break;
|
||||
case MERGING_NEW:
|
||||
rs = ClusterStatusProtos.RegionState.State.MERGING_NEW;
|
||||
break;
|
||||
case ABNORMALLY_CLOSED:
|
||||
rs = ClusterStatusProtos.RegionState.State.ABNORMALLY_CLOSED;
|
||||
break;
|
||||
default:
|
||||
throw new IllegalStateException("");
|
||||
}
|
||||
return rs;
|
||||
}
|
||||
|
@ -114,49 +121,52 @@ public class RegionState {
|
|||
public static State convert(ClusterStatusProtos.RegionState.State protoState) {
|
||||
State state;
|
||||
switch (protoState) {
|
||||
case OFFLINE:
|
||||
state = OFFLINE;
|
||||
break;
|
||||
case PENDING_OPEN:
|
||||
case OPENING:
|
||||
state = OPENING;
|
||||
break;
|
||||
case OPEN:
|
||||
state = OPEN;
|
||||
break;
|
||||
case PENDING_CLOSE:
|
||||
case CLOSING:
|
||||
state = CLOSING;
|
||||
break;
|
||||
case CLOSED:
|
||||
state = CLOSED;
|
||||
break;
|
||||
case SPLITTING:
|
||||
state = SPLITTING;
|
||||
break;
|
||||
case SPLIT:
|
||||
state = SPLIT;
|
||||
break;
|
||||
case FAILED_OPEN:
|
||||
state = FAILED_OPEN;
|
||||
break;
|
||||
case FAILED_CLOSE:
|
||||
state = FAILED_CLOSE;
|
||||
break;
|
||||
case MERGING:
|
||||
state = MERGING;
|
||||
break;
|
||||
case MERGED:
|
||||
state = MERGED;
|
||||
break;
|
||||
case SPLITTING_NEW:
|
||||
state = SPLITTING_NEW;
|
||||
break;
|
||||
case MERGING_NEW:
|
||||
state = MERGING_NEW;
|
||||
break;
|
||||
default:
|
||||
throw new IllegalStateException("Unhandled state " + protoState);
|
||||
case OFFLINE:
|
||||
state = OFFLINE;
|
||||
break;
|
||||
case PENDING_OPEN:
|
||||
case OPENING:
|
||||
state = OPENING;
|
||||
break;
|
||||
case OPEN:
|
||||
state = OPEN;
|
||||
break;
|
||||
case PENDING_CLOSE:
|
||||
case CLOSING:
|
||||
state = CLOSING;
|
||||
break;
|
||||
case CLOSED:
|
||||
state = CLOSED;
|
||||
break;
|
||||
case SPLITTING:
|
||||
state = SPLITTING;
|
||||
break;
|
||||
case SPLIT:
|
||||
state = SPLIT;
|
||||
break;
|
||||
case FAILED_OPEN:
|
||||
state = FAILED_OPEN;
|
||||
break;
|
||||
case FAILED_CLOSE:
|
||||
state = FAILED_CLOSE;
|
||||
break;
|
||||
case MERGING:
|
||||
state = MERGING;
|
||||
break;
|
||||
case MERGED:
|
||||
state = MERGED;
|
||||
break;
|
||||
case SPLITTING_NEW:
|
||||
state = SPLITTING_NEW;
|
||||
break;
|
||||
case MERGING_NEW:
|
||||
state = MERGING_NEW;
|
||||
break;
|
||||
case ABNORMALLY_CLOSED:
|
||||
state = ABNORMALLY_CLOSED;
|
||||
break;
|
||||
default:
|
||||
throw new IllegalStateException("Unhandled state " + protoState);
|
||||
}
|
||||
return state;
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.IOException;
|
|||
import java.util.ArrayDeque;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Deque;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
@ -94,6 +95,7 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
* Class with parameters describing how to fail/die when in testing-context.
|
||||
*/
|
||||
public static class Testing {
|
||||
protected boolean killIfHasParent = true;
|
||||
protected boolean killIfSuspended = false;
|
||||
|
||||
/**
|
||||
|
@ -120,8 +122,14 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
return kill;
|
||||
}
|
||||
|
||||
protected boolean shouldKillBeforeStoreUpdate(final boolean isSuspended) {
|
||||
return (isSuspended && !killIfSuspended) ? false : shouldKillBeforeStoreUpdate();
|
||||
protected boolean shouldKillBeforeStoreUpdate(boolean isSuspended, boolean hasParent) {
|
||||
if (isSuspended && !killIfSuspended) {
|
||||
return false;
|
||||
}
|
||||
if (hasParent && !killIfHasParent) {
|
||||
return false;
|
||||
}
|
||||
return shouldKillBeforeStoreUpdate();
|
||||
}
|
||||
|
||||
protected boolean shouldKillAfterStoreUpdate() {
|
||||
|
@ -457,6 +465,7 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
int failedCount = 0;
|
||||
while (procIter.hasNext()) {
|
||||
boolean finished = procIter.isNextFinished();
|
||||
@SuppressWarnings("unchecked")
|
||||
Procedure<TEnvironment> proc = procIter.next();
|
||||
NonceKey nonceKey = proc.getNonceKey();
|
||||
long procId = proc.getProcId();
|
||||
|
@ -508,6 +517,7 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
continue;
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
Procedure<TEnvironment> proc = procIter.next();
|
||||
assert !(proc.isFinished() && !proc.hasParent()) : "unexpected completed proc=" + proc;
|
||||
|
||||
|
@ -1179,6 +1189,17 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should only be used when starting up, where the procedure workers have not been started.
|
||||
* <p/>
|
||||
* If the procedure works has been started, the return values maybe changed when you are
|
||||
* processing it so usually this is not safe. Use {@link #getProcedures()} below for most cases as
|
||||
* it will do a copy, and also include the finished procedures.
|
||||
*/
|
||||
public Collection<Procedure<TEnvironment>> getActiveProceduresNoCopy() {
|
||||
return procedures.values();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get procedures.
|
||||
* @return the procedures in a list
|
||||
|
@ -1607,7 +1628,8 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
|
||||
// allows to kill the executor before something is stored to the wal.
|
||||
// useful to test the procedure recovery.
|
||||
if (testing != null && testing.shouldKillBeforeStoreUpdate(suspended)) {
|
||||
if (testing != null &&
|
||||
testing.shouldKillBeforeStoreUpdate(suspended, procedure.hasParent())) {
|
||||
kill("TESTING: Kill BEFORE store update: " + procedure);
|
||||
}
|
||||
|
||||
|
@ -1840,6 +1862,7 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
long lastUpdate = EnvironmentEdgeManager.currentTime();
|
||||
try {
|
||||
while (isRunning() && keepAlive(lastUpdate)) {
|
||||
@SuppressWarnings("unchecked")
|
||||
Procedure<TEnvironment> proc = scheduler.poll(keepAliveTime, TimeUnit.MILLISECONDS);
|
||||
if (proc == null) {
|
||||
continue;
|
||||
|
|
|
@ -185,7 +185,7 @@ public abstract class StateMachineProcedure<TEnvironment, TState>
|
|||
this.cycles++;
|
||||
}
|
||||
|
||||
LOG.trace("{}", toString());
|
||||
LOG.trace("{}", this);
|
||||
stateFlow = executeFromState(env, state);
|
||||
if (!hasMoreState()) setNextState(EOF_STATE);
|
||||
if (subProcList != null && !subProcList.isEmpty()) {
|
||||
|
|
|
@ -26,7 +26,6 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Callable;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
@ -37,14 +36,16 @@ import org.apache.hadoop.hbase.procedure2.store.NoopProcedureStore;
|
|||
import org.apache.hadoop.hbase.procedure2.store.ProcedureStore;
|
||||
import org.apache.hadoop.hbase.procedure2.store.ProcedureStore.ProcedureIterator;
|
||||
import org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore;
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.ByteString;
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.BytesValue;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState;
|
||||
import org.apache.hadoop.hbase.util.NonceKey;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.ByteString;
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.BytesValue;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState;
|
||||
|
||||
public class ProcedureTestingUtility {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(ProcedureTestingUtility.class);
|
||||
|
||||
|
@ -67,7 +68,7 @@ public class ProcedureTestingUtility {
|
|||
}
|
||||
|
||||
public static <TEnv> void restart(final ProcedureExecutor<TEnv> procExecutor) throws Exception {
|
||||
restart(procExecutor, false, true, null, null);
|
||||
restart(procExecutor, false, true, null, null, null);
|
||||
}
|
||||
|
||||
public static void initAndStartWorkers(ProcedureExecutor<?> procExecutor, int numThreads,
|
||||
|
@ -76,9 +77,9 @@ public class ProcedureTestingUtility {
|
|||
procExecutor.startWorkers();
|
||||
}
|
||||
|
||||
public static <TEnv> void restart(final ProcedureExecutor<TEnv> procExecutor,
|
||||
final boolean avoidTestKillDuringRestart, final boolean failOnCorrupted,
|
||||
final Callable<Void> stopAction, final Callable<Void> startAction)
|
||||
public static <TEnv> void restart(ProcedureExecutor<TEnv> procExecutor,
|
||||
boolean avoidTestKillDuringRestart, boolean failOnCorrupted, Callable<Void> stopAction,
|
||||
Callable<Void> actionBeforeStartWorker, Callable<Void> startAction)
|
||||
throws Exception {
|
||||
final ProcedureStore procStore = procExecutor.getStore();
|
||||
final int storeThreads = procExecutor.getCorePoolSize();
|
||||
|
@ -104,7 +105,11 @@ public class ProcedureTestingUtility {
|
|||
// re-start
|
||||
LOG.info("RESTART - Start");
|
||||
procStore.start(storeThreads);
|
||||
initAndStartWorkers(procExecutor, execThreads, failOnCorrupted);
|
||||
procExecutor.init(execThreads, failOnCorrupted);
|
||||
if (actionBeforeStartWorker != null) {
|
||||
actionBeforeStartWorker.call();
|
||||
}
|
||||
procExecutor.startWorkers();
|
||||
if (startAction != null) {
|
||||
startAction.call();
|
||||
}
|
||||
|
@ -139,6 +144,12 @@ public class ProcedureTestingUtility {
|
|||
}
|
||||
}
|
||||
|
||||
public static <TEnv> void setKillIfHasParent(ProcedureExecutor<TEnv> procExecutor,
|
||||
boolean value) {
|
||||
createExecutorTesting(procExecutor);
|
||||
procExecutor.testing.killIfHasParent = value;
|
||||
}
|
||||
|
||||
public static <TEnv> void setKillIfSuspended(ProcedureExecutor<TEnv> procExecutor,
|
||||
boolean value) {
|
||||
createExecutorTesting(procExecutor);
|
||||
|
|
|
@ -52,6 +52,10 @@ message RegionState {
|
|||
MERGING_NEW = 14; // new region to be created when RS merges two
|
||||
// daughter regions but hasn't be created yet, or
|
||||
// master doesn't know it's already created
|
||||
ABNORMALLY_CLOSED = 15;// the region is CLOSED because of a RS crash. Usually it is the same
|
||||
// with CLOSED, but for some operations such as merge/split, we can not
|
||||
// apply it to a region in this state, as it may lead to data loss as we
|
||||
// may have some data in recovered edits.
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -459,3 +459,31 @@ enum InitMetaState {
|
|||
|
||||
message InitMetaStateData {
|
||||
}
|
||||
|
||||
enum RegionStateTransitionState {
|
||||
REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE = 1;
|
||||
REGION_STATE_TRANSITION_OPEN = 2;
|
||||
REGION_STATE_TRANSITION_CONFIRM_OPENED = 3;
|
||||
REGION_STATE_TRANSITION_CLOSE = 4;
|
||||
REGION_STATE_TRANSITION_CONFIRM_CLOSED = 5;
|
||||
}
|
||||
|
||||
message RegionStateTransitionStateData {
|
||||
required RegionStateTransitionState initialState = 1;
|
||||
required RegionStateTransitionState lastState = 2;
|
||||
optional ServerName assign_candidate = 3;
|
||||
required bool force_new_plan = 4;
|
||||
}
|
||||
|
||||
message RegionRemoteProcedureBaseStateData {
|
||||
required RegionInfo region = 1;
|
||||
required ServerName target_server = 2;
|
||||
required bool dispatched = 3;
|
||||
}
|
||||
|
||||
message OpenRegionProcedureStateData {
|
||||
}
|
||||
|
||||
message CloseRegionProcedureStateData {
|
||||
optional ServerName assign_candidate = 1;
|
||||
}
|
||||
|
|
|
@ -41,7 +41,7 @@ import org.apache.hadoop.hbase.master.RegionPlan;
|
|||
import org.apache.hadoop.hbase.master.RegionState;
|
||||
import org.apache.hadoop.hbase.master.ServerManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.net.Address;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
|
|
|
@ -64,7 +64,7 @@ import org.apache.hadoop.hbase.ipc.CoprocessorRpcChannel;
|
|||
import org.apache.hadoop.hbase.master.MasterServices;
|
||||
import org.apache.hadoop.hbase.master.ServerListener;
|
||||
import org.apache.hadoop.hbase.master.TableStateManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.net.Address;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.protobuf.ProtobufMagic;
|
||||
|
|
|
@ -100,10 +100,14 @@ import org.apache.hadoop.hbase.ipc.RpcServer;
|
|||
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
|
||||
import org.apache.hadoop.hbase.log.HBaseMarkers;
|
||||
import org.apache.hadoop.hbase.master.MasterRpcServices.BalanceSwitchMode;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.MergeTableRegionsProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.MoveRegionProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.UnassignProcedure;
|
||||
import org.apache.hadoop.hbase.master.balancer.BalancerChore;
|
||||
import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer;
|
||||
import org.apache.hadoop.hbase.master.balancer.ClusterStatusChore;
|
||||
|
@ -130,6 +134,7 @@ import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
|
|||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil;
|
||||
import org.apache.hadoop.hbase.master.procedure.ModifyTableProcedure;
|
||||
import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch;
|
||||
import org.apache.hadoop.hbase.master.procedure.RecoverMetaProcedure;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
||||
import org.apache.hadoop.hbase.master.procedure.TruncateTableProcedure;
|
||||
import org.apache.hadoop.hbase.master.replication.AddPeerProcedure;
|
||||
|
@ -211,6 +216,7 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableSet;
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
|
||||
|
||||
|
@ -792,6 +798,45 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||
this.mpmHost.initialize(this, this.metricsMaster);
|
||||
}
|
||||
|
||||
private static final ImmutableSet<Class<?>> UNSUPPORTED_PROCEDURES =
|
||||
ImmutableSet.of(RecoverMetaProcedure.class, AssignProcedure.class, UnassignProcedure.class,
|
||||
MoveRegionProcedure.class);
|
||||
|
||||
/**
|
||||
* In HBASE-20811, we have introduced a new TRSP to assign/unassign/move regions, and it is
|
||||
* incompatible with the old AssignProcedure/UnassignProcedure/MoveRegionProcedure. So we need to
|
||||
* make sure that there are none these procedures when upgrading. If there are, the master will
|
||||
* quit, you need to go back to the old version to finish these procedures first before upgrading.
|
||||
*/
|
||||
private void checkUnsupportedProcedure(
|
||||
Map<Class<? extends Procedure>, List<Procedure<MasterProcedureEnv>>> procsByType)
|
||||
throws HBaseIOException {
|
||||
// Confirm that we do not have unfinished assign/unassign related procedures. It is not easy to
|
||||
// support both the old assign/unassign procedures and the new TransitRegionStateProcedure as
|
||||
// there will be conflict in the code for AM. We should finish all these procedures before
|
||||
// upgrading.
|
||||
for (Class<?> clazz : UNSUPPORTED_PROCEDURES) {
|
||||
List<Procedure<MasterProcedureEnv>> procs = procsByType.get(clazz);
|
||||
if (procs != null) {
|
||||
LOG.error(
|
||||
"Unsupported procedure type {} found, please rollback your master to the old" +
|
||||
" version to finish them, and then try to upgrade again. The full procedure list: {}",
|
||||
clazz, procs);
|
||||
throw new HBaseIOException("Unsupported procedure type " + clazz + " found");
|
||||
}
|
||||
}
|
||||
// A special check for SCP, as we do not support RecoverMetaProcedure any more so we need to
|
||||
// make sure that no one will try to schedule it but SCP does have a state which will schedule
|
||||
// it.
|
||||
if (procsByType.getOrDefault(ServerCrashProcedure.class, Collections.emptyList()).stream()
|
||||
.map(p -> (ServerCrashProcedure) p).anyMatch(ServerCrashProcedure::isInRecoverMetaState)) {
|
||||
LOG.error("At least one ServerCrashProcedure is going to schedule a RecoverMetaProcedure," +
|
||||
" which is not supported any more. Please rollback your master to the old version to" +
|
||||
" finish them, and then try to upgrade again.");
|
||||
throw new HBaseIOException("Unsupported procedure state found for ServerCrashProcedure");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finish initialization of HMaster after becoming the primary master.
|
||||
* <p/>
|
||||
|
@ -876,19 +921,36 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||
status.setStatus("Initialze ServerManager and schedule SCP for crash servers");
|
||||
this.serverManager = createServerManager(this);
|
||||
createProcedureExecutor();
|
||||
@SuppressWarnings("rawtypes")
|
||||
Map<Class<? extends Procedure>, List<Procedure<MasterProcedureEnv>>> procsByType =
|
||||
procedureExecutor.getActiveProceduresNoCopy().stream()
|
||||
.collect(Collectors.groupingBy(p -> p.getClass()));
|
||||
|
||||
checkUnsupportedProcedure(procsByType);
|
||||
|
||||
// Create Assignment Manager
|
||||
this.assignmentManager = new AssignmentManager(this);
|
||||
this.assignmentManager.start();
|
||||
// TODO: TRSP can perform as the sub procedure for other procedures, so even if it is marked as
|
||||
// completed, it could still be in the procedure list. This is a bit strange but is another
|
||||
// story, need to verify the implementation for ProcedureExecutor and ProcedureStore.
|
||||
List<TransitRegionStateProcedure> ritList =
|
||||
procsByType.getOrDefault(TransitRegionStateProcedure.class, Collections.emptyList()).stream()
|
||||
.filter(p -> !p.isFinished()).map(p -> (TransitRegionStateProcedure) p)
|
||||
.collect(Collectors.toList());
|
||||
this.assignmentManager.setupRIT(ritList);
|
||||
|
||||
this.regionServerTracker = new RegionServerTracker(zooKeeper, this, this.serverManager);
|
||||
this.regionServerTracker.start(
|
||||
procedureExecutor.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure)
|
||||
.map(p -> ((ServerCrashProcedure) p).getServerName()).collect(Collectors.toSet()),
|
||||
procsByType.getOrDefault(ServerCrashProcedure.class, Collections.emptyList()).stream()
|
||||
.map(p -> (ServerCrashProcedure) p).map(p -> p.getServerName()).collect(Collectors.toSet()),
|
||||
walManager.getLiveServersFromWALDir());
|
||||
// This manager will be started AFTER hbase:meta is confirmed on line.
|
||||
// hbase.mirror.table.state.to.zookeeper is so hbase1 clients can connect. They read table
|
||||
// state from zookeeper while hbase2 reads it from hbase:meta. Disable if no hbase1 clients.
|
||||
this.tableStateManager =
|
||||
this.conf.getBoolean(MirroringTableStateManager.MIRROR_TABLE_STATE_TO_ZK_KEY, true)?
|
||||
this.conf.getBoolean(MirroringTableStateManager.MIRROR_TABLE_STATE_TO_ZK_KEY, true)
|
||||
?
|
||||
new MirroringTableStateManager(this):
|
||||
new TableStateManager(this);
|
||||
|
||||
|
@ -3560,7 +3622,6 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||
* Mark region server(s) as decommissioned (previously called 'draining') to prevent additional
|
||||
* regions from getting assigned to them. Also unload the regions on the servers asynchronously.0
|
||||
* @param servers Region servers to decommission.
|
||||
* @throws HBaseIOException
|
||||
*/
|
||||
public void decommissionRegionServers(final List<ServerName> servers, final boolean offload)
|
||||
throws HBaseIOException {
|
||||
|
@ -3573,7 +3634,7 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||
ZKUtil.createAndFailSilent(getZooKeeper(), node);
|
||||
} catch (KeeperException ke) {
|
||||
throw new HBaseIOException(
|
||||
this.zooKeeper.prefix("Unable to decommission '" + server.getServerName() + "'."), ke);
|
||||
this.zooKeeper.prefix("Unable to decommission '" + server.getServerName() + "'."), ke);
|
||||
}
|
||||
if (this.serverManager.addServerToDrainList(server)) {
|
||||
serversAdded.add(server);
|
||||
|
@ -3584,7 +3645,7 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||
final List<ServerName> destServers = this.serverManager.createDestinationServersList();
|
||||
for (ServerName server : serversAdded) {
|
||||
final List<RegionInfo> regionsOnServer =
|
||||
this.assignmentManager.getRegionStates().getServerRegionInfoSet(server);
|
||||
this.assignmentManager.getRegionStates().getServerRegionInfoSet(server);
|
||||
for (RegionInfo hri : regionsOnServer) {
|
||||
ServerName dest = balancer.randomAssignment(hri, destServers);
|
||||
if (dest == null) {
|
||||
|
@ -3610,10 +3671,9 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||
* Remove decommission marker (previously called 'draining') from a region server to allow regions
|
||||
* assignments. Load regions onto the server asynchronously if a list of regions is given
|
||||
* @param server Region server to remove decommission marker from.
|
||||
* @throws HBaseIOException
|
||||
*/
|
||||
public void recommissionRegionServer(final ServerName server,
|
||||
final List<byte[]> encodedRegionNames) throws HBaseIOException {
|
||||
final List<byte[]> encodedRegionNames) throws IOException {
|
||||
// Remove the server from decommissioned (draining) server list.
|
||||
String parentZnode = getZooKeeper().getZNodePaths().drainingZNode;
|
||||
String node = ZNodePaths.joinZNode(parentZnode, server.getServerName());
|
||||
|
@ -3621,7 +3681,7 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||
ZKUtil.deleteNodeFailSilent(getZooKeeper(), node);
|
||||
} catch (KeeperException ke) {
|
||||
throw new HBaseIOException(
|
||||
this.zooKeeper.prefix("Unable to recommission '" + server.getServerName() + "'."), ke);
|
||||
this.zooKeeper.prefix("Unable to recommission '" + server.getServerName() + "'."), ke);
|
||||
}
|
||||
this.serverManager.removeServerFromDrainList(server);
|
||||
|
||||
|
@ -3634,15 +3694,15 @@ public class HMaster extends HRegionServer implements MasterServices {
|
|||
}
|
||||
for (byte[] encodedRegionName : encodedRegionNames) {
|
||||
RegionState regionState =
|
||||
assignmentManager.getRegionStates().getRegionState(Bytes.toString(encodedRegionName));
|
||||
assignmentManager.getRegionStates().getRegionState(Bytes.toString(encodedRegionName));
|
||||
if (regionState == null) {
|
||||
LOG.warn("Unknown region " + Bytes.toStringBinary(encodedRegionName));
|
||||
continue;
|
||||
}
|
||||
RegionInfo hri = regionState.getRegion();
|
||||
if (server.equals(regionState.getServerName())) {
|
||||
LOG.info("Skipping move of region " + hri.getRegionNameAsString()
|
||||
+ " because region already assigned to the same server " + server + ".");
|
||||
LOG.info("Skipping move of region " + hri.getRegionNameAsString() +
|
||||
" because region already assigned to the same server " + server + ".");
|
||||
continue;
|
||||
}
|
||||
RegionPlan rp = new RegionPlan(hri, regionState.getServerName(), server);
|
||||
|
|
|
@ -30,7 +30,7 @@ import org.apache.hadoop.conf.Configuration;
|
|||
import org.apache.hadoop.hbase.ServerMetrics;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.monitoring.LogMonitoring;
|
||||
import org.apache.hadoop.hbase.monitoring.StateDumpServlet;
|
||||
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
/*
|
||||
*
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
|
@ -16,20 +15,11 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
|
||||
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
||||
import org.apache.hadoop.hbase.master.RegionState.State;
|
||||
import org.apache.hadoop.hbase.master.TableStateManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
|
||||
|
@ -37,75 +27,27 @@ import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
|||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.AssignRegionStateData;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||
|
||||
/**
|
||||
* Procedure that describe the assignment of a single region.
|
||||
* There can only be one RegionTransitionProcedure per region running at a time
|
||||
* since each procedure takes a lock on the region.
|
||||
*
|
||||
* <p>The Assign starts by pushing the "assign" operation to the AssignmentManager
|
||||
* and then will go in a "waiting" state.
|
||||
* The AM will batch the "assign" requests and ask the Balancer where to put
|
||||
* the region (the various policies will be respected: retain, round-robin, random).
|
||||
* Once the AM and the balancer have found a place for the region the procedure
|
||||
* will be resumed and an "open region" request will be placed in the Remote Dispatcher
|
||||
* queue, and the procedure once again will go in a "waiting state".
|
||||
* The Remote Dispatcher will batch the various requests for that server and
|
||||
* they will be sent to the RS for execution.
|
||||
* The RS will complete the open operation by calling master.reportRegionStateTransition().
|
||||
* The AM will intercept the transition report, and notify the procedure.
|
||||
* The procedure will finish the assignment by publishing to new state on meta
|
||||
* or it will retry the assignment.
|
||||
*
|
||||
* <p>This procedure does not rollback when beyond the first
|
||||
* REGION_TRANSITION_QUEUE step; it will press on trying to assign in the face of
|
||||
* failure. Should we ignore rollback calls to Assign/Unassign then? Or just
|
||||
* remove rollback here?
|
||||
* Leave here only for checking if we can successfully start the master.
|
||||
* @deprecated Do not use any more.
|
||||
* @see TransitRegionStateProcedure
|
||||
*/
|
||||
// TODO: Add being able to assign a region to open read-only.
|
||||
@Deprecated
|
||||
@InterfaceAudience.Private
|
||||
public class AssignProcedure extends RegionTransitionProcedure {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(AssignProcedure.class);
|
||||
|
||||
/**
|
||||
* Set to true when we need recalibrate -- choose a new target -- because original assign failed.
|
||||
*/
|
||||
private boolean forceNewPlan = false;
|
||||
|
||||
/**
|
||||
* Gets set as desired target on move, merge, etc., when we want to go to a particular server.
|
||||
* We may not be able to respect this request but will try. When it is NOT set, then we ask
|
||||
* the balancer to assign. This value is used below in startTransition to set regionLocation if
|
||||
* non-null. Setting regionLocation in regionServerNode is how we override balancer setting
|
||||
* destination.
|
||||
*/
|
||||
protected volatile ServerName targetServer;
|
||||
|
||||
/**
|
||||
* Comparator that will sort AssignProcedures so meta assigns come first, then system table
|
||||
* assigns and finally user space assigns.
|
||||
*/
|
||||
public static final CompareAssignProcedure COMPARATOR = new CompareAssignProcedure();
|
||||
|
||||
public AssignProcedure() {
|
||||
// Required by the Procedure framework to create the procedure on replay
|
||||
super();
|
||||
}
|
||||
|
||||
public AssignProcedure(final RegionInfo regionInfo) {
|
||||
super(regionInfo);
|
||||
this.targetServer = null;
|
||||
}
|
||||
|
||||
public AssignProcedure(final RegionInfo regionInfo, final ServerName destinationServer) {
|
||||
super(regionInfo);
|
||||
this.targetServer = destinationServer;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -124,10 +66,9 @@ public class AssignProcedure extends RegionTransitionProcedure {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void serializeStateData(ProcedureStateSerializer serializer)
|
||||
throws IOException {
|
||||
final AssignRegionStateData.Builder state = AssignRegionStateData.newBuilder()
|
||||
.setTransitionState(getTransitionState())
|
||||
protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
final AssignRegionStateData.Builder state =
|
||||
AssignRegionStateData.newBuilder().setTransitionState(getTransitionState())
|
||||
.setRegionInfo(ProtobufUtil.toRegionInfo(getRegionInfo()));
|
||||
if (forceNewPlan) {
|
||||
state.setForceNewPlan(true);
|
||||
|
@ -142,8 +83,7 @@ public class AssignProcedure extends RegionTransitionProcedure {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void deserializeStateData(ProcedureStateSerializer serializer)
|
||||
throws IOException {
|
||||
protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
final AssignRegionStateData state = serializer.deserialize(AssignRegionStateData.class);
|
||||
setTransitionState(state.getTransitionState());
|
||||
setRegionInfo(ProtobufUtil.toRegionInfo(state.getRegionInfo()));
|
||||
|
@ -159,202 +99,36 @@ public class AssignProcedure extends RegionTransitionProcedure {
|
|||
@Override
|
||||
protected boolean startTransition(final MasterProcedureEnv env, final RegionStateNode regionNode)
|
||||
throws IOException {
|
||||
// If the region is already open we can't do much...
|
||||
if (regionNode.isInState(State.OPEN) && isServerOnline(env, regionNode)) {
|
||||
LOG.info("Assigned, not reassigning; " + this + "; " + regionNode.toShortString());
|
||||
return false;
|
||||
}
|
||||
// Don't assign if table is in disabling or disabled state.
|
||||
TableStateManager tsm = env.getMasterServices().getTableStateManager();
|
||||
TableName tn = regionNode.getRegionInfo().getTable();
|
||||
if (tsm.getTableState(tn).isDisabledOrDisabling()) {
|
||||
LOG.info("Table " + tn + " state=" + tsm.getTableState(tn) + ", skipping " + this);
|
||||
return false;
|
||||
}
|
||||
// If the region is SPLIT, we can't assign it. But state might be CLOSED, rather than
|
||||
// SPLIT which is what a region gets set to when unassigned as part of SPLIT. FIX.
|
||||
if (regionNode.isInState(State.SPLIT) ||
|
||||
(regionNode.getRegionInfo().isOffline() && regionNode.getRegionInfo().isSplit())) {
|
||||
LOG.info("SPLIT, cannot be assigned; " + this + "; " + regionNode +
|
||||
"; hri=" + regionNode.getRegionInfo());
|
||||
return false;
|
||||
}
|
||||
|
||||
// If we haven't started the operation yet, we can abort
|
||||
if (aborted.get() && regionNode.isInState(State.CLOSED, State.OFFLINE)) {
|
||||
if (incrementAndCheckMaxAttempts(env, regionNode)) {
|
||||
regionNode.setState(State.FAILED_OPEN);
|
||||
setFailure(getClass().getSimpleName(),
|
||||
new RetriesExhaustedException("Max attempts exceeded"));
|
||||
} else {
|
||||
setAbortFailure(getClass().getSimpleName(), "Abort requested");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Send assign (add into assign-pool). We call regionNode.offline below to set state to
|
||||
// OFFLINE and to clear the region location. Setting a new regionLocation here is how we retain
|
||||
// old assignment or specify target server if a move or merge. See
|
||||
// AssignmentManager#processAssignQueue. Otherwise, balancer gives us location.
|
||||
// TODO: Region will be set into OFFLINE state below regardless of what its previous state was
|
||||
// This is dangerous? Wrong? What if region was in an unexpected state?
|
||||
ServerName lastRegionLocation = regionNode.offline();
|
||||
boolean retain = false;
|
||||
if (!forceNewPlan) {
|
||||
if (this.targetServer != null) {
|
||||
retain = targetServer.equals(lastRegionLocation);
|
||||
regionNode.setRegionLocation(targetServer);
|
||||
} else {
|
||||
if (lastRegionLocation != null) {
|
||||
// Try and keep the location we had before we offlined.
|
||||
retain = true;
|
||||
regionNode.setRegionLocation(lastRegionLocation);
|
||||
} else if (regionNode.getLastHost() != null) {
|
||||
retain = true;
|
||||
LOG.info("Setting lastHost as the region location " + regionNode.getLastHost());
|
||||
regionNode.setRegionLocation(regionNode.getLastHost());
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG.info("Starting " + this + "; " + regionNode.toShortString() +
|
||||
"; forceNewPlan=" + this.forceNewPlan +
|
||||
", retain=" + retain);
|
||||
env.getAssignmentManager().queueAssign(regionNode);
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean updateTransition(final MasterProcedureEnv env, final RegionStateNode regionNode)
|
||||
throws IOException, ProcedureSuspendedException {
|
||||
// TODO: crash if destinationServer is specified and not online
|
||||
// which is also the case when the balancer provided us with a different location.
|
||||
if (LOG.isTraceEnabled()) {
|
||||
LOG.trace("Update " + this + "; " + regionNode.toShortString());
|
||||
}
|
||||
if (regionNode.getRegionLocation() == null) {
|
||||
setTransitionState(RegionTransitionState.REGION_TRANSITION_QUEUE);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!isServerOnline(env, regionNode)) {
|
||||
// TODO: is this correct? should we wait the chore/ssh?
|
||||
LOG.info("Server not online, re-queuing " + this + "; " + regionNode.toShortString());
|
||||
setTransitionState(RegionTransitionState.REGION_TRANSITION_QUEUE);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (env.getAssignmentManager().waitServerReportEvent(regionNode.getRegionLocation(), this)) {
|
||||
LOG.info("Early suspend! " + this + "; " + regionNode.toShortString());
|
||||
throw new ProcedureSuspendedException();
|
||||
}
|
||||
|
||||
if (regionNode.isInState(State.OPEN)) {
|
||||
LOG.info("Already assigned: " + this + "; " + regionNode.toShortString());
|
||||
return false;
|
||||
}
|
||||
|
||||
// Transition regionNode State. Set it to OPENING. Update hbase:meta, and add
|
||||
// region to list of regions on the target regionserver. Need to UNDO if failure!
|
||||
env.getAssignmentManager().markRegionAsOpening(regionNode);
|
||||
|
||||
// TODO: Requires a migration to be open by the RS?
|
||||
// regionNode.getFormatVersion()
|
||||
|
||||
if (!addToRemoteDispatcher(env, regionNode.getRegionLocation())) {
|
||||
// Failed the dispatch BUT addToRemoteDispatcher internally does
|
||||
// cleanup on failure -- even the undoing of markRegionAsOpening above --
|
||||
// so nothing more to do here; in fact we need to get out of here
|
||||
// fast since we've been put back on the scheduler.
|
||||
}
|
||||
|
||||
// We always return true, even if we fail dispatch because addToRemoteDispatcher
|
||||
// failure processing sets state back to REGION_TRANSITION_QUEUE so we try again;
|
||||
// i.e. return true to keep the Procedure running; it has been reset to startover.
|
||||
throws IOException, ProcedureSuspendedException {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void finishTransition(final MasterProcedureEnv env, final RegionStateNode regionNode)
|
||||
throws IOException {
|
||||
env.getAssignmentManager().markRegionAsOpened(regionNode);
|
||||
// This success may have been after we failed open a few times. Be sure to cleanup any
|
||||
// failed open references. See #incrementAndCheckMaxAttempts and where it is called.
|
||||
env.getAssignmentManager().getRegionStates().removeFromFailedOpen(regionNode.getRegionInfo());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void reportTransition(final MasterProcedureEnv env, final RegionStateNode regionNode,
|
||||
final TransitionCode code, final long openSeqNum) throws UnexpectedStateException {
|
||||
switch (code) {
|
||||
case OPENED:
|
||||
if (openSeqNum < 0) {
|
||||
throw new UnexpectedStateException("Received report unexpected " + code +
|
||||
" transition openSeqNum=" + openSeqNum + ", " + regionNode);
|
||||
}
|
||||
if (openSeqNum < regionNode.getOpenSeqNum()) {
|
||||
// Don't bother logging if openSeqNum == 0
|
||||
if (openSeqNum != 0) {
|
||||
LOG.warn("Skipping update of open seqnum with " + openSeqNum +
|
||||
" because current seqnum=" + regionNode.getOpenSeqNum());
|
||||
}
|
||||
} else {
|
||||
regionNode.setOpenSeqNum(openSeqNum);
|
||||
}
|
||||
// Leave the state here as OPENING for now. We set it to OPEN in
|
||||
// REGION_TRANSITION_FINISH section where we do a bunch of checks.
|
||||
// regionNode.setState(RegionState.State.OPEN, RegionState.State.OPENING);
|
||||
setTransitionState(RegionTransitionState.REGION_TRANSITION_FINISH);
|
||||
break;
|
||||
case FAILED_OPEN:
|
||||
handleFailure(env, regionNode);
|
||||
break;
|
||||
default:
|
||||
throw new UnexpectedStateException("Received report unexpected " + code +
|
||||
" transition openSeqNum=" + openSeqNum + ", " + regionNode.toShortString() +
|
||||
", " + this + ", expected OPENED or FAILED_OPEN.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Called when dispatch or subsequent OPEN request fail. Can be run by the
|
||||
* inline dispatch call or later by the ServerCrashProcedure. Our state is
|
||||
* generally OPENING. Cleanup and reset to OFFLINE and put our Procedure
|
||||
* State back to REGION_TRANSITION_QUEUE so the Assign starts over.
|
||||
*/
|
||||
private void handleFailure(final MasterProcedureEnv env, final RegionStateNode regionNode) {
|
||||
if (incrementAndCheckMaxAttempts(env, regionNode)) {
|
||||
aborted.set(true);
|
||||
}
|
||||
this.forceNewPlan = true;
|
||||
this.targetServer = null;
|
||||
regionNode.offline();
|
||||
// We were moved to OPENING state before dispatch. Undo. It is safe to call
|
||||
// this method because it checks for OPENING first.
|
||||
env.getAssignmentManager().undoRegionAsOpening(regionNode);
|
||||
setTransitionState(RegionTransitionState.REGION_TRANSITION_QUEUE);
|
||||
}
|
||||
|
||||
private boolean incrementAndCheckMaxAttempts(final MasterProcedureEnv env,
|
||||
final RegionStateNode regionNode) {
|
||||
final int retries = env.getAssignmentManager().getRegionStates().
|
||||
addToFailedOpen(regionNode).incrementAndGetRetries();
|
||||
int max = env.getAssignmentManager().getAssignMaxAttempts();
|
||||
LOG.info("Retry=" + retries + " of max=" + max + "; " +
|
||||
this + "; " + regionNode.toShortString());
|
||||
return retries >= max;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RemoteOperation remoteCallBuild(final MasterProcedureEnv env, final ServerName serverName) {
|
||||
public RemoteOperation remoteCallBuild(final MasterProcedureEnv env,
|
||||
final ServerName serverName) {
|
||||
assert serverName.equals(getRegionState(env).getRegionLocation());
|
||||
return new RegionOpenOperation(this, getRegionInfo(),
|
||||
env.getAssignmentManager().getFavoredNodes(getRegionInfo()), false);
|
||||
env.getAssignmentManager().getFavoredNodes(getRegionInfo()), false);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean remoteCallFailed(final MasterProcedureEnv env, final RegionStateNode regionNode,
|
||||
final IOException exception) {
|
||||
handleFailure(env, regionNode);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -364,44 +138,8 @@ public class AssignProcedure extends RegionTransitionProcedure {
|
|||
if (this.targetServer != null) sb.append(", target=").append(this.targetServer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ServerName getServer(final MasterProcedureEnv env) {
|
||||
RegionStateNode node =
|
||||
env.getAssignmentManager().getRegionStates().getRegionStateNode(this.getRegionInfo());
|
||||
if (node == null) return null;
|
||||
return node.getRegionLocation();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ProcedureMetrics getProcedureMetrics(MasterProcedureEnv env) {
|
||||
return env.getAssignmentManager().getAssignmentManagerMetrics().getAssignProcMetrics();
|
||||
}
|
||||
|
||||
/**
|
||||
* Sort AssignProcedures such that meta and system assigns come first before user-space assigns.
|
||||
* Have to do it this way w/ distinct Comparator because Procedure is already Comparable on
|
||||
* 'Env'(?).
|
||||
*/
|
||||
public static class CompareAssignProcedure implements Comparator<AssignProcedure> {
|
||||
@Override
|
||||
public int compare(AssignProcedure left, AssignProcedure right) {
|
||||
if (left.getRegionInfo().isMetaRegion()) {
|
||||
if (right.getRegionInfo().isMetaRegion()) {
|
||||
return RegionInfo.COMPARATOR.compare(left.getRegionInfo(), right.getRegionInfo());
|
||||
}
|
||||
return -1;
|
||||
} else if (right.getRegionInfo().isMetaRegion()) {
|
||||
return +1;
|
||||
}
|
||||
if (left.getRegionInfo().getTable().isSystemTable()) {
|
||||
if (right.getRegionInfo().getTable().isSystemTable()) {
|
||||
return RegionInfo.COMPARATOR.compare(left.getRegionInfo(), right.getRegionInfo());
|
||||
}
|
||||
return -1;
|
||||
} else if (right.getRegionInfo().getTable().isSystemTable()) {
|
||||
return +1;
|
||||
}
|
||||
return RegionInfo.COMPARATOR.compare(left.getRegionInfo(), right.getRegionInfo());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,195 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.hadoop.hbase.HBaseIOException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||
import org.apache.hadoop.hbase.favored.FavoredNodesManager;
|
||||
import org.apache.hadoop.hbase.ipc.HBaseRpcController;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.wal.WALSplitter;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminService;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.GetRegionInfoRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.GetRegionInfoResponse;
|
||||
|
||||
/**
|
||||
* Utility for this assignment package only.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
final class AssignmentManagerUtil {
|
||||
private AssignmentManagerUtil() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Raw call to remote regionserver to get info on a particular region.
|
||||
* @throws IOException Let it out so can report this IOE as reason for failure
|
||||
*/
|
||||
static GetRegionInfoResponse getRegionInfoResponse(final MasterProcedureEnv env,
|
||||
final ServerName regionLocation, final RegionInfo hri) throws IOException {
|
||||
return getRegionInfoResponse(env, regionLocation, hri, false);
|
||||
}
|
||||
|
||||
static GetRegionInfoResponse getRegionInfoResponse(final MasterProcedureEnv env,
|
||||
final ServerName regionLocation, final RegionInfo hri, boolean includeBestSplitRow)
|
||||
throws IOException {
|
||||
// TODO: There is no timeout on this controller. Set one!
|
||||
HBaseRpcController controller =
|
||||
env.getMasterServices().getClusterConnection().getRpcControllerFactory().newController();
|
||||
final AdminService.BlockingInterface admin =
|
||||
env.getMasterServices().getClusterConnection().getAdmin(regionLocation);
|
||||
GetRegionInfoRequest request = null;
|
||||
if (includeBestSplitRow) {
|
||||
request = RequestConverter.buildGetRegionInfoRequest(hri.getRegionName(), false, true);
|
||||
} else {
|
||||
request = RequestConverter.buildGetRegionInfoRequest(hri.getRegionName());
|
||||
}
|
||||
try {
|
||||
return admin.getRegionInfo(controller, request);
|
||||
} catch (ServiceException e) {
|
||||
throw ProtobufUtil.handleRemoteException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static void lock(List<RegionStateNode> regionNodes) {
|
||||
regionNodes.iterator().forEachRemaining(RegionStateNode::lock);
|
||||
}
|
||||
|
||||
private static void unlock(List<RegionStateNode> regionNodes) {
|
||||
for (ListIterator<RegionStateNode> iter = regionNodes.listIterator(regionNodes.size()); iter
|
||||
.hasPrevious();) {
|
||||
iter.previous().unlock();
|
||||
}
|
||||
}
|
||||
|
||||
static TransitRegionStateProcedure[] createUnassignProceduresForSplitOrMerge(
|
||||
MasterProcedureEnv env, Stream<RegionInfo> regions, int regionReplication)
|
||||
throws IOException {
|
||||
List<RegionStateNode> regionNodes = regions
|
||||
.flatMap(hri -> IntStream.range(0, regionReplication)
|
||||
.mapToObj(i -> RegionReplicaUtil.getRegionInfoForReplica(hri, i)))
|
||||
.map(env.getAssignmentManager().getRegionStates()::getOrCreateRegionStateNode)
|
||||
.collect(Collectors.toList());
|
||||
TransitRegionStateProcedure[] procs = new TransitRegionStateProcedure[regionNodes.size()];
|
||||
boolean rollback = true;
|
||||
int i = 0;
|
||||
// hold the lock at once, and then release it in finally. This is important as SCP may jump in
|
||||
// if we release the lock in the middle when we want to do rollback, and cause problems.
|
||||
lock(regionNodes);
|
||||
try {
|
||||
for (; i < procs.length; i++) {
|
||||
RegionStateNode regionNode = regionNodes.get(i);
|
||||
TransitRegionStateProcedure proc =
|
||||
TransitRegionStateProcedure.unassign(env, regionNode.getRegionInfo());
|
||||
if (regionNode.getProcedure() != null) {
|
||||
throw new HBaseIOException(
|
||||
"The parent region " + regionNode + " is currently in transition, give up");
|
||||
}
|
||||
regionNode.setProcedure(proc);
|
||||
procs[i] = proc;
|
||||
}
|
||||
// all succeeded, set rollback to false
|
||||
rollback = false;
|
||||
} finally {
|
||||
if (rollback) {
|
||||
for (;;) {
|
||||
i--;
|
||||
if (i < 0) {
|
||||
break;
|
||||
}
|
||||
RegionStateNode regionNode = regionNodes.get(i);
|
||||
regionNode.unsetProcedure(procs[i]);
|
||||
}
|
||||
}
|
||||
unlock(regionNodes);
|
||||
}
|
||||
return procs;
|
||||
}
|
||||
|
||||
private static TransitRegionStateProcedure[] createAssignProcedures(MasterProcedureEnv env,
|
||||
Stream<RegionInfo> regions, int regionReplication, ServerName targetServer) {
|
||||
return regions
|
||||
.flatMap(hri -> IntStream.range(0, regionReplication)
|
||||
.mapToObj(i -> RegionReplicaUtil.getRegionInfoForReplica(hri, i)))
|
||||
.map(env.getAssignmentManager().getRegionStates()::getOrCreateRegionStateNode)
|
||||
.map(regionNode -> {
|
||||
TransitRegionStateProcedure proc =
|
||||
TransitRegionStateProcedure.assign(env, regionNode.getRegionInfo(), targetServer);
|
||||
regionNode.lock();
|
||||
try {
|
||||
// should never fail, as we have the exclusive region lock, and the region is newly
|
||||
// created, or has been successfully closed so should not be on any servers, so SCP will
|
||||
// not process it either.
|
||||
assert !regionNode.isInTransition();
|
||||
regionNode.setProcedure(proc);
|
||||
} finally {
|
||||
regionNode.unlock();
|
||||
}
|
||||
return proc;
|
||||
}).toArray(TransitRegionStateProcedure[]::new);
|
||||
}
|
||||
|
||||
static TransitRegionStateProcedure[] createAssignProceduresForOpeningNewRegions(
|
||||
MasterProcedureEnv env, Stream<RegionInfo> regions, int regionReplication,
|
||||
ServerName targetServer) {
|
||||
return createAssignProcedures(env, regions, regionReplication, targetServer);
|
||||
}
|
||||
|
||||
static void reopenRegionsForRollback(MasterProcedureEnv env, Stream<RegionInfo> regions,
|
||||
int regionReplication, ServerName targetServer) {
|
||||
TransitRegionStateProcedure[] procs =
|
||||
createAssignProcedures(env, regions, regionReplication, targetServer);
|
||||
env.getMasterServices().getMasterProcedureExecutor().submitProcedures(procs);
|
||||
}
|
||||
|
||||
static void removeNonDefaultReplicas(MasterProcedureEnv env, Stream<RegionInfo> regions,
|
||||
int regionReplication) {
|
||||
// Remove from in-memory states
|
||||
regions.flatMap(hri -> IntStream.range(1, regionReplication)
|
||||
.mapToObj(i -> RegionReplicaUtil.getRegionInfoForReplica(hri, i))).forEach(hri -> {
|
||||
env.getAssignmentManager().getRegionStates().deleteRegion(hri);
|
||||
env.getMasterServices().getServerManager().removeRegion(hri);
|
||||
FavoredNodesManager fnm = env.getMasterServices().getFavoredNodesManager();
|
||||
if (fnm != null) {
|
||||
fnm.deleteFavoredNodesForRegions(Collections.singletonList(hri));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
static void checkClosedRegion(MasterProcedureEnv env, RegionInfo regionInfo) throws IOException {
|
||||
if (WALSplitter.hasRecoveredEdits(env.getMasterServices().getFileSystem(),
|
||||
env.getMasterConfiguration(), regionInfo)) {
|
||||
throw new IOException("Recovered.edits are found in Region: " + regionInfo +
|
||||
", abort split/merge to prevent data loss");
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionCloseOperation;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.CloseRegionProcedureStateData;
|
||||
|
||||
/**
|
||||
* The remote procedure used to close a region.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class CloseRegionProcedure extends RegionRemoteProcedureBase {
|
||||
|
||||
// For a region move operation, we will assign the region after we unassign it, this is the target
|
||||
// server for the subsequent assign. We will send this value to RS, and RS will record the region
|
||||
// in a Map to tell client that where the region has been moved to. Can be null. And also, can be
|
||||
// wrong(but do not make it wrong intentionally). The client can handle this error.
|
||||
private ServerName assignCandidate;
|
||||
|
||||
public CloseRegionProcedure() {
|
||||
super();
|
||||
}
|
||||
|
||||
public CloseRegionProcedure(RegionInfo region, ServerName targetServer,
|
||||
ServerName assignCandidate) {
|
||||
super(region, targetServer);
|
||||
this.assignCandidate = assignCandidate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TableOperationType getTableOperationType() {
|
||||
return TableOperationType.REGION_UNASSIGN;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RemoteOperation remoteCallBuild(MasterProcedureEnv env, ServerName remote) {
|
||||
return new RegionCloseOperation(this, region, assignCandidate);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
super.serializeStateData(serializer);
|
||||
CloseRegionProcedureStateData.Builder builder = CloseRegionProcedureStateData.newBuilder();
|
||||
if (assignCandidate != null) {
|
||||
builder.setAssignCandidate(ProtobufUtil.toServerName(assignCandidate));
|
||||
}
|
||||
serializer.serialize(builder.build());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
super.deserializeStateData(serializer);
|
||||
CloseRegionProcedureStateData data =
|
||||
serializer.deserialize(CloseRegionProcedureStateData.class);
|
||||
if (data.hasAssignCandidate()) {
|
||||
assignCandidate = ProtobufUtil.toServerName(data.getAssignCandidate());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
/*
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
|
@ -15,7 +15,6 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -23,7 +22,7 @@ import java.util.ArrayList;
|
|||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
@ -38,7 +37,6 @@ import org.apache.hadoop.hbase.client.MasterSwitchType;
|
|||
import org.apache.hadoop.hbase.client.Mutation;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||
import org.apache.hadoop.hbase.client.TableDescriptor;
|
||||
import org.apache.hadoop.hbase.exceptions.MergeRegionException;
|
||||
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
|
||||
|
@ -64,7 +62,9 @@ import org.apache.hadoop.hbase.wal.WALSplitter;
|
|||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.GetRegionInfoResponse;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
|
||||
|
@ -72,10 +72,13 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.M
|
|||
|
||||
/**
|
||||
* The procedure to Merge a region in a table.
|
||||
* <p/>
|
||||
* This procedure takes an exclusive table lock since it is working over multiple regions.
|
||||
* <p/>
|
||||
* It holds the lock for the life of the procedure.
|
||||
* <p>Throws exception on construction if determines context hostile to merge (cluster going
|
||||
* down or master is shutting down or table is disabled).</p>
|
||||
* <p/>
|
||||
* Throws exception on construction if determines context hostile to merge (cluster going down or
|
||||
* master is shutting down or table is disabled).
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class MergeTableRegionsProcedure
|
||||
|
@ -216,6 +219,20 @@ public class MergeTableRegionsProcedure
|
|||
return rid;
|
||||
}
|
||||
|
||||
|
||||
private void removeNonDefaultReplicas(MasterProcedureEnv env) throws IOException {
|
||||
AssignmentManagerUtil.removeNonDefaultReplicas(env, Stream.of(regionsToMerge),
|
||||
getRegionReplication(env));
|
||||
}
|
||||
|
||||
private void checkClosedRegions(MasterProcedureEnv env) throws IOException {
|
||||
// theoretically this should not happen any more after we use TRSP, but anyway let's add a check
|
||||
// here
|
||||
for (RegionInfo region : regionsToMerge) {
|
||||
AssignmentManagerUtil.checkClosedRegion(env, region);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Flow executeFromState(final MasterProcedureEnv env,
|
||||
MergeTableRegionsState state) {
|
||||
|
@ -234,27 +251,15 @@ public class MergeTableRegionsProcedure
|
|||
setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CLOSE_REGIONS);
|
||||
break;
|
||||
case MERGE_TABLE_REGIONS_CLOSE_REGIONS:
|
||||
addChildProcedure(createUnassignProcedures(env, getRegionReplication(env)));
|
||||
addChildProcedure(createUnassignProcedures(env));
|
||||
setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CHECK_CLOSED_REGIONS);
|
||||
break;
|
||||
case MERGE_TABLE_REGIONS_CHECK_CLOSED_REGIONS:
|
||||
List<RegionInfo> ris = hasRecoveredEdits(env);
|
||||
if (ris.isEmpty()) {
|
||||
setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CREATE_MERGED_REGION);
|
||||
} else {
|
||||
// Need to reopen parent regions to pickup missed recovered.edits. Do it by creating
|
||||
// child assigns and then stepping back to MERGE_TABLE_REGIONS_CLOSE_REGIONS.
|
||||
// Just assign the primary regions recovering the missed recovered.edits -- no replicas.
|
||||
// May need to cycle here a few times if heavy writes.
|
||||
// TODO: Add an assign read-only.
|
||||
for (RegionInfo ri: ris) {
|
||||
LOG.info("Found recovered.edits under {}, reopen to pickup missed edits!", ri);
|
||||
addChildProcedure(env.getAssignmentManager().createAssignProcedure(ri));
|
||||
}
|
||||
setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CLOSE_REGIONS);
|
||||
}
|
||||
checkClosedRegions(env);
|
||||
setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_CREATE_MERGED_REGION);
|
||||
break;
|
||||
case MERGE_TABLE_REGIONS_CREATE_MERGED_REGION:
|
||||
removeNonDefaultReplicas(env);
|
||||
createMergedRegion(env);
|
||||
setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_WRITE_MAX_SEQUENCE_ID_FILE);
|
||||
break;
|
||||
|
@ -275,7 +280,7 @@ public class MergeTableRegionsProcedure
|
|||
setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_OPEN_MERGED_REGION);
|
||||
break;
|
||||
case MERGE_TABLE_REGIONS_OPEN_MERGED_REGION:
|
||||
addChildProcedure(createAssignProcedures(env, getRegionReplication(env)));
|
||||
addChildProcedure(createAssignProcedures(env));
|
||||
setNextState(MergeTableRegionsState.MERGE_TABLE_REGIONS_POST_OPERATION);
|
||||
break;
|
||||
case MERGE_TABLE_REGIONS_POST_OPERATION:
|
||||
|
@ -470,26 +475,8 @@ public class MergeTableRegionsProcedure
|
|||
return env.getAssignmentManager().getAssignmentManagerMetrics().getMergeProcMetrics();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return list of regions that have recovered.edits... usually its an empty list.
|
||||
* @param env the master env
|
||||
* @throws IOException IOException
|
||||
*/
|
||||
private List<RegionInfo> hasRecoveredEdits(final MasterProcedureEnv env) throws IOException {
|
||||
List<RegionInfo> ris = new ArrayList<RegionInfo>(regionsToMerge.length);
|
||||
for (int i = 0; i < regionsToMerge.length; i++) {
|
||||
RegionInfo ri = regionsToMerge[i];
|
||||
if (SplitTableRegionProcedure.hasRecoveredEdits(env, ri)) {
|
||||
ris.add(ri);
|
||||
}
|
||||
}
|
||||
return ris;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare merge and do some check
|
||||
* @param env MasterProcedureEnv
|
||||
* @throws IOException
|
||||
*/
|
||||
private boolean prepareMergeRegion(final MasterProcedureEnv env) throws IOException {
|
||||
// Note: the following logic assumes that we only have 2 regions to merge. In the future,
|
||||
|
@ -559,9 +546,9 @@ public class MergeTableRegionsProcedure
|
|||
}
|
||||
|
||||
private boolean isMergeable(final MasterProcedureEnv env, final RegionState rs)
|
||||
throws IOException {
|
||||
throws IOException {
|
||||
GetRegionInfoResponse response =
|
||||
Util.getRegionInfoResponse(env, rs.getServerName(), rs.getRegion());
|
||||
AssignmentManagerUtil.getRegionInfoResponse(env, rs.getServerName(), rs.getRegion());
|
||||
return response.hasMergeable() && response.getMergeable();
|
||||
}
|
||||
|
||||
|
@ -598,9 +585,8 @@ public class MergeTableRegionsProcedure
|
|||
|
||||
/**
|
||||
* Set the region states to MERGING state
|
||||
* @param env MasterProcedureEnv
|
||||
*/
|
||||
public void setRegionStateToMerging(final MasterProcedureEnv env) {
|
||||
private void setRegionStateToMerging(final MasterProcedureEnv env) {
|
||||
// Set State.MERGING to regions to be merged
|
||||
RegionStates regionStates = env.getAssignmentManager().getRegionStates();
|
||||
regionStates.getRegionStateNode(regionsToMerge[0]).setState(State.MERGING);
|
||||
|
@ -675,49 +661,22 @@ public class MergeTableRegionsProcedure
|
|||
|
||||
/**
|
||||
* Rollback close regions
|
||||
* @param env MasterProcedureEnv
|
||||
**/
|
||||
private void rollbackCloseRegionsForMerge(final MasterProcedureEnv env) throws IOException {
|
||||
// Check whether the region is closed; if so, open it in the same server
|
||||
final int regionReplication = getRegionReplication(env);
|
||||
final ServerName serverName = getServerName(env);
|
||||
|
||||
final AssignProcedure[] procs =
|
||||
new AssignProcedure[regionsToMerge.length * regionReplication];
|
||||
int procsIdx = 0;
|
||||
for (int i = 0; i < regionsToMerge.length; ++i) {
|
||||
for (int j = 0; j < regionReplication; ++j) {
|
||||
final RegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(regionsToMerge[i], j);
|
||||
procs[procsIdx++] = env.getAssignmentManager().createAssignProcedure(hri, serverName);
|
||||
}
|
||||
}
|
||||
env.getMasterServices().getMasterProcedureExecutor().submitProcedures(procs);
|
||||
private void rollbackCloseRegionsForMerge(MasterProcedureEnv env) throws IOException {
|
||||
AssignmentManagerUtil.reopenRegionsForRollback(env, Stream.of(regionsToMerge),
|
||||
getRegionReplication(env), getServerName(env));
|
||||
}
|
||||
|
||||
private UnassignProcedure[] createUnassignProcedures(final MasterProcedureEnv env,
|
||||
final int regionReplication) {
|
||||
final UnassignProcedure[] procs =
|
||||
new UnassignProcedure[regionsToMerge.length * regionReplication];
|
||||
int procsIdx = 0;
|
||||
for (int i = 0; i < regionsToMerge.length; ++i) {
|
||||
for (int j = 0; j < regionReplication; ++j) {
|
||||
final RegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(regionsToMerge[i], j);
|
||||
procs[procsIdx++] = env.getAssignmentManager().
|
||||
createUnassignProcedure(hri, null, true, !RegionReplicaUtil.isDefaultReplica(hri));
|
||||
}
|
||||
}
|
||||
return procs;
|
||||
private TransitRegionStateProcedure[] createUnassignProcedures(MasterProcedureEnv env)
|
||||
throws IOException {
|
||||
return AssignmentManagerUtil.createUnassignProceduresForSplitOrMerge(env,
|
||||
Stream.of(regionsToMerge), getRegionReplication(env));
|
||||
}
|
||||
|
||||
private AssignProcedure[] createAssignProcedures(final MasterProcedureEnv env,
|
||||
final int regionReplication) {
|
||||
final ServerName targetServer = getServerName(env);
|
||||
final AssignProcedure[] procs = new AssignProcedure[regionReplication];
|
||||
for (int i = 0; i < procs.length; ++i) {
|
||||
final RegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(mergedRegion, i);
|
||||
procs[i] = env.getAssignmentManager().createAssignProcedure(hri, targetServer);
|
||||
}
|
||||
return procs;
|
||||
private TransitRegionStateProcedure[] createAssignProcedures(MasterProcedureEnv env)
|
||||
throws IOException {
|
||||
return AssignmentManagerUtil.createAssignProceduresForOpeningNewRegions(env,
|
||||
Stream.of(mergedRegion), getRegionReplication(env), getServerName(env));
|
||||
}
|
||||
|
||||
private int getRegionReplication(final MasterProcedureEnv env) throws IOException {
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
/*
|
||||
*
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
|
@ -16,109 +15,40 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hbase.HBaseIOException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.master.RegionPlan;
|
||||
import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MoveRegionState;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.MoveRegionStateData;
|
||||
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Procedure that implements a RegionPlan.
|
||||
* It first runs an unassign subprocedure followed
|
||||
* by an assign subprocedure. It takes a lock on the region being moved.
|
||||
* It holds the lock for the life of the procedure.
|
||||
*
|
||||
* <p>Throws exception on construction if determines context hostile to move (cluster going
|
||||
* down or master is shutting down or table is disabled).</p>
|
||||
* Leave here only for checking if we can successfully start the master.
|
||||
* @deprecated Do not use any more.
|
||||
* @see TransitRegionStateProcedure
|
||||
*/
|
||||
@Deprecated
|
||||
@InterfaceAudience.Private
|
||||
public class MoveRegionProcedure extends AbstractStateMachineRegionProcedure<MoveRegionState> {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(MoveRegionProcedure.class);
|
||||
private RegionPlan plan;
|
||||
|
||||
public MoveRegionProcedure() {
|
||||
// Required by the Procedure framework to create the procedure on replay
|
||||
super();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
protected RegionPlan getPlan() {
|
||||
return this.plan;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param check whether we should do some checks in the constructor. We will skip the checks if we
|
||||
* are reopening a region as this may fail the whole procedure and cause stuck. We will
|
||||
* do the check later when actually executing the procedure so not a big problem.
|
||||
* @throws IOException If the cluster is offline or master is stopping or if table is disabled or
|
||||
* non-existent.
|
||||
*/
|
||||
public MoveRegionProcedure(MasterProcedureEnv env, RegionPlan plan, boolean check)
|
||||
throws HBaseIOException {
|
||||
super(env, plan.getRegionInfo());
|
||||
this.plan = plan;
|
||||
if (check) {
|
||||
preflightChecks(env, true);
|
||||
checkOnline(env, plan.getRegionInfo());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Flow executeFromState(final MasterProcedureEnv env, final MoveRegionState state)
|
||||
throws InterruptedException {
|
||||
LOG.trace("{} execute state={}", this, state);
|
||||
switch (state) {
|
||||
case MOVE_REGION_PREPARE:
|
||||
// Check context again and that region is online; do it here after we have lock on region.
|
||||
try {
|
||||
preflightChecks(env, true);
|
||||
checkOnline(env, this.plan.getRegionInfo());
|
||||
if (!env.getMasterServices().getServerManager().isServerOnline(this.plan.getSource())) {
|
||||
throw new HBaseIOException(this.plan.getSource() + " not online");
|
||||
}
|
||||
} catch (HBaseIOException e) {
|
||||
LOG.warn(this.toString() + " FAILED because " + e.toString());
|
||||
return Flow.NO_MORE_STATE;
|
||||
}
|
||||
break;
|
||||
case MOVE_REGION_UNASSIGN:
|
||||
try {
|
||||
checkOnline(env, this.plan.getRegionInfo());
|
||||
} catch (DoNotRetryRegionException dnrre) {
|
||||
LOG.info("Skipping move, {} is not online; {}", getRegion().getEncodedName(), this,
|
||||
dnrre);
|
||||
return Flow.NO_MORE_STATE;
|
||||
}
|
||||
addChildProcedure(new UnassignProcedure(plan.getRegionInfo(), plan.getSource(),
|
||||
plan.getDestination(), true));
|
||||
setNextState(MoveRegionState.MOVE_REGION_ASSIGN);
|
||||
break;
|
||||
case MOVE_REGION_ASSIGN:
|
||||
AssignProcedure assignProcedure = plan.getDestination() == null ?
|
||||
new AssignProcedure(plan.getRegionInfo()):
|
||||
new AssignProcedure(plan.getRegionInfo(), plan.getDestination());
|
||||
addChildProcedure(assignProcedure);
|
||||
return Flow.NO_MORE_STATE;
|
||||
default:
|
||||
throw new UnsupportedOperationException("unhandled state=" + state);
|
||||
}
|
||||
return Flow.HAS_MORE_STATE;
|
||||
return Flow.NO_MORE_STATE;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.OpenRegionProcedureStateData;
|
||||
|
||||
/**
|
||||
* The remote procedure used to open a region.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class OpenRegionProcedure extends RegionRemoteProcedureBase {
|
||||
|
||||
public OpenRegionProcedure() {
|
||||
super();
|
||||
}
|
||||
|
||||
public OpenRegionProcedure(RegionInfo region, ServerName targetServer) {
|
||||
super(region, targetServer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TableOperationType getTableOperationType() {
|
||||
return TableOperationType.REGION_ASSIGN;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RemoteOperation remoteCallBuild(MasterProcedureEnv env, ServerName remote) {
|
||||
return new RegionOpenOperation(this, region, env.getAssignmentManager().getFavoredNodes(region),
|
||||
false);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
super.serializeStateData(serializer);
|
||||
serializer.serialize(OpenRegionProcedureStateData.getDefaultInstance());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
super.deserializeStateData(serializer);
|
||||
serializer.deserialize(OpenRegionProcedureStateData.class);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,157 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface;
|
||||
import org.apache.hadoop.hbase.procedure2.FailedRemoteDispatchException;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
|
||||
import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure;
|
||||
import org.apache.hadoop.hbase.procedure2.RemoteProcedureException;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseStateData;
|
||||
|
||||
/**
|
||||
* The base class for the remote procedures used to open/close a region.
|
||||
* <p/>
|
||||
* Notice that here we do not care about the result of the remote call, if the remote call is
|
||||
* finished, either succeeded or not, we will always finish the procedure. The parent procedure
|
||||
* should take care of the result and try to reschedule if the result is not good.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedureEnv>
|
||||
implements TableProcedureInterface, RemoteProcedure<MasterProcedureEnv, ServerName> {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(RegionRemoteProcedureBase.class);
|
||||
|
||||
protected RegionInfo region;
|
||||
|
||||
private ServerName targetServer;
|
||||
|
||||
private boolean dispatched;
|
||||
|
||||
protected RegionRemoteProcedureBase() {
|
||||
}
|
||||
|
||||
protected RegionRemoteProcedureBase(RegionInfo region, ServerName targetServer) {
|
||||
this.region = region;
|
||||
this.targetServer = targetServer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remoteOperationCompleted(MasterProcedureEnv env) {
|
||||
// should not be called since we use reportRegionStateTransition to report the result
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remoteOperationFailed(MasterProcedureEnv env, RemoteProcedureException error) {
|
||||
// should not be called since we use reportRegionStateTransition to report the result
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
private ProcedureEvent<?> getRegionEvent(MasterProcedureEnv env) {
|
||||
return env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(region)
|
||||
.getProcedureEvent();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remoteCallFailed(MasterProcedureEnv env, ServerName remote,
|
||||
IOException exception) {
|
||||
ProcedureEvent<?> event = getRegionEvent(env);
|
||||
synchronized (event) {
|
||||
if (event.isReady()) {
|
||||
LOG.warn(
|
||||
"The procedure event of procedure {} for region {} to server {} is not suspended, " +
|
||||
"usually this should not happen, but anyway let's skip the following wake up code, ",
|
||||
this, region, targetServer);
|
||||
return;
|
||||
}
|
||||
LOG.warn("The remote operation {} for region {} to server {} failed", this, region,
|
||||
targetServer, exception);
|
||||
event.wake(env.getProcedureScheduler());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TableName getTableName() {
|
||||
return region.getTable();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean abort(MasterProcedureEnv env) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
|
||||
throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
|
||||
if (dispatched) {
|
||||
// we are done, the parent procedure will check whether we are succeeded.
|
||||
return null;
|
||||
}
|
||||
ProcedureEvent<?> event = getRegionEvent(env);
|
||||
synchronized (event) {
|
||||
try {
|
||||
env.getRemoteDispatcher().addOperationToNode(targetServer, this);
|
||||
} catch (FailedRemoteDispatchException e) {
|
||||
LOG.warn("Can not add remote operation {} for region {} to server {}, this usually " +
|
||||
"because the server is alread dead, give up and mark the procedure as complete, " +
|
||||
"the parent procedure will take care of this.", this, region, targetServer, e);
|
||||
return null;
|
||||
}
|
||||
dispatched = true;
|
||||
event.suspend();
|
||||
event.suspendIfNotReady(this);
|
||||
throw new ProcedureSuspendedException();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
serializer.serialize(RegionRemoteProcedureBaseStateData.newBuilder()
|
||||
.setRegion(ProtobufUtil.toRegionInfo(region))
|
||||
.setTargetServer(ProtobufUtil.toServerName(targetServer)).setDispatched(dispatched).build());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
RegionRemoteProcedureBaseStateData data =
|
||||
serializer.deserialize(RegionRemoteProcedureBaseStateData.class);
|
||||
region = ProtobufUtil.toRegionInfo(data.getRegion());
|
||||
targetServer = ProtobufUtil.toServerName(data.getTargetServer());
|
||||
dispatched = data.getDispatched();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,313 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.locks.Lock;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionOfflineException;
|
||||
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
||||
import org.apache.hadoop.hbase.master.RegionState;
|
||||
import org.apache.hadoop.hbase.master.RegionState.State;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
|
||||
|
||||
/**
|
||||
* Current Region State. Most fields are synchronized with meta region, i.e, we will update meta
|
||||
* immediately after we modify this RegionStateNode, and usually under the lock. The only exception
|
||||
* is {@link #lastHost}, which should not be used for critical condition.
|
||||
* <p/>
|
||||
* Typically, the only way to modify this class is through {@link TransitRegionStateProcedure}, and
|
||||
* we will record the TRSP along with this RegionStateNode to make sure that there could at most one
|
||||
* TRSP. For other operations, such as SCP, we will first get the lock, and then try to schedule a
|
||||
* TRSP. If there is already one, then the solution will be different:
|
||||
* <ul>
|
||||
* <li>For SCP, we will update the region state in meta to tell the TRSP to retry.</li>
|
||||
* <li>For DisableTableProcedure, as we have the xlock, we can make sure that the TRSP has not been
|
||||
* executed yet, so just unset it and attach a new one. The original one will quit immediately when
|
||||
* executing.</li>
|
||||
* <li>For split/merge, we will fail immediately as there is no actual operations yet so no
|
||||
* harm.</li>
|
||||
* <li>For EnableTableProcedure/TruncateTableProcedure, we can make sure that there will be no TRSP
|
||||
* attached with the RSNs.</li>
|
||||
* <li>For other procedures, you'd better use ReopenTableRegionsProcedure. The RTRP will take care
|
||||
* of lots of corner cases when reopening regions.</li>
|
||||
* </ul>
|
||||
* <p/>
|
||||
* Several fields are declared with {@code volatile}, which means you are free to get it without
|
||||
* lock, but usually you should not use these fields without locking for critical condition, as it
|
||||
* will be easily to introduce inconsistency. For example, you are free to dump the status and show
|
||||
* it on web without locking, but if you want to change the state of the RegionStateNode by checking
|
||||
* the current state, you'd better have the lock...
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class RegionStateNode implements Comparable<RegionStateNode> {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(RegionStateNode.class);
|
||||
|
||||
private static final class AssignmentProcedureEvent extends ProcedureEvent<RegionInfo> {
|
||||
public AssignmentProcedureEvent(final RegionInfo regionInfo) {
|
||||
super(regionInfo);
|
||||
}
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
final Lock lock = new ReentrantLock();
|
||||
private final RegionInfo regionInfo;
|
||||
private final ProcedureEvent<?> event;
|
||||
private final ConcurrentMap<RegionInfo, RegionStateNode> ritMap;
|
||||
|
||||
// volatile only for getLastUpdate and test usage, the upper layer should sync on the
|
||||
// RegionStateNode before accessing usually.
|
||||
private volatile TransitRegionStateProcedure procedure = null;
|
||||
private volatile ServerName regionLocation = null;
|
||||
// notice that, the lastHost will only be updated when a region is successfully CLOSED through
|
||||
// UnassignProcedure, so do not use it for critical condition as the data maybe stale and unsync
|
||||
// with the data in meta.
|
||||
private volatile ServerName lastHost = null;
|
||||
/**
|
||||
* A Region-in-Transition (RIT) moves through states. See {@link State} for complete list. A
|
||||
* Region that is opened moves from OFFLINE => OPENING => OPENED.
|
||||
*/
|
||||
private volatile State state = State.OFFLINE;
|
||||
|
||||
/**
|
||||
* Updated whenever a call to {@link #setRegionLocation(ServerName)} or
|
||||
* {@link #setState(State, State...)}.
|
||||
*/
|
||||
private volatile long lastUpdate = 0;
|
||||
|
||||
private volatile long openSeqNum = HConstants.NO_SEQNUM;
|
||||
|
||||
RegionStateNode(RegionInfo regionInfo, ConcurrentMap<RegionInfo, RegionStateNode> ritMap) {
|
||||
this.regionInfo = regionInfo;
|
||||
this.event = new AssignmentProcedureEvent(regionInfo);
|
||||
this.ritMap = ritMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param update new region state this node should be assigned.
|
||||
* @param expected current state should be in this given list of expected states
|
||||
* @return true, if current state is in expected list; otherwise false.
|
||||
*/
|
||||
public boolean setState(final State update, final State... expected) {
|
||||
if (!isInState(expected)) {
|
||||
return false;
|
||||
}
|
||||
this.state = update;
|
||||
this.lastUpdate = EnvironmentEdgeManager.currentTime();
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Put region into OFFLINE mode (set state and clear location).
|
||||
* @return Last recorded server deploy
|
||||
*/
|
||||
public ServerName offline() {
|
||||
setState(State.OFFLINE);
|
||||
return setRegionLocation(null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set new {@link State} but only if currently in <code>expected</code> State (if not, throw
|
||||
* {@link UnexpectedStateException}.
|
||||
*/
|
||||
public void transitionState(final State update, final State... expected)
|
||||
throws UnexpectedStateException {
|
||||
if (!setState(update, expected)) {
|
||||
throw new UnexpectedStateException("Expected " + Arrays.toString(expected) +
|
||||
" so could move to " + update + " but current state=" + getState());
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isInState(final State... expected) {
|
||||
if (expected != null && expected.length > 0) {
|
||||
boolean expectedState = false;
|
||||
for (int i = 0; i < expected.length; ++i) {
|
||||
expectedState |= (getState() == expected[i]);
|
||||
}
|
||||
return expectedState;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean isStuck() {
|
||||
return isInState(State.FAILED_OPEN) && getProcedure() != null;
|
||||
}
|
||||
|
||||
public boolean isInTransition() {
|
||||
return getProcedure() != null;
|
||||
}
|
||||
|
||||
public long getLastUpdate() {
|
||||
TransitRegionStateProcedure proc = this.procedure;
|
||||
return proc != null ? proc.getLastUpdate() : lastUpdate;
|
||||
}
|
||||
|
||||
public void setLastHost(final ServerName serverName) {
|
||||
this.lastHost = serverName;
|
||||
}
|
||||
|
||||
public void setOpenSeqNum(final long seqId) {
|
||||
this.openSeqNum = seqId;
|
||||
}
|
||||
|
||||
public ServerName setRegionLocation(final ServerName serverName) {
|
||||
ServerName lastRegionLocation = this.regionLocation;
|
||||
if (LOG.isTraceEnabled() && serverName == null) {
|
||||
LOG.trace("Tracking when we are set to null " + this, new Throwable("TRACE"));
|
||||
}
|
||||
this.regionLocation = serverName;
|
||||
this.lastUpdate = EnvironmentEdgeManager.currentTime();
|
||||
return lastRegionLocation;
|
||||
}
|
||||
|
||||
public void setProcedure(TransitRegionStateProcedure proc) {
|
||||
assert this.procedure == null;
|
||||
this.procedure = proc;
|
||||
ritMap.put(regionInfo, this);
|
||||
}
|
||||
|
||||
public void unsetProcedure(TransitRegionStateProcedure proc) {
|
||||
assert this.procedure == proc;
|
||||
this.procedure = null;
|
||||
ritMap.remove(regionInfo, this);
|
||||
}
|
||||
|
||||
public TransitRegionStateProcedure getProcedure() {
|
||||
return procedure;
|
||||
}
|
||||
|
||||
public ProcedureEvent<?> getProcedureEvent() {
|
||||
return event;
|
||||
}
|
||||
|
||||
public RegionInfo getRegionInfo() {
|
||||
return regionInfo;
|
||||
}
|
||||
|
||||
public TableName getTable() {
|
||||
return getRegionInfo().getTable();
|
||||
}
|
||||
|
||||
public boolean isSystemTable() {
|
||||
return getTable().isSystemTable();
|
||||
}
|
||||
|
||||
public ServerName getLastHost() {
|
||||
return lastHost;
|
||||
}
|
||||
|
||||
public ServerName getRegionLocation() {
|
||||
return regionLocation;
|
||||
}
|
||||
|
||||
public State getState() {
|
||||
return state;
|
||||
}
|
||||
|
||||
public long getOpenSeqNum() {
|
||||
return openSeqNum;
|
||||
}
|
||||
|
||||
public int getFormatVersion() {
|
||||
// we don't have any format for now
|
||||
// it should probably be in regionInfo.getFormatVersion()
|
||||
return 0;
|
||||
}
|
||||
|
||||
public RegionState toRegionState() {
|
||||
return new RegionState(getRegionInfo(), getState(), getLastUpdate(), getRegionLocation());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final RegionStateNode other) {
|
||||
// NOTE: RegionInfo sort by table first, so we are relying on that.
|
||||
// we have a TestRegionState#testOrderedByTable() that check for that.
|
||||
return RegionInfo.COMPARATOR.compare(getRegionInfo(), other.getRegionInfo());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return getRegionInfo().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
if (!(other instanceof RegionStateNode)) {
|
||||
return false;
|
||||
}
|
||||
return compareTo((RegionStateNode) other) == 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return toDescriptiveString();
|
||||
}
|
||||
|
||||
public String toShortString() {
|
||||
// rit= is the current Region-In-Transition State -- see State enum.
|
||||
return String.format("rit=%s, location=%s", getState(), getRegionLocation());
|
||||
}
|
||||
|
||||
public String toDescriptiveString() {
|
||||
return String.format("%s, table=%s, region=%s", toShortString(), getTable(),
|
||||
getRegionInfo().getEncodedName());
|
||||
}
|
||||
|
||||
public void checkOnline() throws DoNotRetryRegionException {
|
||||
RegionInfo ri = getRegionInfo();
|
||||
State s = state;
|
||||
if (s != State.OPEN) {
|
||||
throw new DoNotRetryRegionException(ri.getEncodedName() + " is no OPEN; state=" + s);
|
||||
}
|
||||
if (ri.isSplitParent()) {
|
||||
throw new DoNotRetryRegionException(
|
||||
ri.getEncodedName() + " is not online (splitParent=true)");
|
||||
}
|
||||
if (ri.isSplit()) {
|
||||
throw new DoNotRetryRegionException(ri.getEncodedName() + " has split=true");
|
||||
}
|
||||
if (ri.isOffline()) {
|
||||
// RegionOfflineException is not instance of DNRIOE so wrap it.
|
||||
throw new DoNotRetryRegionException(new RegionOfflineException(ri.getEncodedName()));
|
||||
}
|
||||
}
|
||||
|
||||
public void lock() {
|
||||
lock.lock();
|
||||
}
|
||||
|
||||
public void unlock() {
|
||||
lock.unlock();
|
||||
}
|
||||
}
|
|
@ -127,7 +127,7 @@ public class RegionStateStore {
|
|||
}
|
||||
}
|
||||
|
||||
public void updateRegionLocation(RegionStates.RegionStateNode regionStateNode)
|
||||
public void updateRegionLocation(RegionStateNode regionStateNode)
|
||||
throws IOException {
|
||||
if (regionStateNode.getRegionInfo().isMetaRegion()) {
|
||||
updateMetaLocation(regionStateNode.getRegionInfo(), regionStateNode.getRegionLocation(),
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
/**
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
|
@ -16,11 +15,9 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
|
@ -28,7 +25,6 @@ import java.util.HashMap;
|
|||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
@ -41,12 +37,9 @@ import org.apache.hadoop.hbase.HRegionLocation;
|
|||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
||||
import org.apache.hadoop.hbase.master.RegionState;
|
||||
import org.apache.hadoop.hbase.master.RegionState.State;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -62,247 +55,22 @@ import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesti
|
|||
public class RegionStates {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(RegionStates.class);
|
||||
|
||||
protected static final State[] STATES_EXPECTED_ON_OPEN = new State[] {
|
||||
// TODO: need to be more specific, i.e, OPENING vs. OPEN, CLOSING vs. CLOSED.
|
||||
static final State[] STATES_EXPECTED_ON_OPEN = new State[] {
|
||||
State.OPEN, // State may already be OPEN if we died after receiving the OPEN from regionserver
|
||||
// but before complete finish of AssignProcedure. HBASE-20100.
|
||||
State.OFFLINE, State.CLOSED, // disable/offline
|
||||
State.SPLITTING, State.SPLIT, // ServerCrashProcedure
|
||||
State.OFFLINE, State.CLOSED, State.ABNORMALLY_CLOSED, // disable/offline
|
||||
State.SPLITTING, // ServerCrashProcedure
|
||||
State.OPENING, State.FAILED_OPEN, // already in-progress (retrying)
|
||||
State.MERGED, State.SPLITTING_NEW
|
||||
};
|
||||
|
||||
protected static final State[] STATES_EXPECTED_ON_CLOSE = new State[] {
|
||||
State.SPLITTING, State.SPLIT, State.MERGING, // ServerCrashProcedure
|
||||
static final State[] STATES_EXPECTED_ON_CLOSE = new State[] {
|
||||
State.SPLITTING, State.MERGING, State.OPENING, // ServerCrashProcedure
|
||||
State.OPEN, // enabled/open
|
||||
State.CLOSING // already in-progress (retrying)
|
||||
};
|
||||
|
||||
private static class AssignmentProcedureEvent extends ProcedureEvent<RegionInfo> {
|
||||
public AssignmentProcedureEvent(final RegionInfo regionInfo) {
|
||||
super(regionInfo);
|
||||
}
|
||||
}
|
||||
|
||||
private static class ServerReportEvent extends ProcedureEvent<ServerName> {
|
||||
public ServerReportEvent(final ServerName serverName) {
|
||||
super(serverName);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Current Region State.
|
||||
* In-memory only. Not persisted.
|
||||
*/
|
||||
// Mutable/Immutable? Changes have to be synchronized or not?
|
||||
// Data members are volatile which seems to say multi-threaded access is fine.
|
||||
// In the below we do check and set but the check state could change before
|
||||
// we do the set because no synchronization....which seems dodgy. Clear up
|
||||
// understanding here... how many threads accessing? Do locks make it so one
|
||||
// thread at a time working on a single Region's RegionStateNode? Lets presume
|
||||
// so for now. Odd is that elsewhere in this RegionStates, we synchronize on
|
||||
// the RegionStateNode instance. TODO.
|
||||
public static class RegionStateNode implements Comparable<RegionStateNode> {
|
||||
private final RegionInfo regionInfo;
|
||||
private final ProcedureEvent<?> event;
|
||||
|
||||
private volatile RegionTransitionProcedure procedure = null;
|
||||
private volatile ServerName regionLocation = null;
|
||||
// notice that, the lastHost will only be updated when a region is successfully CLOSED through
|
||||
// UnassignProcedure, so do not use it for critical condition as the data maybe stale and unsync
|
||||
// with the data in meta.
|
||||
private volatile ServerName lastHost = null;
|
||||
/**
|
||||
* A Region-in-Transition (RIT) moves through states.
|
||||
* See {@link State} for complete list. A Region that
|
||||
* is opened moves from OFFLINE => OPENING => OPENED.
|
||||
*/
|
||||
private volatile State state = State.OFFLINE;
|
||||
|
||||
/**
|
||||
* Updated whenever a call to {@link #setRegionLocation(ServerName)}
|
||||
* or {@link #setState(State, State...)}.
|
||||
*/
|
||||
private volatile long lastUpdate = 0;
|
||||
|
||||
private volatile long openSeqNum = HConstants.NO_SEQNUM;
|
||||
|
||||
public RegionStateNode(final RegionInfo regionInfo) {
|
||||
this.regionInfo = regionInfo;
|
||||
this.event = new AssignmentProcedureEvent(regionInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param update new region state this node should be assigned.
|
||||
* @param expected current state should be in this given list of expected states
|
||||
* @return true, if current state is in expected list; otherwise false.
|
||||
*/
|
||||
public boolean setState(final State update, final State... expected) {
|
||||
if (!isInState(expected)) {
|
||||
return false;
|
||||
}
|
||||
this.state = update;
|
||||
this.lastUpdate = EnvironmentEdgeManager.currentTime();
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Put region into OFFLINE mode (set state and clear location).
|
||||
* @return Last recorded server deploy
|
||||
*/
|
||||
public ServerName offline() {
|
||||
setState(State.OFFLINE);
|
||||
return setRegionLocation(null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set new {@link State} but only if currently in <code>expected</code> State
|
||||
* (if not, throw {@link UnexpectedStateException}.
|
||||
*/
|
||||
public void transitionState(final State update, final State... expected)
|
||||
throws UnexpectedStateException {
|
||||
if (!setState(update, expected)) {
|
||||
throw new UnexpectedStateException("Expected " + Arrays.toString(expected) +
|
||||
" so could move to " + update + " but current state=" + getState());
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isInState(final State... expected) {
|
||||
if (expected != null && expected.length > 0) {
|
||||
boolean expectedState = false;
|
||||
for (int i = 0; i < expected.length; ++i) {
|
||||
expectedState |= (getState() == expected[i]);
|
||||
}
|
||||
return expectedState;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean isStuck() {
|
||||
return isInState(State.FAILED_OPEN) && getProcedure() != null;
|
||||
}
|
||||
|
||||
public boolean isInTransition() {
|
||||
return getProcedure() != null;
|
||||
}
|
||||
|
||||
public long getLastUpdate() {
|
||||
return procedure != null ? procedure.getLastUpdate() : lastUpdate;
|
||||
}
|
||||
|
||||
public void setLastHost(final ServerName serverName) {
|
||||
this.lastHost = serverName;
|
||||
}
|
||||
|
||||
public void setOpenSeqNum(final long seqId) {
|
||||
this.openSeqNum = seqId;
|
||||
}
|
||||
|
||||
public ServerName setRegionLocation(final ServerName serverName) {
|
||||
ServerName lastRegionLocation = this.regionLocation;
|
||||
if (LOG.isTraceEnabled() && serverName == null) {
|
||||
LOG.trace("Tracking when we are set to null " + this, new Throwable("TRACE"));
|
||||
}
|
||||
this.regionLocation = serverName;
|
||||
this.lastUpdate = EnvironmentEdgeManager.currentTime();
|
||||
return lastRegionLocation;
|
||||
}
|
||||
|
||||
public boolean setProcedure(final RegionTransitionProcedure proc) {
|
||||
if (this.procedure != null && this.procedure != proc) {
|
||||
return false;
|
||||
}
|
||||
this.procedure = proc;
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean unsetProcedure(final RegionTransitionProcedure proc) {
|
||||
if (this.procedure != null && this.procedure != proc) {
|
||||
return false;
|
||||
}
|
||||
this.procedure = null;
|
||||
return true;
|
||||
}
|
||||
|
||||
public RegionTransitionProcedure getProcedure() {
|
||||
return procedure;
|
||||
}
|
||||
|
||||
public ProcedureEvent<?> getProcedureEvent() {
|
||||
return event;
|
||||
}
|
||||
|
||||
public RegionInfo getRegionInfo() {
|
||||
return regionInfo;
|
||||
}
|
||||
|
||||
public TableName getTable() {
|
||||
return getRegionInfo().getTable();
|
||||
}
|
||||
|
||||
public boolean isSystemTable() {
|
||||
return getTable().isSystemTable();
|
||||
}
|
||||
|
||||
public ServerName getLastHost() {
|
||||
return lastHost;
|
||||
}
|
||||
|
||||
public ServerName getRegionLocation() {
|
||||
return regionLocation;
|
||||
}
|
||||
|
||||
public State getState() {
|
||||
return state;
|
||||
}
|
||||
|
||||
public long getOpenSeqNum() {
|
||||
return openSeqNum;
|
||||
}
|
||||
|
||||
public int getFormatVersion() {
|
||||
// we don't have any format for now
|
||||
// it should probably be in regionInfo.getFormatVersion()
|
||||
return 0;
|
||||
}
|
||||
|
||||
public RegionState toRegionState() {
|
||||
return new RegionState(getRegionInfo(), getState(), getLastUpdate(), getRegionLocation());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final RegionStateNode other) {
|
||||
// NOTE: RegionInfo sort by table first, so we are relying on that.
|
||||
// we have a TestRegionState#testOrderedByTable() that check for that.
|
||||
return RegionInfo.COMPARATOR.compare(getRegionInfo(), other.getRegionInfo());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return getRegionInfo().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) return true;
|
||||
if (!(other instanceof RegionStateNode)) return false;
|
||||
return compareTo((RegionStateNode)other) == 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return toDescriptiveString();
|
||||
}
|
||||
|
||||
public String toShortString() {
|
||||
// rit= is the current Region-In-Transition State -- see State enum.
|
||||
return String.format("rit=%s, location=%s", getState(), getRegionLocation());
|
||||
}
|
||||
|
||||
public String toDescriptiveString() {
|
||||
return String.format("%s, table=%s, region=%s",
|
||||
toShortString(), getTable(), getRegionInfo().getEncodedName());
|
||||
}
|
||||
}
|
||||
|
||||
// This comparator sorts the RegionStates by time stamp then Region name.
|
||||
// Comparing by timestamp alone can lead us to discard different RegionStates that happen
|
||||
// to share a timestamp.
|
||||
|
@ -314,130 +82,6 @@ public class RegionStates {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Server State.
|
||||
*/
|
||||
public enum ServerState {
|
||||
/**
|
||||
* Initial state. Available.
|
||||
*/
|
||||
ONLINE,
|
||||
|
||||
/**
|
||||
* Only server which carries meta can have this state. We will split wal for meta and then
|
||||
* assign meta first before splitting other wals.
|
||||
*/
|
||||
SPLITTING_META,
|
||||
|
||||
/**
|
||||
* Indicate that the meta splitting is done. We need this state so that the UnassignProcedure
|
||||
* for meta can safely quit. See the comments in UnassignProcedure.remoteCallFailed for more
|
||||
* details.
|
||||
*/
|
||||
SPLITTING_META_DONE,
|
||||
|
||||
/**
|
||||
* Server expired/crashed. Currently undergoing WAL splitting.
|
||||
*/
|
||||
SPLITTING,
|
||||
|
||||
/**
|
||||
* WAL splitting done. This state will be used to tell the UnassignProcedure that it can safely
|
||||
* quit. See the comments in UnassignProcedure.remoteCallFailed for more details.
|
||||
*/
|
||||
OFFLINE
|
||||
}
|
||||
|
||||
/**
|
||||
* State of Server; list of hosted regions, etc.
|
||||
*/
|
||||
public static class ServerStateNode implements Comparable<ServerStateNode> {
|
||||
private final ServerReportEvent reportEvent;
|
||||
|
||||
private final Set<RegionStateNode> regions;
|
||||
private final ServerName serverName;
|
||||
|
||||
private volatile ServerState state = ServerState.ONLINE;
|
||||
|
||||
public ServerStateNode(final ServerName serverName) {
|
||||
this.serverName = serverName;
|
||||
this.regions = ConcurrentHashMap.newKeySet();
|
||||
this.reportEvent = new ServerReportEvent(serverName);
|
||||
}
|
||||
|
||||
public ServerName getServerName() {
|
||||
return serverName;
|
||||
}
|
||||
|
||||
public ServerState getState() {
|
||||
return state;
|
||||
}
|
||||
|
||||
public ProcedureEvent<?> getReportEvent() {
|
||||
return reportEvent;
|
||||
}
|
||||
|
||||
public boolean isInState(final ServerState... expected) {
|
||||
boolean expectedState = false;
|
||||
if (expected != null) {
|
||||
for (int i = 0; i < expected.length; ++i) {
|
||||
expectedState |= (state == expected[i]);
|
||||
}
|
||||
}
|
||||
return expectedState;
|
||||
}
|
||||
|
||||
private void setState(final ServerState state) {
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
public Set<RegionStateNode> getRegions() {
|
||||
return regions;
|
||||
}
|
||||
|
||||
public int getRegionCount() {
|
||||
return regions.size();
|
||||
}
|
||||
|
||||
public ArrayList<RegionInfo> getRegionInfoList() {
|
||||
ArrayList<RegionInfo> hris = new ArrayList<RegionInfo>(regions.size());
|
||||
for (RegionStateNode region: regions) {
|
||||
hris.add(region.getRegionInfo());
|
||||
}
|
||||
return hris;
|
||||
}
|
||||
|
||||
public void addRegion(final RegionStateNode regionNode) {
|
||||
this.regions.add(regionNode);
|
||||
}
|
||||
|
||||
public void removeRegion(final RegionStateNode regionNode) {
|
||||
this.regions.remove(regionNode);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final ServerStateNode other) {
|
||||
return getServerName().compareTo(other.getServerName());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return getServerName().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) return true;
|
||||
if (!(other instanceof ServerStateNode)) return false;
|
||||
return compareTo((ServerStateNode)other) == 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("ServerStateNode(%s)", getServerName());
|
||||
}
|
||||
}
|
||||
|
||||
public final static RegionStateStampComparator REGION_STATE_STAMP_COMPARATOR =
|
||||
new RegionStateStampComparator();
|
||||
|
||||
|
@ -482,22 +126,23 @@ public class RegionStates {
|
|||
// ==========================================================================
|
||||
// RegionStateNode helpers
|
||||
// ==========================================================================
|
||||
protected RegionStateNode createRegionStateNode(final RegionInfo regionInfo) {
|
||||
RegionStateNode newNode = new RegionStateNode(regionInfo);
|
||||
@VisibleForTesting
|
||||
RegionStateNode createRegionStateNode(RegionInfo regionInfo) {
|
||||
RegionStateNode newNode = new RegionStateNode(regionInfo, regionInTransition);
|
||||
RegionStateNode oldNode = regionsMap.putIfAbsent(regionInfo.getRegionName(), newNode);
|
||||
return oldNode != null ? oldNode : newNode;
|
||||
}
|
||||
|
||||
protected RegionStateNode getOrCreateRegionStateNode(final RegionInfo regionInfo) {
|
||||
RegionStateNode node = regionsMap.get(regionInfo.getRegionName());
|
||||
public RegionStateNode getOrCreateRegionStateNode(RegionInfo regionInfo) {
|
||||
RegionStateNode node = getRegionStateNodeFromName(regionInfo.getRegionName());
|
||||
return node != null ? node : createRegionStateNode(regionInfo);
|
||||
}
|
||||
|
||||
RegionStateNode getRegionStateNodeFromName(final byte[] regionName) {
|
||||
RegionStateNode getRegionStateNodeFromName(byte[] regionName) {
|
||||
return regionsMap.get(regionName);
|
||||
}
|
||||
|
||||
protected RegionStateNode getRegionStateNode(final RegionInfo regionInfo) {
|
||||
public RegionStateNode getRegionStateNode(RegionInfo regionInfo) {
|
||||
return getRegionStateNodeFromName(regionInfo.getRegionName());
|
||||
}
|
||||
|
||||
|
@ -593,7 +238,8 @@ public class RegionStates {
|
|||
}
|
||||
|
||||
private HRegionLocation createRegionForReopen(RegionStateNode node) {
|
||||
synchronized (node) {
|
||||
node.lock();
|
||||
try {
|
||||
if (!include(node, false)) {
|
||||
return null;
|
||||
}
|
||||
|
@ -605,6 +251,8 @@ public class RegionStates {
|
|||
} else {
|
||||
return null;
|
||||
}
|
||||
} finally {
|
||||
node.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -649,7 +297,8 @@ public class RegionStates {
|
|||
if (node == null) {
|
||||
return null;
|
||||
}
|
||||
synchronized (node) {
|
||||
node.lock();
|
||||
try {
|
||||
if (oldLoc.getSeqNum() >= 0) {
|
||||
// in OPEN state before
|
||||
if (node.isInState(State.OPEN)) {
|
||||
|
@ -683,6 +332,8 @@ public class RegionStates {
|
|||
return new HRegionLocation(node.getRegionInfo(), node.getRegionLocation(), openSeqNum);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
node.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -726,8 +377,10 @@ public class RegionStates {
|
|||
* @return set of RegionInfo hosted by the specified server
|
||||
*/
|
||||
public List<RegionInfo> getServerRegionInfoSet(final ServerName serverName) {
|
||||
final ServerStateNode serverInfo = getServerNode(serverName);
|
||||
if (serverInfo == null) return Collections.emptyList();
|
||||
ServerStateNode serverInfo = getServerNode(serverName);
|
||||
if (serverInfo == null) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
synchronized (serverInfo) {
|
||||
return serverInfo.getRegionInfoList();
|
||||
|
@ -779,10 +432,13 @@ public class RegionStates {
|
|||
setServerState(serverName, ServerState.OFFLINE);
|
||||
}
|
||||
|
||||
public void updateRegionState(final RegionInfo regionInfo, final State state) {
|
||||
final RegionStateNode regionNode = getOrCreateRegionStateNode(regionInfo);
|
||||
synchronized (regionNode) {
|
||||
public void updateRegionState(RegionInfo regionInfo, State state) {
|
||||
RegionStateNode regionNode = getOrCreateRegionStateNode(regionInfo);
|
||||
regionNode.lock();
|
||||
try {
|
||||
regionNode.setState(state);
|
||||
} finally {
|
||||
regionNode.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -799,11 +455,14 @@ public class RegionStates {
|
|||
return result;
|
||||
}
|
||||
|
||||
public boolean isRegionInState(final RegionInfo regionInfo, final State... state) {
|
||||
final RegionStateNode region = getRegionStateNode(regionInfo);
|
||||
if (region != null) {
|
||||
synchronized (region) {
|
||||
return region.isInState(state);
|
||||
public boolean isRegionInState(RegionInfo regionInfo, State... state) {
|
||||
RegionStateNode regionNode = getRegionStateNode(regionInfo);
|
||||
if (regionNode != null) {
|
||||
regionNode.lock();
|
||||
try {
|
||||
return regionNode.isInState(state);
|
||||
} finally {
|
||||
regionNode.unlock();
|
||||
}
|
||||
}
|
||||
return false;
|
||||
|
@ -866,12 +525,15 @@ public class RegionStates {
|
|||
return tableRegions;
|
||||
}
|
||||
|
||||
public ServerName getRegionServerOfRegion(final RegionInfo regionInfo) {
|
||||
final RegionStateNode region = getRegionStateNode(regionInfo);
|
||||
if (region != null) {
|
||||
synchronized (region) {
|
||||
ServerName server = region.getRegionLocation();
|
||||
return server != null ? server : region.getLastHost();
|
||||
public ServerName getRegionServerOfRegion(RegionInfo regionInfo) {
|
||||
RegionStateNode regionNode = getRegionStateNode(regionInfo);
|
||||
if (regionNode != null) {
|
||||
regionNode.lock();
|
||||
try {
|
||||
ServerName server = regionNode.getRegionLocation();
|
||||
return server != null ? server : regionNode.getLastHost();
|
||||
} finally {
|
||||
regionNode.unlock();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
@ -938,20 +600,6 @@ public class RegionStates {
|
|||
// ==========================================================================
|
||||
// Region in transition helpers
|
||||
// ==========================================================================
|
||||
protected boolean addRegionInTransition(final RegionStateNode regionNode,
|
||||
final RegionTransitionProcedure procedure) {
|
||||
if (procedure != null && !regionNode.setProcedure(procedure)) return false;
|
||||
|
||||
regionInTransition.put(regionNode.getRegionInfo(), regionNode);
|
||||
return true;
|
||||
}
|
||||
|
||||
protected void removeRegionInTransition(final RegionStateNode regionNode,
|
||||
final RegionTransitionProcedure procedure) {
|
||||
regionInTransition.remove(regionNode.getRegionInfo());
|
||||
regionNode.unsetProcedure(procedure);
|
||||
}
|
||||
|
||||
public boolean hasRegionsInTransition() {
|
||||
return !regionInTransition.isEmpty();
|
||||
}
|
||||
|
@ -961,21 +609,17 @@ public class RegionStates {
|
|||
return node != null ? node.isInTransition() : false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return If a procedure-in-transition for <code>hri</code>, return it else null.
|
||||
*/
|
||||
public RegionTransitionProcedure getRegionTransitionProcedure(final RegionInfo hri) {
|
||||
public RegionState getRegionTransitionState(RegionInfo hri) {
|
||||
RegionStateNode node = regionInTransition.get(hri);
|
||||
if (node == null) return null;
|
||||
return node.getProcedure();
|
||||
}
|
||||
if (node == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public RegionState getRegionTransitionState(final RegionInfo hri) {
|
||||
RegionStateNode node = regionInTransition.get(hri);
|
||||
if (node == null) return null;
|
||||
|
||||
synchronized (node) {
|
||||
node.lock();
|
||||
try {
|
||||
return node.isInTransition() ? node.toRegionState() : null;
|
||||
} finally {
|
||||
node.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1110,7 +754,7 @@ public class RegionStates {
|
|||
serverMap.remove(serverName);
|
||||
}
|
||||
|
||||
protected ServerStateNode getServerNode(final ServerName serverName) {
|
||||
ServerStateNode getServerNode(final ServerName serverName) {
|
||||
return serverMap.get(serverName);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
/**
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
|
@ -24,100 +23,41 @@ import org.apache.hadoop.hbase.ServerName;
|
|||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface;
|
||||
import org.apache.hadoop.hbase.procedure2.FailedRemoteDispatchException;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation;
|
||||
import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure;
|
||||
import org.apache.hadoop.hbase.procedure2.RemoteProcedureException;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||
|
||||
/**
|
||||
* Base class for the Assign and Unassign Procedure.
|
||||
*
|
||||
* Locking:
|
||||
* Takes exclusive lock on the region being assigned/unassigned. Thus, there can only be one
|
||||
* RegionTransitionProcedure per region running at a time (see MasterProcedureScheduler).
|
||||
*
|
||||
* <p>This procedure is asynchronous and responds to external events.
|
||||
* The AssignmentManager will notify this procedure when the RS completes
|
||||
* the operation and reports the transitioned state
|
||||
* (see the Assign and Unassign class for more detail).</p>
|
||||
*
|
||||
* <p>Procedures move from the REGION_TRANSITION_QUEUE state when they are
|
||||
* first submitted, to the REGION_TRANSITION_DISPATCH state when the request
|
||||
* to remote server is sent and the Procedure is suspended waiting on external
|
||||
* event to be woken again. Once the external event is triggered, Procedure
|
||||
* moves to the REGION_TRANSITION_FINISH state.</p>
|
||||
*
|
||||
* <p>NOTE: {@link AssignProcedure} and {@link UnassignProcedure} should not be thought of
|
||||
* as being asymmetric, at least currently.
|
||||
* <ul>
|
||||
* <li>{@link AssignProcedure} moves through all the above described states and implements methods
|
||||
* associated with each while {@link UnassignProcedure} starts at state
|
||||
* REGION_TRANSITION_DISPATCH and state REGION_TRANSITION_QUEUE is not supported.</li>
|
||||
*
|
||||
* <li>When any step in {@link AssignProcedure} fails, failure handler
|
||||
* AssignProcedure#handleFailure(MasterProcedureEnv, RegionStateNode) re-attempts the
|
||||
* assignment by setting the procedure state to REGION_TRANSITION_QUEUE and forces
|
||||
* assignment to a different target server by setting {@link AssignProcedure#forceNewPlan}. When
|
||||
* the number of attempts reaches threshold configuration 'hbase.assignment.maximum.attempts',
|
||||
* the procedure is aborted. For {@link UnassignProcedure}, similar re-attempts are
|
||||
* intentionally not implemented. It is a 'one shot' procedure. See its class doc for how it
|
||||
* handles failure.
|
||||
* </li>
|
||||
* <li>If we find a region in an 'unexpected' state, we'll complain and retry with backoff forever.
|
||||
* The 'unexpected' state needs to be fixed either by another running Procedure or by operator
|
||||
* intervention (Regions in 'unexpected' state indicates bug or unexpected transition type).
|
||||
* For this to work, subclasses need to persist the 'attempt' counter kept in this class when
|
||||
* they do serializeStateData and restore it inside their deserializeStateData, just as they do
|
||||
* for {@link #regionInfo}.
|
||||
* </li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* <p>TODO: Considering it is a priority doing all we can to get make a region available as soon as
|
||||
* possible, re-attempting with any target makes sense if specified target fails in case of
|
||||
* {@link AssignProcedure}. For {@link UnassignProcedure}, our concern is preventing data loss
|
||||
* on failed unassign. See class doc for explanation.
|
||||
* Leave here only for checking if we can successfully start the master.
|
||||
* @deprecated Do not use any more.
|
||||
* @see TransitRegionStateProcedure
|
||||
*/
|
||||
@Deprecated
|
||||
@InterfaceAudience.Private
|
||||
public abstract class RegionTransitionProcedure
|
||||
extends Procedure<MasterProcedureEnv>
|
||||
implements TableProcedureInterface,
|
||||
RemoteProcedure<MasterProcedureEnv, ServerName> {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(RegionTransitionProcedure.class);
|
||||
public abstract class RegionTransitionProcedure extends Procedure<MasterProcedureEnv>
|
||||
implements TableProcedureInterface, RemoteProcedure<MasterProcedureEnv, ServerName> {
|
||||
|
||||
protected final AtomicBoolean aborted = new AtomicBoolean(false);
|
||||
|
||||
private RegionTransitionState transitionState = RegionTransitionState.REGION_TRANSITION_QUEUE;
|
||||
/**
|
||||
* This data member must be persisted. Expectation is that it is done by subclasses in their
|
||||
* {@link #serializeStateData(ProcedureStateSerializer)} call, restoring {@link #regionInfo}
|
||||
* in their {@link #deserializeStateData(ProcedureStateSerializer)} method.
|
||||
*/
|
||||
|
||||
private RegionInfo regionInfo;
|
||||
|
||||
/**
|
||||
* Like {@link #regionInfo}, the expectation is that subclasses persist the value of this
|
||||
* data member. It is used doing backoff when Procedure gets stuck.
|
||||
*/
|
||||
private int attempt;
|
||||
|
||||
// Required by the Procedure framework to create the procedure on replay
|
||||
public RegionTransitionProcedure() {}
|
||||
public RegionTransitionProcedure() {
|
||||
}
|
||||
|
||||
public RegionTransitionProcedure(final RegionInfo regionInfo) {
|
||||
this.regionInfo = regionInfo;
|
||||
|
@ -128,22 +68,10 @@ public abstract class RegionTransitionProcedure
|
|||
return regionInfo;
|
||||
}
|
||||
|
||||
/**
|
||||
* This setter is for subclasses to call in their
|
||||
* {@link #deserializeStateData(ProcedureStateSerializer)} method. Expectation is that
|
||||
* subclasses will persist `regioninfo` in their
|
||||
* {@link #serializeStateData(ProcedureStateSerializer)} method and then restore `regionInfo` on
|
||||
* deserialization by calling.
|
||||
*/
|
||||
protected void setRegionInfo(final RegionInfo regionInfo) {
|
||||
this.regionInfo = regionInfo;
|
||||
}
|
||||
|
||||
/**
|
||||
* This setter is for subclasses to call in their
|
||||
* {@link #deserializeStateData(ProcedureStateSerializer)} method.
|
||||
* @see #setRegionInfo(RegionInfo)
|
||||
*/
|
||||
protected void setAttempt(int attempt) {
|
||||
this.attempt = attempt;
|
||||
}
|
||||
|
@ -155,7 +83,7 @@ public abstract class RegionTransitionProcedure
|
|||
@Override
|
||||
public TableName getTableName() {
|
||||
RegionInfo hri = getRegionInfo();
|
||||
return hri != null? hri.getTable(): null;
|
||||
return hri != null ? hri.getTable() : null;
|
||||
}
|
||||
|
||||
public boolean isMeta() {
|
||||
|
@ -168,7 +96,7 @@ public abstract class RegionTransitionProcedure
|
|||
sb.append(" table=");
|
||||
sb.append(getTableName());
|
||||
sb.append(", region=");
|
||||
sb.append(getRegionInfo() == null? null: getRegionInfo().getEncodedName());
|
||||
sb.append(getRegionInfo() == null ? null : getRegionInfo().getEncodedName());
|
||||
}
|
||||
|
||||
public RegionStateNode getRegionState(final MasterProcedureEnv env) {
|
||||
|
@ -184,113 +112,26 @@ public abstract class RegionTransitionProcedure
|
|||
}
|
||||
|
||||
protected abstract boolean startTransition(MasterProcedureEnv env, RegionStateNode regionNode)
|
||||
throws IOException, ProcedureSuspendedException;
|
||||
throws IOException, ProcedureSuspendedException;
|
||||
|
||||
/**
|
||||
* Called when the Procedure is in the REGION_TRANSITION_DISPATCH state.
|
||||
* In here we do the RPC call to OPEN/CLOSE the region. The suspending of
|
||||
* the thread so it sleeps until it gets update that the OPEN/CLOSE has
|
||||
* succeeded is complicated. Read the implementations to learn more.
|
||||
*/
|
||||
protected abstract boolean updateTransition(MasterProcedureEnv env, RegionStateNode regionNode)
|
||||
throws IOException, ProcedureSuspendedException;
|
||||
throws IOException, ProcedureSuspendedException;
|
||||
|
||||
protected abstract void finishTransition(MasterProcedureEnv env, RegionStateNode regionNode)
|
||||
throws IOException, ProcedureSuspendedException;
|
||||
throws IOException, ProcedureSuspendedException;
|
||||
|
||||
protected abstract void reportTransition(MasterProcedureEnv env,
|
||||
RegionStateNode regionNode, TransitionCode code, long seqId) throws UnexpectedStateException;
|
||||
protected abstract void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode,
|
||||
TransitionCode code, long seqId) throws UnexpectedStateException;
|
||||
|
||||
@Override
|
||||
public abstract RemoteOperation remoteCallBuild(MasterProcedureEnv env, ServerName serverName);
|
||||
|
||||
/**
|
||||
* @return True if processing of fail is complete; the procedure will be woken from its suspend
|
||||
* and we'll go back to running through procedure steps:
|
||||
* otherwise if false we leave the procedure in suspended state.
|
||||
*/
|
||||
protected abstract boolean remoteCallFailed(MasterProcedureEnv env,
|
||||
RegionStateNode regionNode, IOException exception);
|
||||
protected abstract boolean remoteCallFailed(MasterProcedureEnv env, RegionStateNode regionNode,
|
||||
IOException exception);
|
||||
|
||||
@Override
|
||||
public synchronized void remoteCallFailed(final MasterProcedureEnv env,
|
||||
final ServerName serverName, final IOException exception) {
|
||||
final RegionStateNode regionNode = getRegionState(env);
|
||||
LOG.warn("Remote call failed {}; {}; {}; exception={}", serverName,
|
||||
this, regionNode.toShortString(), exception.getClass().getSimpleName(), exception);
|
||||
if (remoteCallFailed(env, regionNode, exception)) {
|
||||
// NOTE: This call to wakeEvent puts this Procedure back on the scheduler.
|
||||
// Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond
|
||||
// this method. Just get out of this current processing quickly.
|
||||
regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
|
||||
}
|
||||
// else leave the procedure in suspended state; it is waiting on another call to this callback
|
||||
}
|
||||
|
||||
/**
|
||||
* Be careful! At the end of this method, the procedure has either succeeded
|
||||
* and this procedure has been set into a suspended state OR, we failed and
|
||||
* this procedure has been put back on the scheduler ready for another worker
|
||||
* to pick it up. In both cases, we need to exit the current Worker processing
|
||||
* immediately!
|
||||
* @return True if we successfully dispatched the call and false if we failed;
|
||||
* if failed, we need to roll back any setup done for the dispatch.
|
||||
*/
|
||||
protected boolean addToRemoteDispatcher(final MasterProcedureEnv env,
|
||||
final ServerName targetServer) {
|
||||
LOG.info("Dispatch {}; {}", this, getRegionState(env).toShortString());
|
||||
|
||||
// Put this procedure into suspended mode to wait on report of state change
|
||||
// from remote regionserver. Means Procedure associated ProcedureEvent is marked not 'ready'.
|
||||
getRegionState(env).getProcedureEvent().suspend();
|
||||
|
||||
// Tricky because the below call to addOperationToNode can fail. If it fails, we need to
|
||||
// backtrack on stuff like the 'suspend' done above -- tricky as the 'wake' requests us -- and
|
||||
// ditto up in the caller; it needs to undo state changes. Inside in remoteCallFailed, it does
|
||||
// wake to undo the above suspend.
|
||||
try {
|
||||
env.getRemoteDispatcher().addOperationToNode(targetServer, this);
|
||||
} catch (FailedRemoteDispatchException frde) {
|
||||
remoteCallFailed(env, targetServer, frde);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
protected void reportTransition(final MasterProcedureEnv env, final ServerName serverName,
|
||||
final TransitionCode code, final long seqId) throws UnexpectedStateException {
|
||||
final RegionStateNode regionNode = getRegionState(env);
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Received report " + code + " seqId=" + seqId + ", " +
|
||||
this + "; " + regionNode.toShortString());
|
||||
}
|
||||
if (!serverName.equals(regionNode.getRegionLocation())) {
|
||||
if (isMeta() && regionNode.getRegionLocation() == null) {
|
||||
regionNode.setRegionLocation(serverName);
|
||||
} else {
|
||||
throw new UnexpectedStateException(String.format(
|
||||
"Unexpected state=%s from server=%s; expected server=%s; %s; %s",
|
||||
code, serverName, regionNode.getRegionLocation(),
|
||||
this, regionNode.toShortString()));
|
||||
}
|
||||
}
|
||||
|
||||
reportTransition(env, regionNode, code, seqId);
|
||||
|
||||
// NOTE: This call adds this procedure back on the scheduler.
|
||||
// This makes it so this procedure can run again. Another worker will take
|
||||
// processing to the next stage. At an extreme, the other worker may run in
|
||||
// parallel so DO NOT CHANGE any state hereafter! This should be last thing
|
||||
// done in this processing step.
|
||||
regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
|
||||
}
|
||||
|
||||
protected boolean isServerOnline(final MasterProcedureEnv env, final RegionStateNode regionNode) {
|
||||
return isServerOnline(env, regionNode.getRegionLocation());
|
||||
}
|
||||
|
||||
protected boolean isServerOnline(final MasterProcedureEnv env, final ServerName serverName) {
|
||||
return env.getMasterServices().getServerManager().isServerOnline(serverName);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -303,105 +144,12 @@ public abstract class RegionTransitionProcedure
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Procedure[] execute(final MasterProcedureEnv env) throws ProcedureSuspendedException {
|
||||
final AssignmentManager am = env.getAssignmentManager();
|
||||
final RegionStateNode regionNode = getRegionState(env);
|
||||
if (!am.addRegionInTransition(regionNode, this)) {
|
||||
String msg = String.format(
|
||||
"There is already another procedure running on this region this=%s owner=%s",
|
||||
this, regionNode.getProcedure());
|
||||
LOG.warn(msg + " " + this + "; " + regionNode.toShortString());
|
||||
setAbortFailure(getClass().getSimpleName(), msg);
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
boolean retry;
|
||||
do {
|
||||
retry = false;
|
||||
switch (transitionState) {
|
||||
case REGION_TRANSITION_QUEUE:
|
||||
// 1. push into the AM queue for balancer policy
|
||||
if (!startTransition(env, regionNode)) {
|
||||
// The operation figured it is done or it aborted; check getException()
|
||||
am.removeRegionInTransition(getRegionState(env), this);
|
||||
return null;
|
||||
}
|
||||
transitionState = RegionTransitionState.REGION_TRANSITION_DISPATCH;
|
||||
if (regionNode.getProcedureEvent().suspendIfNotReady(this)) {
|
||||
// Why this suspend? Because we want to ensure Store happens before proceed?
|
||||
throw new ProcedureSuspendedException();
|
||||
}
|
||||
break;
|
||||
|
||||
case REGION_TRANSITION_DISPATCH:
|
||||
// 2. send the request to the target server
|
||||
if (!updateTransition(env, regionNode)) {
|
||||
// The operation figured it is done or it aborted; check getException()
|
||||
am.removeRegionInTransition(regionNode, this);
|
||||
return null;
|
||||
}
|
||||
if (transitionState != RegionTransitionState.REGION_TRANSITION_DISPATCH) {
|
||||
retry = true;
|
||||
break;
|
||||
}
|
||||
if (regionNode.getProcedureEvent().suspendIfNotReady(this)) {
|
||||
throw new ProcedureSuspendedException();
|
||||
}
|
||||
break;
|
||||
|
||||
case REGION_TRANSITION_FINISH:
|
||||
// 3. wait assignment response. completion/failure
|
||||
LOG.debug("Finishing {}; {}", this, regionNode.toShortString());
|
||||
finishTransition(env, regionNode);
|
||||
am.removeRegionInTransition(regionNode, this);
|
||||
return null;
|
||||
}
|
||||
} while (retry);
|
||||
// If here, success so clear out the attempt counter so we start fresh each time we get stuck.
|
||||
this.attempt = 0;
|
||||
} catch (IOException e) {
|
||||
long backoff = getBackoffTime(this.attempt++);
|
||||
LOG.warn("Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed " +
|
||||
"by other Procedure or operator intervention", backoff / 1000, this,
|
||||
regionNode.toShortString(), e);
|
||||
setTimeout(Math.toIntExact(backoff));
|
||||
setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
|
||||
throw new ProcedureSuspendedException();
|
||||
}
|
||||
|
||||
return new Procedure[] {this};
|
||||
}
|
||||
|
||||
private long getBackoffTime(int attempts) {
|
||||
long backoffTime = (long)(1000 * Math.pow(2, attempts));
|
||||
long maxBackoffTime = 60 * 60 * 1000; // An hour. Hard-coded for for now.
|
||||
return backoffTime < maxBackoffTime? backoffTime: maxBackoffTime;
|
||||
}
|
||||
|
||||
/**
|
||||
* At end of timeout, wake ourselves up so we run again.
|
||||
*/
|
||||
@Override
|
||||
protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
|
||||
setState(ProcedureProtos.ProcedureState.RUNNABLE);
|
||||
env.getProcedureScheduler().addFront(this);
|
||||
return false; // 'false' means that this procedure handled the timeout
|
||||
protected Procedure[] execute(final MasterProcedureEnv env) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void rollback(final MasterProcedureEnv env) {
|
||||
if (isRollbackSupported(transitionState)) {
|
||||
// Nothing done up to this point. abort safely.
|
||||
// This should happen when something like disableTable() is triggered.
|
||||
env.getAssignmentManager().removeRegionInTransition(getRegionState(env), this);
|
||||
return;
|
||||
}
|
||||
|
||||
// There is no rollback for assignment unless we cancel the operation by
|
||||
// dropping/disabling the table.
|
||||
throw new UnsupportedOperationException("Unhandled state " + transitionState +
|
||||
"; there is no rollback for assignment unless we cancel the operation by " +
|
||||
"dropping/disabling the table");
|
||||
protected void rollback(MasterProcedureEnv env) {
|
||||
}
|
||||
|
||||
protected abstract boolean isRollbackSupported(final RegionTransitionState state);
|
||||
|
@ -415,54 +163,6 @@ public abstract class RegionTransitionProcedure
|
|||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean waitInitialized(MasterProcedureEnv env) {
|
||||
// Unless we are assigning meta, wait for meta to be available and loaded.
|
||||
if (isMeta()) {
|
||||
return false;
|
||||
}
|
||||
AssignmentManager am = env.getAssignmentManager();
|
||||
return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, regionInfo);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected LockState acquireLock(final MasterProcedureEnv env) {
|
||||
// TODO: Revisit this and move it to the executor
|
||||
if (env.getProcedureScheduler().waitRegion(this, getRegionInfo())) {
|
||||
try {
|
||||
LOG.debug(LockState.LOCK_EVENT_WAIT + " pid=" + getProcId() + " " +
|
||||
env.getProcedureScheduler().dumpLocks());
|
||||
} catch (IOException e) {
|
||||
// ignore, just for logging
|
||||
}
|
||||
return LockState.LOCK_EVENT_WAIT;
|
||||
}
|
||||
return LockState.LOCK_ACQUIRED;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void releaseLock(final MasterProcedureEnv env) {
|
||||
env.getProcedureScheduler().wakeRegion(this, getRegionInfo());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean holdLock(final MasterProcedureEnv env) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean shouldWaitClientAck(MasterProcedureEnv env) {
|
||||
// The operation is triggered internally on the server
|
||||
// the client does not know about this procedure.
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Used by ServerCrashProcedure to see if this Assign/Unassign needs processing.
|
||||
* @return ServerName the Assign or Unassign is going against.
|
||||
*/
|
||||
public abstract ServerName getServer(final MasterProcedureEnv env);
|
||||
|
||||
@Override
|
||||
public void remoteOperationCompleted(MasterProcedureEnv env) {
|
||||
// should not be called for region operation until we modified the open/close region procedure
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/*
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
|
@ -15,28 +15,41 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master;
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
|
||||
/**
|
||||
* Get notification of assignment events. The invocations are inline
|
||||
* so make sure your implementation is fast else you'll slow hbase.
|
||||
* Server State.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public interface AssignmentListener {
|
||||
enum ServerState {
|
||||
/**
|
||||
* The region was opened on the specified server.
|
||||
* @param regionInfo The opened region.
|
||||
* @param serverName The remote servers name.
|
||||
* Initial state. Available.
|
||||
*/
|
||||
void regionOpened(final RegionInfo regionInfo, final ServerName serverName);
|
||||
ONLINE,
|
||||
|
||||
/**
|
||||
* The region was closed on the region server.
|
||||
* @param regionInfo The closed region.
|
||||
* Only server which carries meta can have this state. We will split wal for meta and then
|
||||
* assign meta first before splitting other wals.
|
||||
*/
|
||||
void regionClosed(final RegionInfo regionInfo);
|
||||
}
|
||||
SPLITTING_META,
|
||||
|
||||
/**
|
||||
* Indicate that the meta splitting is done. We need this state so that the UnassignProcedure
|
||||
* for meta can safely quit. See the comments in UnassignProcedure.remoteCallFailed for more
|
||||
* details.
|
||||
*/
|
||||
SPLITTING_META_DONE,
|
||||
|
||||
/**
|
||||
* Server expired/crashed. Currently undergoing WAL splitting.
|
||||
*/
|
||||
SPLITTING,
|
||||
|
||||
/**
|
||||
* WAL splitting done. This state will be used to tell the UnassignProcedure that it can safely
|
||||
* quit. See the comments in UnassignProcedure.remoteCallFailed for more details.
|
||||
*/
|
||||
OFFLINE
|
||||
}
|
|
@ -0,0 +1,128 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
|
||||
/**
|
||||
* State of Server; list of hosted regions, etc.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
class ServerStateNode implements Comparable<ServerStateNode> {
|
||||
|
||||
private static final class ServerReportEvent extends ProcedureEvent<ServerName> {
|
||||
public ServerReportEvent(final ServerName serverName) {
|
||||
super(serverName);
|
||||
}
|
||||
}
|
||||
|
||||
private final ServerReportEvent reportEvent;
|
||||
|
||||
private final Set<RegionStateNode> regions;
|
||||
private final ServerName serverName;
|
||||
|
||||
private volatile ServerState state = ServerState.ONLINE;
|
||||
|
||||
public ServerStateNode(final ServerName serverName) {
|
||||
this.serverName = serverName;
|
||||
this.regions = ConcurrentHashMap.newKeySet();
|
||||
this.reportEvent = new ServerReportEvent(serverName);
|
||||
}
|
||||
|
||||
public ServerName getServerName() {
|
||||
return serverName;
|
||||
}
|
||||
|
||||
public ServerState getState() {
|
||||
return state;
|
||||
}
|
||||
|
||||
public ProcedureEvent<?> getReportEvent() {
|
||||
return reportEvent;
|
||||
}
|
||||
|
||||
public boolean isInState(final ServerState... expected) {
|
||||
boolean expectedState = false;
|
||||
if (expected != null) {
|
||||
for (int i = 0; i < expected.length; ++i) {
|
||||
expectedState |= (state == expected[i]);
|
||||
}
|
||||
}
|
||||
return expectedState;
|
||||
}
|
||||
|
||||
void setState(final ServerState state) {
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
public Set<RegionStateNode> getRegions() {
|
||||
return regions;
|
||||
}
|
||||
|
||||
public int getRegionCount() {
|
||||
return regions.size();
|
||||
}
|
||||
|
||||
public ArrayList<RegionInfo> getRegionInfoList() {
|
||||
ArrayList<RegionInfo> hris = new ArrayList<RegionInfo>(regions.size());
|
||||
for (RegionStateNode region : regions) {
|
||||
hris.add(region.getRegionInfo());
|
||||
}
|
||||
return hris;
|
||||
}
|
||||
|
||||
public void addRegion(final RegionStateNode regionNode) {
|
||||
this.regions.add(regionNode);
|
||||
}
|
||||
|
||||
public void removeRegion(final RegionStateNode regionNode) {
|
||||
this.regions.remove(regionNode);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final ServerStateNode other) {
|
||||
return getServerName().compareTo(other.getServerName());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return getServerName().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
if (!(other instanceof ServerStateNode)) {
|
||||
return false;
|
||||
}
|
||||
return compareTo((ServerStateNode) other) == 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("ServerStateNode(%s)", getServerName());
|
||||
}
|
||||
}
|
|
@ -15,7 +15,6 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -32,6 +31,7 @@ import java.util.concurrent.ExecutorService;
|
|||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
@ -45,13 +45,11 @@ import org.apache.hadoop.hbase.client.MasterSwitchType;
|
|||
import org.apache.hadoop.hbase.client.Mutation;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||
import org.apache.hadoop.hbase.client.TableDescriptor;
|
||||
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
|
||||
import org.apache.hadoop.hbase.master.MasterCoprocessorHost;
|
||||
import org.apache.hadoop.hbase.master.MasterFileSystem;
|
||||
import org.apache.hadoop.hbase.master.RegionState.State;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.normalizer.NormalizationPlan;
|
||||
import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
|
@ -138,16 +136,6 @@ public class SplitTableRegionProcedure
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether there are recovered.edits in the parent closed region.
|
||||
* @param env master env
|
||||
* @throws IOException IOException
|
||||
*/
|
||||
static boolean hasRecoveredEdits(MasterProcedureEnv env, RegionInfo ri) throws IOException {
|
||||
return WALSplitter.hasRecoveredEdits(env.getMasterServices().getFileSystem(),
|
||||
env.getMasterConfiguration(), ri);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether the region is splittable
|
||||
* @param env MasterProcedureEnv
|
||||
|
@ -169,12 +157,13 @@ public class SplitTableRegionProcedure
|
|||
if (node != null) {
|
||||
try {
|
||||
if (bestSplitRow == null || bestSplitRow.length == 0) {
|
||||
LOG.info("splitKey isn't explicitly specified, " + " will try to find a best split key from RS");
|
||||
LOG
|
||||
.info("splitKey isn't explicitly specified, will try to find a best split key from RS");
|
||||
}
|
||||
// Always set bestSplitRow request as true here,
|
||||
// need to call Region#checkSplit to check it splittable or not
|
||||
GetRegionInfoResponse response =
|
||||
Util.getRegionInfoResponse(env, node.getRegionLocation(), node.getRegionInfo(), true);
|
||||
GetRegionInfoResponse response = AssignmentManagerUtil.getRegionInfoResponse(env,
|
||||
node.getRegionLocation(), node.getRegionInfo(), true);
|
||||
if(bestSplitRow == null || bestSplitRow.length == 0) {
|
||||
bestSplitRow = response.hasBestSplitRow() ? response.getBestSplitRow().toByteArray() : null;
|
||||
}
|
||||
|
@ -189,14 +178,17 @@ public class SplitTableRegionProcedure
|
|||
}
|
||||
|
||||
if (!splittable) {
|
||||
IOException e = new DoNotRetryIOException(regionToSplit.getShortNameToLog() + " NOT splittable");
|
||||
if (splittableCheckIOE != null) e.initCause(splittableCheckIOE);
|
||||
IOException e =
|
||||
new DoNotRetryIOException(regionToSplit.getShortNameToLog() + " NOT splittable");
|
||||
if (splittableCheckIOE != null) {
|
||||
e.initCause(splittableCheckIOE);
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
|
||||
if(bestSplitRow == null || bestSplitRow.length == 0) {
|
||||
throw new DoNotRetryIOException("Region not splittable because bestSplitPoint = null, "
|
||||
+ "maybe table is too small for auto split. For force split, try specifying split row");
|
||||
if (bestSplitRow == null || bestSplitRow.length == 0) {
|
||||
throw new DoNotRetryIOException("Region not splittable because bestSplitPoint = null, " +
|
||||
"maybe table is too small for auto split. For force split, try specifying split row");
|
||||
}
|
||||
|
||||
if (Bytes.equals(regionToSplit.getStartKey(), bestSplitRow)) {
|
||||
|
@ -205,9 +197,8 @@ public class SplitTableRegionProcedure
|
|||
}
|
||||
|
||||
if (!regionToSplit.containsRow(bestSplitRow)) {
|
||||
throw new DoNotRetryIOException(
|
||||
"Split row is not inside region key range splitKey:" + Bytes.toStringBinary(splitRow) +
|
||||
" region: " + regionToSplit);
|
||||
throw new DoNotRetryIOException("Split row is not inside region key range splitKey:" +
|
||||
Bytes.toStringBinary(splitRow) + " region: " + regionToSplit);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -228,8 +219,19 @@ public class SplitTableRegionProcedure
|
|||
return rid;
|
||||
}
|
||||
|
||||
private void removeNonDefaultReplicas(MasterProcedureEnv env) throws IOException {
|
||||
AssignmentManagerUtil.removeNonDefaultReplicas(env, Stream.of(getParentRegion()),
|
||||
getRegionReplication(env));
|
||||
}
|
||||
|
||||
private void checkClosedRegions(MasterProcedureEnv env) throws IOException {
|
||||
// theoretically this should not happen any more after we use TRSP, but anyway let's add a check
|
||||
// here
|
||||
AssignmentManagerUtil.checkClosedRegion(env, getParentRegion());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Flow executeFromState(final MasterProcedureEnv env, final SplitTableRegionState state)
|
||||
protected Flow executeFromState(MasterProcedureEnv env, SplitTableRegionState state)
|
||||
throws InterruptedException {
|
||||
LOG.trace("{} execute state={}", this, state);
|
||||
|
||||
|
@ -247,24 +249,15 @@ public class SplitTableRegionProcedure
|
|||
setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_CLOSE_PARENT_REGION);
|
||||
break;
|
||||
case SPLIT_TABLE_REGION_CLOSE_PARENT_REGION:
|
||||
addChildProcedure(createUnassignProcedures(env, getRegionReplication(env)));
|
||||
addChildProcedure(createUnassignProcedures(env));
|
||||
setNextState(SplitTableRegionState.SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS);
|
||||
break;
|
||||
case SPLIT_TABLE_REGIONS_CHECK_CLOSED_REGIONS:
|
||||
if (hasRecoveredEdits(env, getRegion())) {
|
||||
// If recovered edits, reopen parent region and then re-run the close by going back to
|
||||
// SPLIT_TABLE_REGION_CLOSE_PARENT_REGION. We might have to cycle here a few times
|
||||
// (TODO: Add being able to open a region in read-only mode). Open the primary replica
|
||||
// in this case only where we just want to pickup the left-out replicated.edits.
|
||||
LOG.info("Found recovered.edits under {}, reopen so we pickup these missed edits!",
|
||||
getRegion().getEncodedName());
|
||||
addChildProcedure(env.getAssignmentManager().createAssignProcedure(getParentRegion()));
|
||||
setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_CLOSE_PARENT_REGION);
|
||||
} else {
|
||||
setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS);
|
||||
}
|
||||
checkClosedRegions(env);
|
||||
setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS);
|
||||
break;
|
||||
case SPLIT_TABLE_REGION_CREATE_DAUGHTER_REGIONS:
|
||||
removeNonDefaultReplicas(env);
|
||||
createDaughterRegions(env);
|
||||
setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_WRITE_MAX_SEQUENCE_ID_FILE);
|
||||
break;
|
||||
|
@ -285,7 +278,7 @@ public class SplitTableRegionProcedure
|
|||
setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_OPEN_CHILD_REGIONS);
|
||||
break;
|
||||
case SPLIT_TABLE_REGION_OPEN_CHILD_REGIONS:
|
||||
addChildProcedure(createAssignProcedures(env, getRegionReplication(env)));
|
||||
addChildProcedure(createAssignProcedures(env));
|
||||
setNextState(SplitTableRegionState.SPLIT_TABLE_REGION_POST_OPERATION);
|
||||
break;
|
||||
case SPLIT_TABLE_REGION_POST_OPERATION:
|
||||
|
@ -544,24 +537,14 @@ public class SplitTableRegionProcedure
|
|||
|
||||
/**
|
||||
* Rollback close parent region
|
||||
* @param env MasterProcedureEnv
|
||||
*/
|
||||
private void openParentRegion(final MasterProcedureEnv env) throws IOException {
|
||||
// Check whether the region is closed; if so, open it in the same server
|
||||
final int regionReplication = getRegionReplication(env);
|
||||
final ServerName serverName = getParentRegionServerName(env);
|
||||
|
||||
final AssignProcedure[] procs = new AssignProcedure[regionReplication];
|
||||
for (int i = 0; i < regionReplication; ++i) {
|
||||
final RegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(getParentRegion(), i);
|
||||
procs[i] = env.getAssignmentManager().createAssignProcedure(hri, serverName);
|
||||
}
|
||||
env.getMasterServices().getMasterProcedureExecutor().submitProcedures(procs);
|
||||
private void openParentRegion(MasterProcedureEnv env) throws IOException {
|
||||
AssignmentManagerUtil.reopenRegionsForRollback(env, Stream.of(getParentRegion()),
|
||||
getRegionReplication(env), getParentRegionServerName(env));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create daughter regions
|
||||
* @param env MasterProcedureEnv
|
||||
*/
|
||||
@VisibleForTesting
|
||||
public void createDaughterRegions(final MasterProcedureEnv env) throws IOException {
|
||||
|
@ -818,35 +801,21 @@ public class SplitTableRegionProcedure
|
|||
}
|
||||
|
||||
private ServerName getParentRegionServerName(final MasterProcedureEnv env) {
|
||||
return env.getMasterServices().getAssignmentManager()
|
||||
.getRegionStates().getRegionServerOfRegion(getParentRegion());
|
||||
return env.getMasterServices().getAssignmentManager().getRegionStates()
|
||||
.getRegionServerOfRegion(getParentRegion());
|
||||
}
|
||||
|
||||
private UnassignProcedure[] createUnassignProcedures(final MasterProcedureEnv env,
|
||||
final int regionReplication) {
|
||||
final UnassignProcedure[] procs = new UnassignProcedure[regionReplication];
|
||||
for (int i = 0; i < procs.length; ++i) {
|
||||
final RegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(getParentRegion(), i);
|
||||
procs[i] = env.getAssignmentManager().
|
||||
createUnassignProcedure(hri, null, true, !RegionReplicaUtil.isDefaultReplica(hri));
|
||||
}
|
||||
return procs;
|
||||
private TransitRegionStateProcedure[] createUnassignProcedures(MasterProcedureEnv env)
|
||||
throws IOException {
|
||||
return AssignmentManagerUtil.createUnassignProceduresForSplitOrMerge(env,
|
||||
Stream.of(getParentRegion()), getRegionReplication(env));
|
||||
}
|
||||
|
||||
private AssignProcedure[] createAssignProcedures(final MasterProcedureEnv env,
|
||||
final int regionReplication) {
|
||||
final ServerName targetServer = getParentRegionServerName(env);
|
||||
final AssignProcedure[] procs = new AssignProcedure[regionReplication * 2];
|
||||
int procsIdx = 0;
|
||||
for (int i = 0; i < regionReplication; ++i) {
|
||||
final RegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(daughter_1_RI, i);
|
||||
procs[procsIdx++] = env.getAssignmentManager().createAssignProcedure(hri, targetServer);
|
||||
}
|
||||
for (int i = 0; i < regionReplication; ++i) {
|
||||
final RegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(daughter_2_RI, i);
|
||||
procs[procsIdx++] = env.getAssignmentManager().createAssignProcedure(hri, targetServer);
|
||||
}
|
||||
return procs;
|
||||
  /**
   * Creates the TRSPs that open the two daughter regions (all replicas), passing the parent
   * region's former server as the preferred assignment candidate so data locality is retained
   * where possible.
   */
  private TransitRegionStateProcedure[] createAssignProcedures(MasterProcedureEnv env)
      throws IOException {
    return AssignmentManagerUtil.createAssignProceduresForOpeningNewRegions(env,
      Stream.of(daughter_1_RI, daughter_2_RI), getRegionReplication(env),
      getParentRegionServerName(env));
  }
|
||||
|
||||
private int getRegionReplication(final MasterProcedureEnv env) throws IOException {
|
||||
|
|
|
@ -0,0 +1,569 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import edu.umd.cs.findbugs.annotations.Nullable;
|
||||
import java.io.IOException;
|
||||
import org.apache.hadoop.hbase.HBaseIOException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
|
||||
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
||||
import org.apache.hadoop.hbase.master.RegionState.State;
|
||||
import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionState;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionStateData;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||
|
||||
/**
 * The procedure to deal with the state transition of a region. A region with a TRSP in place is
 * called RIT, i.e, RegionInTransition.
 * <p/>
 * It can be used to assign/unassign/reopen/move a region, and for
 * {@link #unassign(MasterProcedureEnv, RegionInfo)} and
 * {@link #reopen(MasterProcedureEnv, RegionInfo)}, you do not need to specify a target server, and
 * for {@link #assign(MasterProcedureEnv, RegionInfo, ServerName)} and
 * {@link #move(MasterProcedureEnv, RegionInfo, ServerName)}, if you want to you can provide a
 * target server. And for {@link #move(MasterProcedureEnv, RegionInfo, ServerName)}, if you do not
 * specify a targetServer, we will select one randomly.
 * <p/>
 * The typical state transition for assigning a region is:
 *
 * <pre>
 * GET_ASSIGN_CANDIDATE ------> OPEN -----> CONFIRM_OPENED
 * </pre>
 *
 * Notice that, if there are failures we may go back to the {@code GET_ASSIGN_CANDIDATE} state to
 * try again.
 * <p/>
 * The typical state transition for unassigning a region is:
 *
 * <pre>
 * CLOSE -----> CONFIRM_CLOSED
 * </pre>
 *
 * Here things go a bit different, if there are failures, especially that if there is a server
 * crash, we will go to the {@code GET_ASSIGN_CANDIDATE} state to bring the region online first, and
 * then go through the normal way to unassign it.
 * <p/>
 * The typical state transition for reopening/moving a region is:
 *
 * <pre>
 * CLOSE -----> CONFIRM_CLOSED -----> GET_ASSIGN_CANDIDATE ------> OPEN -----> CONFIRM_OPENED
 * </pre>
 *
 * The retry logic is the same with the above assign/unassign.
 * <p/>
 * Notice that, although we allow specify a target server, it just acts as a candidate, we do not
 * guarantee that the region will finally be on the target server. If this is important for you, you
 * should check whether the region is on the target server after the procedure is finished.
 * <p/>
 * When you want to schedule a TRSP, please check whether there is still one for this region, and
 * the check should be under the RegionStateNode lock. We will remove the TRSP from a
 * RegionStateNode when we are done, see the code in {@code reportTransition} method below. There
 * could be at most one TRSP for a given region.
 */
@InterfaceAudience.Private
public class TransitRegionStateProcedure
    extends AbstractStateMachineRegionProcedure<RegionStateTransitionState> {

  private static final Logger LOG = LoggerFactory.getLogger(TransitRegionStateProcedure.class);

  // The first state to execute; also used in getProcedureMetrics/toStringClassDetails to tell
  // whether this TRSP was created as an assign (GET_ASSIGN_CANDIDATE) or a close-first operation.
  private RegionStateTransitionState initialState;

  // The state whose successful confirmation finishes this procedure
  // (CONFIRM_OPENED for assign/reopen/move, CONFIRM_CLOSED for unassign).
  private RegionStateTransitionState lastState;

  // the candidate where we want to assign the region to.
  private ServerName assignCandidate;

  // When true, ignore assignCandidate/lastHost and ask the AssignmentManager for a fresh plan.
  private boolean forceNewPlan;

  // Consecutive failure count driving the exponential backoff; reset to 0 whenever we make
  // progress (see confirmOpened/confirmClosed).
  private int attempt;

  // No-arg constructor; presumably required by the procedure framework when reloading from the
  // procedure store -- do not use directly.
  public TransitRegionStateProcedure() {
  }

  private TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri,
      ServerName assignCandidate, boolean forceNewPlan, RegionStateTransitionState initialState,
      RegionStateTransitionState lastState) {
    super(env, hri);
    this.assignCandidate = assignCandidate;
    this.forceNewPlan = forceNewPlan;
    this.initialState = initialState;
    this.lastState = lastState;
  }

  @Override
  public TableOperationType getTableOperationType() {
    // TODO: maybe we should make another type here, REGION_TRANSITION?
    return TableOperationType.REGION_EDIT;
  }

  @Override
  protected boolean waitInitialized(MasterProcedureEnv env) {
    if (TableName.isMetaTableName(getTableName())) {
      return false;
    }
    // First we need meta to be loaded, and second, if meta is not online then we will likely to
    // fail when updating meta so we wait until it is assigned.
    AssignmentManager am = env.getAssignmentManager();
    return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, getRegion());
  }

  /**
   * Picks a target server for the region (retained last host, the given candidate, or a fresh
   * plan) and hands the region to the AssignmentManager's assign queue, then suspends this
   * procedure until a plan is ready.
   */
  private void queueAssign(MasterProcedureEnv env, RegionStateNode regionNode)
      throws ProcedureSuspendedException {
    // Here the assumption is that, the region must be in CLOSED state, so the region location
    // will be null. And if we fail to open the region and retry here, the forceNewPlan will be
    // true, and also we will set the region location to null.
    boolean retain = false;
    if (!forceNewPlan) {
      if (assignCandidate != null) {
        retain = assignCandidate.equals(regionNode.getLastHost());
        regionNode.setRegionLocation(assignCandidate);
      } else if (regionNode.getLastHost() != null) {
        retain = true;
        LOG.info("Setting lastHost as the region location {}", regionNode.getLastHost());
        regionNode.setRegionLocation(regionNode.getLastHost());
      }
    }
    LOG.info("Starting {}; {}; forceNewPlan={}, retain={}", this, regionNode.toShortString(),
      forceNewPlan, retain);
    env.getAssignmentManager().queueAssign(regionNode);
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_OPEN);
    if (regionNode.getProcedureEvent().suspendIfNotReady(this)) {
      throw new ProcedureSuspendedException();
    }
  }

  /**
   * Dispatches an OpenRegionProcedure to the chosen server. If for some reason no location was
   * set, jumps back to GET_ASSIGN_CANDIDATE to get one.
   */
  private void openRegion(MasterProcedureEnv env, RegionStateNode regionNode) throws IOException {
    ServerName loc = regionNode.getRegionLocation();
    if (loc == null) {
      LOG.warn("No location specified for {}, jump back to state {} to get one", getRegion(),
        RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      return;
    }
    env.getAssignmentManager().regionOpening(regionNode);
    addChildProcedure(new OpenRegionProcedure(getRegion(), loc));
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED);
  }

  /**
   * Checks the outcome of the open attempt: finish if OPEN was the goal, continue to CLOSE if
   * this is a reopen-before-close flow, or retry (with backoff/new plan) on failure.
   */
  private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode)
      throws IOException {
    // notice that, for normal case, if we successfully opened a region, we will not arrive here, as
    // in reportTransition we will call unsetProcedure, and in executeFromState we will return
    // directly. But if the master is crashed before we finish the procedure, then next time we will
    // arrive here. So we still need to add code for normal cases.
    if (regionNode.isInState(State.OPEN)) {
      attempt = 0;
      if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED) {
        // we are the last state, finish
        regionNode.unsetProcedure(this);
        return Flow.NO_MORE_STATE;
      }
      // It is possible that we arrive here but confirm opened is not the last state, for example,
      // when merging or splitting a region, we unassign the region from a RS and the RS is crashed,
      // then there will be recovered edits for this region, we'd better make the region online
      // again and then unassign it, otherwise we have to fail the merge/split procedure as we may
      // lose data.
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
      return Flow.HAS_MORE_STATE;
    }

    if (incrementAndCheckMaxAttempts(env, regionNode)) {
      env.getAssignmentManager().regionFailedOpen(regionNode, true);
      setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
        "Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded"));
      regionNode.unsetProcedure(this);
      return Flow.NO_MORE_STATE;
    }
    env.getAssignmentManager().regionFailedOpen(regionNode, false);
    // we failed to assign the region, force a new plan
    forceNewPlan = true;
    regionNode.setRegionLocation(null);
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
    // Here we do not throw exception because we want to the region to be online ASAP
    return Flow.HAS_MORE_STATE;
  }

  /**
   * Dispatches a CloseRegionProcedure to the current hosting server. If the region is not in a
   * closeable state, falls back to assigning it first with a new plan.
   */
  private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) throws IOException {
    if (regionNode.isInState(State.OPEN, State.CLOSING, State.MERGING, State.SPLITTING)) {
      // this is the normal case
      env.getAssignmentManager().regionClosing(regionNode);
      addChildProcedure(
        new CloseRegionProcedure(getRegion(), regionNode.getRegionLocation(), assignCandidate));
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED);
    } else {
      forceNewPlan = true;
      regionNode.setRegionLocation(null);
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
    }
  }

  /**
   * Checks the outcome of the close attempt: finish if CLOSED was the goal, continue to
   * GET_ASSIGN_CANDIDATE for reopen/move, retry on a still-CLOSING region, or reopen an
   * ABNORMALLY_CLOSED default replica first so recovered edits are not lost.
   */
  private Flow confirmClosed(MasterProcedureEnv env, RegionStateNode regionNode)
      throws IOException {
    // notice that, for normal case, if we successfully closed a region, we will not arrive here, as
    // in reportTransition we will call unsetProcedure, and in executeFromState we will return
    // directly. But if the master is crashed before we finish the procedure, then next time we will
    // arrive here. So we still need to add code for normal cases.
    if (regionNode.isInState(State.CLOSED)) {
      attempt = 0;
      if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
        // we are the last state, finish
        regionNode.unsetProcedure(this);
        return Flow.NO_MORE_STATE;
      }
      // This means we need to open the region again, should be a move or reopen
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      return Flow.HAS_MORE_STATE;
    }
    if (regionNode.isInState(State.CLOSING)) {
      // This is possible, think the target RS crashes and restarts immediately, the close region
      // operation will return a NotServingRegionException soon, we can only recover after SCP takes
      // care of this RS. So here we throw an IOException to let upper layer to retry with backoff.
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
      throw new HBaseIOException("Failed to close region");
    }
    // abnormally closed, need to reopen it, no matter what is the last state, see the comment in
    // confirmOpened for more details that why we need to reopen the region first even if we just
    // want to close it.
    // The only exception is for non-default replica, where we do not need to deal with recovered
    // edits. Notice that the region will remain in ABNORMALLY_CLOSED state, the upper layer need to
    // deal with this state. For non-default replica, this is usually the same with CLOSED.
    assert regionNode.isInState(State.ABNORMALLY_CLOSED);
    if (!RegionReplicaUtil.isDefaultReplica(getRegion()) &&
      lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
      regionNode.unsetProcedure(this);
      return Flow.NO_MORE_STATE;
    }
    attempt = 0;
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
    return Flow.HAS_MORE_STATE;
  }

  // Override to lock RegionStateNode
  @SuppressWarnings("rawtypes")
  @Override
  protected Procedure[] execute(MasterProcedureEnv env)
      throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    RegionStateNode regionNode =
      env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
    regionNode.lock();
    try {
      return super.execute(env);
    } finally {
      regionNode.unlock();
    }
  }

  private RegionStateNode getRegionStateNode(MasterProcedureEnv env) {
    return env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
  }

  @Override
  protected Flow executeFromState(MasterProcedureEnv env, RegionStateTransitionState state)
      throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    RegionStateNode regionNode = getRegionStateNode(env);
    if (regionNode.getProcedure() != this) {
      // This is possible, and is the normal case, as we will call unsetProcedure in
      // reportTransition, which means we are already done.
      // This is because that, when we mark the region as OPENED or CLOSED, then all the works
      // should have already been done, and logically we could have another TRSP scheduled for this
      // region immediately(think of a RS crash at the point...).
      return Flow.NO_MORE_STATE;
    }
    try {
      switch (state) {
        case REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE:
          queueAssign(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_OPEN:
          openRegion(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_CONFIRM_OPENED:
          return confirmOpened(env, regionNode);
        case REGION_STATE_TRANSITION_CLOSE:
          closeRegion(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_CONFIRM_CLOSED:
          return confirmClosed(env, regionNode);
        default:
          throw new UnsupportedOperationException("unhandled state=" + state);
      }
    } catch (IOException e) {
      // Any IOException from the per-state handlers is retried here with exponential backoff,
      // suspending on the region's procedure event until the timeout fires or we are woken.
      long backoff = getBackoffTime(this.attempt++);
      LOG.warn(
        "Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed " +
          "by other Procedure or operator intervention",
        backoff / 1000, this, regionNode.toShortString(), e);
      regionNode.getProcedureEvent().suspend();
      if (regionNode.getProcedureEvent().suspendIfNotReady(this)) {
        setTimeout(Math.toIntExact(backoff));
        setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
        throw new ProcedureSuspendedException();
      }
      return Flow.HAS_MORE_STATE;
    }
  }

  /**
   * At end of timeout, wake ourselves up so we run again.
   */
  @Override
  protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
    setState(ProcedureProtos.ProcedureState.RUNNABLE);
    getRegionStateNode(env).getProcedureEvent().wake(env.getProcedureScheduler());
    return false; // 'false' means that this procedure handled the timeout
  }

  /**
   * Handles an OPENED/FAILED_OPEN report from the region server while we are in CONFIRM_OPENED:
   * records the open seqNum, marks the region open (detaching this procedure if we are done), and
   * wakes the procedure; any other code is rejected as unexpected.
   */
  private void reportTransitionOpened(MasterProcedureEnv env, RegionStateNode regionNode,
      ServerName serverName, TransitionCode code, long openSeqNum) throws IOException {
    switch (code) {
      case OPENED:
        if (openSeqNum < 0) {
          throw new UnexpectedStateException("Received report unexpected " + code +
            " transition openSeqNum=" + openSeqNum + ", " + regionNode);
        }
        if (openSeqNum <= regionNode.getOpenSeqNum()) {
          if (openSeqNum != 0) {
            LOG.warn("Skip update of openSeqNum for {} with {} because the currentSeqNum={}",
              regionNode, openSeqNum, regionNode.getOpenSeqNum());
          }
        } else {
          regionNode.setOpenSeqNum(openSeqNum);
        }
        env.getAssignmentManager().regionOpened(regionNode);
        if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED) {
          // we are done
          regionNode.unsetProcedure(this);
        }
        regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
        break;
      case FAILED_OPEN:
        // just wake up the procedure and see if we can retry
        regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
        break;
      default:
        throw new UnexpectedStateException(
          "Received report unexpected " + code + " transition openSeqNum=" + openSeqNum + ", " +
            regionNode.toShortString() + ", " + this + ", expected OPENED or FAILED_OPEN.");
    }
  }

  // we do not need seqId for closing a region
  private void reportTransitionClosed(MasterProcedureEnv env, RegionStateNode regionNode,
      ServerName serverName, TransitionCode code) throws IOException {
    switch (code) {
      case CLOSED:
        env.getAssignmentManager().regionClosed(regionNode, true);
        if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
          // we are done
          regionNode.unsetProcedure(this);
        }
        regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
        break;
      default:
        throw new UnexpectedStateException("Received report unexpected " + code + " transition, " +
          regionNode.toShortString() + ", " + this + ", expected CLOSED.");
    }
  }

  // Should be called with RegionStateNode locked
  public void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode,
      ServerName serverName, TransitionCode code, long seqId) throws IOException {
    // Dispatch the RS-side report to the handler matching our current confirm state; reports
    // arriving in any other state are logged and ignored.
    switch (getCurrentState()) {
      case REGION_STATE_TRANSITION_CONFIRM_OPENED:
        reportTransitionOpened(env, regionNode, serverName, code, seqId);
        break;
      case REGION_STATE_TRANSITION_CONFIRM_CLOSED:
        reportTransitionClosed(env, regionNode, serverName, code);
        break;
      default:
        LOG.warn("{} received unexpected report transition call from {}, code={}, seqId={}", this,
          serverName, code, seqId);
    }
  }

  // Should be called with RegionStateNode locked
  public void serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode,
      ServerName serverName) throws IOException {
    // Notice that, in this method, we do not change the procedure state, instead, we update the
    // region state in hbase:meta. This is because that, the procedure state change will not be
    // persisted until the region is woken up and finish one step, if we crash before that then the
    // information will be lost. So here we will update the region state in hbase:meta, and when the
    // procedure is woken up, it will process the error and jump to the correct procedure state.
    RegionStateTransitionState currentState = getCurrentState();
    switch (currentState) {
      case REGION_STATE_TRANSITION_CLOSE:
      case REGION_STATE_TRANSITION_CONFIRM_CLOSED:
      case REGION_STATE_TRANSITION_CONFIRM_OPENED:
        // for these 3 states, the region may still be online on the crashed server
        if (serverName.equals(regionNode.getRegionLocation())) {
          env.getAssignmentManager().regionClosed(regionNode, false);
          if (currentState != RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE) {
            regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
          }
        }
        break;
      default:
        // If the procedure is in other 2 states, then actually we should not arrive here, as we
        // know that the region is not online on any server, so we need to do nothing... But anyway
        // let's add a log here
        LOG.warn("{} received unexpected server crash call for region {} from {}", this, regionNode,
          serverName);

    }
  }

  /**
   * Exponential backoff for retrying: 1s * 2^attempts, capped at one hour.
   */
  private long getBackoffTime(int attempts) {
    long backoffTime = (long) (1000 * Math.pow(2, attempts));
    long maxBackoffTime = 60 * 60 * 1000; // An hour. Hard-coded for now.
    return backoffTime < maxBackoffTime ? backoffTime : maxBackoffTime;
  }

  /**
   * Bumps the persistent failed-open retry counter for this region and returns true when the
   * configured max number of assign attempts has been reached.
   */
  private boolean incrementAndCheckMaxAttempts(MasterProcedureEnv env, RegionStateNode regionNode) {
    int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
      .incrementAndGetRetries();
    int max = env.getAssignmentManager().getAssignMaxAttempts();
    LOG.info(
      "Retry=" + retries + " of max=" + max + "; " + this + "; " + regionNode.toShortString());
    return retries >= max;
  }

  @Override
  protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state)
      throws IOException, InterruptedException {
    // no rollback
    throw new UnsupportedOperationException();
  }

  @Override
  protected RegionStateTransitionState getState(int stateId) {
    return RegionStateTransitionState.forNumber(stateId);
  }

  @Override
  protected int getStateId(RegionStateTransitionState state) {
    return state.getNumber();
  }

  @Override
  protected RegionStateTransitionState getInitialState() {
    return initialState;
  }

  @Override
  protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
    super.serializeStateData(serializer);
    RegionStateTransitionStateData.Builder builder = RegionStateTransitionStateData.newBuilder()
      .setInitialState(initialState).setLastState(lastState).setForceNewPlan(forceNewPlan);
    if (assignCandidate != null) {
      builder.setAssignCandidate(ProtobufUtil.toServerName(assignCandidate));
    }
    serializer.serialize(builder.build());
  }

  @Override
  protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
    super.deserializeStateData(serializer);
    RegionStateTransitionStateData data =
      serializer.deserialize(RegionStateTransitionStateData.class);
    initialState = data.getInitialState();
    lastState = data.getLastState();
    forceNewPlan = data.getForceNewPlan();
    if (data.hasAssignCandidate()) {
      assignCandidate = ProtobufUtil.toServerName(data.getAssignCandidate());
    }
  }

  @Override
  protected ProcedureMetrics getProcedureMetrics(MasterProcedureEnv env) {
    // TODO: need to reimplement the metrics system for assign/unassign
    if (initialState == RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE) {
      return env.getAssignmentManager().getAssignmentManagerMetrics().getAssignProcMetrics();
    } else {
      return env.getAssignmentManager().getAssignmentManagerMetrics().getUnassignProcMetrics();
    }
  }

  @Override
  public void toStringClassDetails(StringBuilder sb) {
    super.toStringClassDetails(sb);
    if (initialState == RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE) {
      sb.append(", ASSIGN");
    } else if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
      sb.append(", UNASSIGN");
    } else {
      sb.append(", REOPEN/MOVE");
    }
  }

  // Helper shared by the static factories: stamp the procedure with the requesting user.
  private static TransitRegionStateProcedure setOwner(MasterProcedureEnv env,
      TransitRegionStateProcedure proc) {
    proc.setOwner(env.getRequestUser().getShortName());
    return proc;
  }

  // Be careful that, when you call these 4 methods below, you need to manually attach the returned
  // procedure with the RegionStateNode, otherwise the procedure will quit immediately without doing
  // anything. See the comment in executeFromState to find out why we need this assumption.
  public static TransitRegionStateProcedure assign(MasterProcedureEnv env, RegionInfo region,
      @Nullable ServerName targetServer) {
    return setOwner(env,
      new TransitRegionStateProcedure(env, region, targetServer, false,
        RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE,
        RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED));
  }

  public static TransitRegionStateProcedure unassign(MasterProcedureEnv env, RegionInfo region) {
    return setOwner(env,
      new TransitRegionStateProcedure(env, region, null, false,
        RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE,
        RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED));
  }

  public static TransitRegionStateProcedure reopen(MasterProcedureEnv env, RegionInfo region) {
    return setOwner(env,
      new TransitRegionStateProcedure(env, region, null, false,
        RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE,
        RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED));
  }

  public static TransitRegionStateProcedure move(MasterProcedureEnv env, RegionInfo region,
      @Nullable ServerName targetServer) {
    return setOwner(env,
      new TransitRegionStateProcedure(env, region, targetServer, targetServer == null,
        RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE,
        RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED));
  }
}
|
|
@ -20,104 +20,38 @@
|
|||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.NotServingRegionException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
||||
import org.apache.hadoop.hbase.favored.FavoredNodesManager;
|
||||
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
|
||||
import org.apache.hadoop.hbase.master.RegionState;
|
||||
import org.apache.hadoop.hbase.master.RegionState.State;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionCloseOperation;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation;
|
||||
import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException;
|
||||
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.UnassignRegionStateData;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||
|
||||
/**
|
||||
* Procedure that describes the unassignment of a single region.
|
||||
* There can only be one RegionTransitionProcedure -- i.e. an assign or an unassign -- per region
|
||||
* running at a time, since each procedure takes a lock on the region.
|
||||
*
|
||||
* <p>The Unassign starts by placing a "close region" request in the Remote Dispatcher
|
||||
* queue, and the procedure will then go into a "waiting state" (suspend).
|
||||
* The Remote Dispatcher will batch the various requests for that server and
|
||||
* they will be sent to the RS for execution.
|
||||
* The RS will complete the open operation by calling master.reportRegionStateTransition().
|
||||
* The AM will intercept the transition report, and notify this procedure.
|
||||
* The procedure will wakeup and finish the unassign by publishing its new state on meta.
|
||||
* <p>If we are unable to contact the remote regionserver whether because of ConnectException
|
||||
* or socket timeout, we will call expire on the server we were trying to contact. We will remain
|
||||
* in suspended state waiting for a wake up from the ServerCrashProcedure that is processing the
|
||||
* failed server. The basic idea is that if we notice a crashed server, then we have a
|
||||
* responsibility; i.e. we should not let go of the region until we are sure the server that was
|
||||
* hosting has had its crash processed. If we let go of the region before then, an assign might
|
||||
* run before the logs have been split which would make for data loss.
|
||||
*
|
||||
* <p>TODO: Rather than this tricky coordination between SCP and this Procedure, instead, work on
|
||||
* returning a SCP as our subprocedure; probably needs work on the framework to do this,
|
||||
* especially if the SCP already created.
|
||||
* Leave here only for checking if we can successfully start the master.
|
||||
* @deprecated Do not use any more.
|
||||
* @see TransitRegionStateProcedure
|
||||
*/
|
||||
@Deprecated
|
||||
@InterfaceAudience.Private
|
||||
public class UnassignProcedure extends RegionTransitionProcedure {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(UnassignProcedure.class);
|
||||
|
||||
/**
|
||||
* Where to send the unassign RPC.
|
||||
*/
|
||||
protected volatile ServerName hostingServer;
|
||||
/**
|
||||
* The Server we will subsequently assign the region too (can be null).
|
||||
*/
|
||||
|
||||
protected volatile ServerName destinationServer;
|
||||
|
||||
// TODO: should this be in a reassign procedure?
|
||||
// ...and keep unassign for 'disable' case?
|
||||
private boolean force;
|
||||
|
||||
/**
|
||||
* Whether deleting the region from in-memory states after unassigning the region.
|
||||
*/
|
||||
private boolean removeAfterUnassigning;
|
||||
|
||||
public UnassignProcedure() {
|
||||
// Required by the Procedure framework to create the procedure on replay
|
||||
super();
|
||||
}
|
||||
|
||||
public UnassignProcedure(final RegionInfo regionInfo, final ServerName hostingServer,
|
||||
final boolean force, final boolean removeAfterUnassigning) {
|
||||
this(regionInfo, hostingServer, null, force, removeAfterUnassigning);
|
||||
}
|
||||
|
||||
public UnassignProcedure(final RegionInfo regionInfo,
|
||||
final ServerName hostingServer, final ServerName destinationServer, final boolean force) {
|
||||
this(regionInfo, hostingServer, destinationServer, force, false);
|
||||
}
|
||||
|
||||
public UnassignProcedure(final RegionInfo regionInfo, final ServerName hostingServer,
|
||||
final ServerName destinationServer, final boolean force,
|
||||
final boolean removeAfterUnassigning) {
|
||||
super(regionInfo);
|
||||
this.hostingServer = hostingServer;
|
||||
this.destinationServer = destinationServer;
|
||||
this.force = force;
|
||||
this.removeAfterUnassigning = removeAfterUnassigning;
|
||||
setTransitionState(RegionTransitionState.REGION_TRANSITION_QUEUE);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -137,10 +71,9 @@ public class UnassignProcedure extends RegionTransitionProcedure {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void serializeStateData(ProcedureStateSerializer serializer)
|
||||
throws IOException {
|
||||
UnassignRegionStateData.Builder state = UnassignRegionStateData.newBuilder()
|
||||
.setTransitionState(getTransitionState())
|
||||
protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
UnassignRegionStateData.Builder state =
|
||||
UnassignRegionStateData.newBuilder().setTransitionState(getTransitionState())
|
||||
.setHostingServer(ProtobufUtil.toServerName(this.hostingServer))
|
||||
.setRegionInfo(ProtobufUtil.toRegionInfo(getRegionInfo()));
|
||||
if (this.destinationServer != null) {
|
||||
|
@ -159,10 +92,8 @@ public class UnassignProcedure extends RegionTransitionProcedure {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void deserializeStateData(ProcedureStateSerializer serializer)
|
||||
throws IOException {
|
||||
final UnassignRegionStateData state =
|
||||
serializer.deserialize(UnassignRegionStateData.class);
|
||||
protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
final UnassignRegionStateData state = serializer.deserialize(UnassignRegionStateData.class);
|
||||
setTransitionState(state.getTransitionState());
|
||||
setRegionInfo(ProtobufUtil.toRegionInfo(state.getRegionInfo()));
|
||||
this.hostingServer = ProtobufUtil.toServerName(state.getHostingServer());
|
||||
|
@ -177,73 +108,27 @@ public class UnassignProcedure extends RegionTransitionProcedure {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected boolean startTransition(final MasterProcedureEnv env, final RegionStateNode regionNode) {
|
||||
// Check region is actually unassignable now we have lock on it. If not skirt to end.
|
||||
// It could have had its status changed on us post construction... perhaps a split removed
|
||||
// the region we are to unassign (a split and a move happening near-concurrently).
|
||||
// Don't overcheck. A region is set to have a SPLITTING state if it is the parent and it is
|
||||
// being split. Regions that are in this RSN state are unassignable. Regions that are SPLIT
|
||||
// are not.
|
||||
RegionStates regionStates = env.getAssignmentManager().getRegionStates();
|
||||
RegionState rs = regionStates.getRegionState(regionNode.getRegionInfo());
|
||||
// Don't try unassigning regions that are closed or split. RSN state could have been set
|
||||
// after our creation but before we got the region lock.
|
||||
if (rs.isClosing() || rs.isClosed() || rs.isSplit() || rs.isMerged()) {
|
||||
LOG.info("NOT unassignable {}, skipping {}", rs, this);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
protected boolean startTransition(final MasterProcedureEnv env,
|
||||
final RegionStateNode regionNode) {
|
||||
// nothing to do here. we skip the step in the constructor
|
||||
// by jumping to REGION_TRANSITION_DISPATCH
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean updateTransition(final MasterProcedureEnv env, final RegionStateNode regionNode)
|
||||
throws IOException {
|
||||
// if the region is already closed or offline we can't do much...
|
||||
if (regionNode.isInState(State.CLOSED, State.OFFLINE)) {
|
||||
LOG.info("Not unassigned " + this + "; " + regionNode.toShortString());
|
||||
return false;
|
||||
}
|
||||
|
||||
// if we haven't started the operation yet, we can abort
|
||||
if (aborted.get() && regionNode.isInState(State.OPEN)) {
|
||||
setAbortFailure(getClass().getSimpleName(), "abort requested");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Mark the region as CLOSING.
|
||||
env.getAssignmentManager().markRegionAsClosing(regionNode);
|
||||
|
||||
// Add the close region operation to the server dispatch queue.
|
||||
if (!addToRemoteDispatcher(env, regionNode.getRegionLocation())) {
|
||||
// If addToRemoteDispatcher fails, it calls the callback #remoteCallFailed.
|
||||
}
|
||||
|
||||
// Return true to keep the procedure running.
|
||||
throws IOException {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void finishTransition(final MasterProcedureEnv env, final RegionStateNode regionNode)
|
||||
throws IOException {
|
||||
AssignmentManager am = env.getAssignmentManager();
|
||||
RegionInfo regionInfo = getRegionInfo();
|
||||
|
||||
if (!removeAfterUnassigning) {
|
||||
am.markRegionAsClosed(regionNode);
|
||||
} else {
|
||||
// Remove from in-memory states
|
||||
am.getRegionStates().deleteRegion(regionInfo);
|
||||
env.getMasterServices().getServerManager().removeRegion(regionInfo);
|
||||
FavoredNodesManager fnm = env.getMasterServices().getFavoredNodesManager();
|
||||
if (fnm != null) {
|
||||
fnm.deleteFavoredNodesForRegions(Lists.newArrayList(regionInfo));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public RemoteOperation remoteCallBuild(final MasterProcedureEnv env, final ServerName serverName) {
|
||||
public RemoteOperation remoteCallBuild(final MasterProcedureEnv env,
|
||||
final ServerName serverName) {
|
||||
assert serverName.equals(getRegionState(env).getRegionLocation());
|
||||
return new RegionCloseOperation(this, getRegionInfo(), this.destinationServer);
|
||||
}
|
||||
|
@ -251,48 +136,6 @@ public class UnassignProcedure extends RegionTransitionProcedure {
|
|||
@Override
|
||||
protected void reportTransition(final MasterProcedureEnv env, final RegionStateNode regionNode,
|
||||
final TransitionCode code, final long seqId) throws UnexpectedStateException {
|
||||
switch (code) {
|
||||
case CLOSED:
|
||||
setTransitionState(RegionTransitionState.REGION_TRANSITION_FINISH);
|
||||
break;
|
||||
default:
|
||||
throw new UnexpectedStateException(String.format(
|
||||
"Received report unexpected transition state=%s for region=%s server=%s, expected CLOSED.",
|
||||
code, regionNode.getRegionInfo(), regionNode.getRegionLocation()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Our remote call failed but there are a few states where it is safe to proceed with the
|
||||
* unassign; e.g. if a server crash and it has had all of its WALs processed, then we can allow
|
||||
* this unassign to go to completion.
|
||||
* @return True if it is safe to proceed with the unassign.
|
||||
*/
|
||||
private boolean isSafeToProceed(final MasterProcedureEnv env, final RegionStateNode regionNode,
|
||||
final IOException exception) {
|
||||
if (exception instanceof ServerCrashException) {
|
||||
// This exception comes from ServerCrashProcedure AFTER log splitting. Its a signaling
|
||||
// exception. SCP found this region as a RIT during its processing of the crash. Its call
|
||||
// into here says it is ok to let this procedure go complete.
|
||||
return true;
|
||||
}
|
||||
if (exception instanceof NotServingRegionException) {
|
||||
LOG.warn("IS OK? ANY LOGS TO REPLAY; ACTING AS THOUGH ALL GOOD {}", regionNode, exception);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set it up so when procedure is unsuspended, we'll move to the procedure finish.
|
||||
*/
|
||||
protected void proceed(final MasterProcedureEnv env, final RegionStateNode regionNode) {
|
||||
try {
|
||||
reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM);
|
||||
} catch (UnexpectedStateException e) {
|
||||
// Should never happen.
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -301,61 +144,6 @@ public class UnassignProcedure extends RegionTransitionProcedure {
|
|||
@Override
|
||||
protected boolean remoteCallFailed(final MasterProcedureEnv env, final RegionStateNode regionNode,
|
||||
final IOException exception) {
|
||||
// Be careful reading the below; we do returns in middle of the method a few times.
|
||||
if (isSafeToProceed(env, regionNode, exception)) {
|
||||
proceed(env, regionNode);
|
||||
} else if (exception instanceof RegionServerAbortedException ||
|
||||
exception instanceof RegionServerStoppedException) {
|
||||
// RS is aborting/stopping, we cannot offline the region since the region may need to do WAL
|
||||
// recovery. Until we see the RS expiration, stay suspended; return false.
|
||||
LOG.info("Ignoring; waiting on ServerCrashProcedure", exception);
|
||||
return false;
|
||||
} else if (exception instanceof ServerNotRunningYetException) {
|
||||
// This should not happen. If it does, procedure will be woken-up and we'll retry.
|
||||
// TODO: Needs a pause and backoff?
|
||||
LOG.info("Retry", exception);
|
||||
} else {
|
||||
// We failed to RPC this server. Set it as expired.
|
||||
ServerName serverName = regionNode.getRegionLocation();
|
||||
LOG.warn("Expiring {}, {} {}; exception={}", serverName, this, regionNode.toShortString(),
|
||||
exception.getClass().getSimpleName());
|
||||
if (!env.getMasterServices().getServerManager().expireServer(serverName)) {
|
||||
// Failed to queue an expire. Lots of possible reasons including it may be already expired.
|
||||
// In ServerCrashProcedure and RecoverMetaProcedure, there is a handleRIT stage where we
|
||||
// will iterator over all the RIT procedures for the related regions of a crashed RS and
|
||||
// fail them with ServerCrashException. You can see the isSafeToProceed method above for
|
||||
// more details.
|
||||
// This can work for most cases, but since we do not hold the region lock in handleRIT,
|
||||
// there could be race that we arrive here after the handleRIT stage of the SCP. So here we
|
||||
// need to check whether it is safe to quit.
|
||||
// Notice that, the first assumption is that we can only quit after the log splitting is
|
||||
// done, as MRP can schedule an AssignProcedure right after us, and if the log splitting has
|
||||
// not been done then there will be data loss. And in SCP, we will change the state from
|
||||
// SPLITTING to OFFLINE(or SPLITTING_META_DONE for meta log processing) after finishing the
|
||||
// log splitting, and then calling handleRIT, so checking the state here can be a safe
|
||||
// fence. If the state is not OFFLINE(or SPLITTING_META_DONE), then we can just leave this
|
||||
// procedure in suspended state as we can make sure that the handleRIT has not been executed
|
||||
// yet and it will wake us up later. And if the state is OFFLINE(or SPLITTING_META_DONE), we
|
||||
// can safely quit since there will be no data loss. There could be duplicated
|
||||
// AssignProcedures for the same region but it is OK as we will do a check at the beginning
|
||||
// of AssignProcedure to prevent double assign. And there we have region lock so there will
|
||||
// be no race.
|
||||
if (env.getAssignmentManager().isLogSplittingDone(serverName, isMeta())) {
|
||||
// Its ok to proceed with this unassign.
|
||||
LOG.info("{} is dead and processed; moving procedure to finished state; {}", serverName,
|
||||
this);
|
||||
proceed(env, regionNode);
|
||||
// Return true; wake up the procedure so we can act on proceed.
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// Return false so this procedure stays in suspended state. It will be woken up by the
|
||||
// ServerCrashProcedure that was scheduled when we called #expireServer above. SCP calls
|
||||
// #handleRIT which will call this method only the exception will be a ServerCrashException
|
||||
// this time around (See above).
|
||||
// TODO: Add a SCP as a new subprocedure that we now come to depend on.
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -365,11 +153,6 @@ public class UnassignProcedure extends RegionTransitionProcedure {
|
|||
sb.append(", server=").append(this.hostingServer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ServerName getServer(final MasterProcedureEnv env) {
|
||||
return this.hostingServer;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ProcedureMetrics getProcedureMetrics(MasterProcedureEnv env) {
|
||||
return env.getAssignmentManager().getAssignmentManagerMetrics().getUnassignProcMetrics();
|
||||
|
|
|
@ -1,72 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.ipc.HBaseRpcController;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminService;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.GetRegionInfoRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.GetRegionInfoResponse;
|
||||
|
||||
/**
|
||||
* Utility for this assignment package only.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
class Util {
|
||||
private Util() {}
|
||||
|
||||
/**
|
||||
* Raw call to remote regionserver to get info on a particular region.
|
||||
* @throws IOException Let it out so can report this IOE as reason for failure
|
||||
*/
|
||||
static GetRegionInfoResponse getRegionInfoResponse(final MasterProcedureEnv env,
|
||||
final ServerName regionLocation, final RegionInfo hri)
|
||||
throws IOException {
|
||||
return getRegionInfoResponse(env, regionLocation, hri, false);
|
||||
}
|
||||
|
||||
static GetRegionInfoResponse getRegionInfoResponse(final MasterProcedureEnv env,
|
||||
final ServerName regionLocation, final RegionInfo hri, boolean includeBestSplitRow)
|
||||
throws IOException {
|
||||
// TODO: There is no timeout on this controller. Set one!
|
||||
HBaseRpcController controller = env.getMasterServices().getClusterConnection().
|
||||
getRpcControllerFactory().newController();
|
||||
final AdminService.BlockingInterface admin =
|
||||
env.getMasterServices().getClusterConnection().getAdmin(regionLocation);
|
||||
GetRegionInfoRequest request = null;
|
||||
if (includeBestSplitRow) {
|
||||
request = RequestConverter.buildGetRegionInfoRequest(hri.getRegionName(), false, true);
|
||||
} else {
|
||||
request = RequestConverter.buildGetRegionInfoRequest(hri.getRegionName());
|
||||
}
|
||||
try {
|
||||
return admin.getRegionInfo(controller, request);
|
||||
} catch (ServiceException e) {
|
||||
throw ProtobufUtil.handleRemoteException(e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -40,13 +40,12 @@ public abstract class AbstractStateMachineRegionProcedure<TState>
|
|||
extends AbstractStateMachineTableProcedure<TState> {
|
||||
private RegionInfo hri;
|
||||
|
||||
public AbstractStateMachineRegionProcedure(final MasterProcedureEnv env,
|
||||
final RegionInfo hri) {
|
||||
protected AbstractStateMachineRegionProcedure(MasterProcedureEnv env, RegionInfo hri) {
|
||||
super(env);
|
||||
this.hri = hri;
|
||||
}
|
||||
|
||||
public AbstractStateMachineRegionProcedure() {
|
||||
protected AbstractStateMachineRegionProcedure() {
|
||||
// Required by the Procedure framework to create the procedure on replay
|
||||
super();
|
||||
}
|
||||
|
@ -54,7 +53,7 @@ public abstract class AbstractStateMachineRegionProcedure<TState>
|
|||
/**
|
||||
* @return The RegionInfo of the region we are operating on.
|
||||
*/
|
||||
protected RegionInfo getRegion() {
|
||||
public RegionInfo getRegion() {
|
||||
return this.hri;
|
||||
}
|
||||
|
||||
|
|
|
@ -28,12 +28,10 @@ import org.apache.hadoop.hbase.TableNotFoundException;
|
|||
import org.apache.hadoop.hbase.UnknownRegionException;
|
||||
import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionOfflineException;
|
||||
import org.apache.hadoop.hbase.client.TableState;
|
||||
import org.apache.hadoop.hbase.master.MasterServices;
|
||||
import org.apache.hadoop.hbase.master.RegionState;
|
||||
import org.apache.hadoop.hbase.master.TableStateManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
|
||||
import org.apache.hadoop.hbase.security.User;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
|
@ -180,26 +178,13 @@ public abstract class AbstractStateMachineTableProcedure<TState>
|
|||
/**
|
||||
* Check region is online.
|
||||
*/
|
||||
protected static void checkOnline(MasterProcedureEnv env, final RegionInfo ri)
|
||||
protected static void checkOnline(MasterProcedureEnv env, RegionInfo ri)
|
||||
throws DoNotRetryRegionException {
|
||||
RegionStates regionStates = env.getAssignmentManager().getRegionStates();
|
||||
RegionState rs = regionStates.getRegionState(ri);
|
||||
if (rs == null) {
|
||||
RegionStateNode regionNode =
|
||||
env.getAssignmentManager().getRegionStates().getRegionStateNode(ri);
|
||||
if (regionNode == null) {
|
||||
throw new UnknownRegionException("No RegionState found for " + ri.getEncodedName());
|
||||
}
|
||||
if (!rs.isOpened()) {
|
||||
throw new DoNotRetryRegionException(ri.getEncodedName() + " is not OPEN; regionState=" + rs);
|
||||
}
|
||||
if (ri.isSplitParent()) {
|
||||
throw new DoNotRetryRegionException(ri.getEncodedName() +
|
||||
" is not online (splitParent=true)");
|
||||
}
|
||||
if (ri.isSplit()) {
|
||||
throw new DoNotRetryRegionException(ri.getEncodedName() + " has split=true");
|
||||
}
|
||||
if (ri.isOffline()) {
|
||||
// RegionOfflineException is not instance of DNRIOE so wrap it.
|
||||
throw new DoNotRetryRegionException(new RegionOfflineException(ri.getEncodedName()));
|
||||
}
|
||||
regionNode.checkOnline();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -166,7 +166,7 @@ public class CreateTableProcedure
|
|||
|
||||
@Override
|
||||
protected CreateTableState getState(final int stateId) {
|
||||
return CreateTableState.valueOf(stateId);
|
||||
return CreateTableState.forNumber(stateId);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -177,7 +177,7 @@ public class DeleteTableProcedure
|
|||
|
||||
@Override
|
||||
protected DeleteTableState getState(final int stateId) {
|
||||
return DeleteTableState.valueOf(stateId);
|
||||
return DeleteTableState.forNumber(stateId);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -190,6 +190,11 @@ public class DeleteTableProcedure
|
|||
return DeleteTableState.DELETE_TABLE_PRE_OPERATION;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean holdLock(MasterProcedureEnv env) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TableName getTableName() {
|
||||
return tableName;
|
||||
|
@ -297,7 +302,9 @@ public class DeleteTableProcedure
|
|||
FileStatus[] files = fs.listStatus(tempdir);
|
||||
if (files != null && files.length > 0) {
|
||||
for (int i = 0; i < files.length; ++i) {
|
||||
if (!files[i].isDir()) continue;
|
||||
if (!files[i].isDirectory()) {
|
||||
continue;
|
||||
}
|
||||
HFileArchiver.archiveRegion(fs, mfs.getRootDir(), tempTableDir, files[i].getPath());
|
||||
}
|
||||
}
|
||||
|
@ -343,7 +350,6 @@ public class DeleteTableProcedure
|
|||
* There may be items for this table still up in hbase:meta in the case where the
|
||||
* info:regioninfo column was empty because of some write error. Remove ALL rows from hbase:meta
|
||||
* that have to do with this table. See HBASE-12980.
|
||||
* @throws IOException
|
||||
*/
|
||||
private static void cleanAnyRemainingRows(final MasterProcedureEnv env,
|
||||
final TableName tableName) throws IOException {
|
||||
|
|
|
@ -51,8 +51,6 @@ public class DisableTableProcedure
|
|||
private TableName tableName;
|
||||
private boolean skipTableStateCheck;
|
||||
|
||||
private Boolean traceEnabled = null;
|
||||
|
||||
public DisableTableProcedure() {
|
||||
super();
|
||||
}
|
||||
|
@ -64,8 +62,7 @@ public class DisableTableProcedure
|
|||
* @param skipTableStateCheck whether to check table state
|
||||
*/
|
||||
public DisableTableProcedure(final MasterProcedureEnv env, final TableName tableName,
|
||||
final boolean skipTableStateCheck)
|
||||
throws HBaseIOException {
|
||||
final boolean skipTableStateCheck) throws HBaseIOException {
|
||||
this(env, tableName, skipTableStateCheck, null);
|
||||
}
|
||||
|
||||
|
@ -77,7 +74,7 @@ public class DisableTableProcedure
|
|||
*/
|
||||
public DisableTableProcedure(final MasterProcedureEnv env, final TableName tableName,
|
||||
final boolean skipTableStateCheck, final ProcedurePrepareLatch syncLatch)
|
||||
throws HBaseIOException {
|
||||
throws HBaseIOException {
|
||||
super(env, syncLatch);
|
||||
this.tableName = tableName;
|
||||
preflightChecks(env, true);
|
||||
|
@ -107,7 +104,8 @@ public class DisableTableProcedure
|
|||
setNextState(DisableTableState.DISABLE_TABLE_MARK_REGIONS_OFFLINE);
|
||||
break;
|
||||
case DISABLE_TABLE_MARK_REGIONS_OFFLINE:
|
||||
addChildProcedure(env.getAssignmentManager().createUnassignProcedures(tableName));
|
||||
addChildProcedure(
|
||||
env.getAssignmentManager().createUnassignProceduresForDisabling(tableName));
|
||||
setNextState(DisableTableState.DISABLE_TABLE_ADD_REPLICATION_BARRIER);
|
||||
break;
|
||||
case DISABLE_TABLE_ADD_REPLICATION_BARRIER:
|
||||
|
@ -180,7 +178,7 @@ public class DisableTableProcedure
|
|||
|
||||
@Override
|
||||
protected DisableTableState getState(final int stateId) {
|
||||
return DisableTableState.valueOf(stateId);
|
||||
return DisableTableState.forNumber(stateId);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -219,6 +217,14 @@ public class DisableTableProcedure
|
|||
skipTableStateCheck = disableTableMsg.getSkipTableStateCheck();
|
||||
}
|
||||
|
||||
// For disabling a table, we does not care whether a region can be online so hold the table xlock
|
||||
// for ever. This will simplify the logic as we will not be conflict with procedures other than
|
||||
// SCP.
|
||||
@Override
|
||||
protected boolean holdLock(MasterProcedureEnv env) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TableName getTableName() {
|
||||
return tableName;
|
||||
|
@ -233,7 +239,6 @@ public class DisableTableProcedure
|
|||
* Action before any real action of disabling table. Set the exception in the procedure instead
|
||||
* of throwing it. This approach is to deal with backward compatible with 1.0.
|
||||
* @param env MasterProcedureEnv
|
||||
* @throws IOException
|
||||
*/
|
||||
private boolean prepareDisable(final MasterProcedureEnv env) throws IOException {
|
||||
boolean canTableBeDisabled = true;
|
||||
|
@ -272,8 +277,6 @@ public class DisableTableProcedure
|
|||
* Action before disabling table.
|
||||
* @param env MasterProcedureEnv
|
||||
* @param state the procedure state
|
||||
* @throws IOException
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
protected void preDisable(final MasterProcedureEnv env, final DisableTableState state)
|
||||
throws IOException, InterruptedException {
|
||||
|
@ -283,14 +286,11 @@ public class DisableTableProcedure
|
|||
/**
|
||||
* Mark table state to Disabling
|
||||
* @param env MasterProcedureEnv
|
||||
* @throws IOException
|
||||
*/
|
||||
protected static void setTableStateToDisabling(
|
||||
final MasterProcedureEnv env,
|
||||
private static void setTableStateToDisabling(final MasterProcedureEnv env,
|
||||
final TableName tableName) throws IOException {
|
||||
// Set table disabling flag up in zk.
|
||||
env.getMasterServices().getTableStateManager().setTableState(
|
||||
tableName,
|
||||
env.getMasterServices().getTableStateManager().setTableState(tableName,
|
||||
TableState.State.DISABLING);
|
||||
LOG.info("Set {} to state={}", tableName, TableState.State.DISABLING);
|
||||
}
|
||||
|
@ -298,14 +298,11 @@ public class DisableTableProcedure
|
|||
/**
|
||||
* Mark table state to Disabled
|
||||
* @param env MasterProcedureEnv
|
||||
* @throws IOException
|
||||
*/
|
||||
protected static void setTableStateToDisabled(
|
||||
final MasterProcedureEnv env,
|
||||
protected static void setTableStateToDisabled(final MasterProcedureEnv env,
|
||||
final TableName tableName) throws IOException {
|
||||
// Flip the table to disabled
|
||||
env.getMasterServices().getTableStateManager().setTableState(
|
||||
tableName,
|
||||
env.getMasterServices().getTableStateManager().setTableState(tableName,
|
||||
TableState.State.DISABLED);
|
||||
LOG.info("Set {} to state={}", tableName, TableState.State.DISABLED);
|
||||
}
|
||||
|
@ -314,32 +311,16 @@ public class DisableTableProcedure
|
|||
* Action after disabling table.
|
||||
* @param env MasterProcedureEnv
|
||||
* @param state the procedure state
|
||||
* @throws IOException
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
protected void postDisable(final MasterProcedureEnv env, final DisableTableState state)
|
||||
throws IOException, InterruptedException {
|
||||
runCoprocessorAction(env, state);
|
||||
}
|
||||
|
||||
/**
|
||||
* The procedure could be restarted from a different machine. If the variable is null, we need to
|
||||
* retrieve it.
|
||||
* @return traceEnabled
|
||||
*/
|
||||
private Boolean isTraceEnabled() {
|
||||
if (traceEnabled == null) {
|
||||
traceEnabled = LOG.isTraceEnabled();
|
||||
}
|
||||
return traceEnabled;
|
||||
}
|
||||
|
||||
/**
|
||||
* Coprocessor Action.
|
||||
* @param env MasterProcedureEnv
|
||||
* @param state the procedure state
|
||||
* @throws IOException
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
private void runCoprocessorAction(final MasterProcedureEnv env, final DisableTableState state)
|
||||
throws IOException, InterruptedException {
|
||||
|
|
|
@ -84,7 +84,6 @@ public class EnableTableProcedure
|
|||
this.skipTableStateCheck = skipTableStateCheck;
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
@Override
|
||||
protected Flow executeFromState(final MasterProcedureEnv env, final EnableTableState state)
|
||||
throws InterruptedException {
|
||||
|
@ -255,7 +254,7 @@ public class EnableTableProcedure
|
|||
|
||||
@Override
|
||||
protected EnableTableState getState(final int stateId) {
|
||||
return EnableTableState.valueOf(stateId);
|
||||
return EnableTableState.forNumber(stateId);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -18,10 +18,11 @@
|
|||
package org.apache.hadoop.hbase.master.procedure;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
|
||||
|
@ -32,7 +33,7 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.I
|
|||
|
||||
/**
|
||||
* This procedure is used to initialize meta table for a new hbase deploy. It will just schedule an
|
||||
* {@link AssignProcedure} to assign meta.
|
||||
* {@link TransitRegionStateProcedure} to assign meta.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class InitMetaProcedure extends AbstractStateMachineTableProcedure<InitMetaState> {
|
||||
|
@ -55,7 +56,7 @@ public class InitMetaProcedure extends AbstractStateMachineTableProcedure<InitMe
|
|||
switch (state) {
|
||||
case INIT_META_ASSIGN_META:
|
||||
addChildProcedure(env.getAssignmentManager()
|
||||
.createAssignProcedure(RegionInfoBuilder.FIRST_META_REGIONINFO));
|
||||
.createAssignProcedures(Arrays.asList(RegionInfoBuilder.FIRST_META_REGIONINFO)));
|
||||
return Flow.NO_MORE_STATE;
|
||||
default:
|
||||
throw new UnsupportedOperationException("unhandled state=" + state);
|
||||
|
|
|
@ -207,7 +207,8 @@ public class MasterProcedureScheduler extends AbstractProcedureScheduler {
|
|||
// check if the next procedure is still a child.
|
||||
// if not, remove the rq from the fairq and go back to the xlock state
|
||||
Procedure<?> nextProc = rq.peek();
|
||||
if (nextProc != null && !Procedure.haveSameParent(nextProc, pollResult)) {
|
||||
if (nextProc != null && !Procedure.haveSameParent(nextProc, pollResult) &&
|
||||
nextProc.getRootProcId() != pollResult.getRootProcId()) {
|
||||
removeFromRunQueue(fairq, rq);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,37 +18,23 @@
|
|||
package org.apache.hadoop.hbase.master.procedure;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||
import org.apache.hadoop.hbase.master.MasterServices;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
|
||||
import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
|
||||
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RecoverMetaState;
|
||||
|
||||
|
||||
/**
|
||||
* This procedure recovers meta from prior shutdown/ crash of a server, and brings meta online by
|
||||
* assigning meta region/s. Any place where meta is accessed and requires meta to be online, need to
|
||||
* submit this procedure instead of duplicating steps to recover meta in the code.
|
||||
* <p/>
|
||||
* Leave here only for checking if we can successfully start the master.
|
||||
* @deprecated Do not use any more, leave it here only for compatible. The recovery work will be
|
||||
* done in {@link ServerCrashProcedure} directly, and the initial work for meta table
|
||||
* will be done by {@link InitMetaProcedure}.
|
||||
|
@ -66,146 +52,17 @@ public class RecoverMetaProcedure
|
|||
private boolean shouldSplitWal;
|
||||
private int replicaId;
|
||||
|
||||
private final ProcedurePrepareLatch syncLatch;
|
||||
private MasterServices master;
|
||||
|
||||
/**
|
||||
* Call this constructor to queue up a {@link RecoverMetaProcedure} in response to meta
|
||||
* carrying server crash
|
||||
* @param failedMetaServer failed/ crashed region server that was carrying meta
|
||||
* @param shouldSplitLog split log file of meta region
|
||||
*/
|
||||
public RecoverMetaProcedure(final ServerName failedMetaServer, final boolean shouldSplitLog) {
|
||||
this(failedMetaServer, shouldSplitLog, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor with latch, for blocking/ sync usage
|
||||
*/
|
||||
public RecoverMetaProcedure(final ServerName failedMetaServer, final boolean shouldSplitLog,
|
||||
final ProcedurePrepareLatch latch) {
|
||||
this.failedMetaServer = failedMetaServer;
|
||||
this.shouldSplitWal = shouldSplitLog;
|
||||
this.replicaId = RegionInfo.DEFAULT_REPLICA_ID;
|
||||
this.syncLatch = latch;
|
||||
}
|
||||
|
||||
/**
|
||||
* This constructor is also used when deserializing from a procedure store; we'll construct one
|
||||
* of these then call #deserializeStateData(InputStream). Do not use directly.
|
||||
*/
|
||||
public RecoverMetaProcedure() {
|
||||
this(null, false);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Flow executeFromState(MasterProcedureEnv env,
|
||||
MasterProcedureProtos.RecoverMetaState state)
|
||||
throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
|
||||
prepare(env);
|
||||
|
||||
if (!isRunRequired()) {
|
||||
LOG.info(this + "; Meta already initialized. Skipping run");
|
||||
return Flow.NO_MORE_STATE;
|
||||
}
|
||||
|
||||
try {
|
||||
switch (state) {
|
||||
case RECOVER_META_PREPARE:
|
||||
// If Master is going down or cluster is up, skip this assign by returning NO_MORE_STATE
|
||||
if (!master.isClusterUp()) {
|
||||
String msg = "Cluster not up! Skipping hbase:meta assign.";
|
||||
LOG.warn(msg);
|
||||
return Flow.NO_MORE_STATE;
|
||||
}
|
||||
if (master.isStopping() || master.isStopped()) {
|
||||
String msg = "Master stopping=" + master.isStopping() + ", stopped=" +
|
||||
master.isStopped() + "; skipping hbase:meta assign.";
|
||||
LOG.warn(msg);
|
||||
return Flow.NO_MORE_STATE;
|
||||
}
|
||||
setNextState(RecoverMetaState.RECOVER_META_SPLIT_LOGS);
|
||||
break;
|
||||
case RECOVER_META_SPLIT_LOGS:
|
||||
LOG.info("Start " + this);
|
||||
if (shouldSplitWal) {
|
||||
// TODO: Matteo. We BLOCK here but most important thing to be doing at this moment.
|
||||
AssignmentManager am = env.getMasterServices().getAssignmentManager();
|
||||
if (failedMetaServer != null) {
|
||||
am.getRegionStates().metaLogSplitting(failedMetaServer);
|
||||
master.getMasterWalManager().splitMetaLog(failedMetaServer);
|
||||
am.getRegionStates().metaLogSplit(failedMetaServer);
|
||||
} else {
|
||||
ServerName serverName =
|
||||
master.getMetaTableLocator().getMetaRegionLocation(master.getZooKeeper());
|
||||
Set<ServerName> previouslyFailedServers =
|
||||
master.getMasterWalManager().getFailedServersFromLogFolders();
|
||||
if (serverName != null && previouslyFailedServers.contains(serverName)) {
|
||||
am.getRegionStates().metaLogSplitting(serverName);
|
||||
master.getMasterWalManager().splitMetaLog(serverName);
|
||||
am.getRegionStates().metaLogSplit(serverName);
|
||||
}
|
||||
}
|
||||
}
|
||||
setNextState(RecoverMetaState.RECOVER_META_ASSIGN_REGIONS);
|
||||
break;
|
||||
case RECOVER_META_ASSIGN_REGIONS:
|
||||
RegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(
|
||||
RegionInfoBuilder.FIRST_META_REGIONINFO, this.replicaId);
|
||||
|
||||
AssignProcedure metaAssignProcedure;
|
||||
AssignmentManager am = master.getAssignmentManager();
|
||||
if (failedMetaServer != null) {
|
||||
handleRIT(env, hri, this.failedMetaServer);
|
||||
LOG.info(this + "; Assigning meta with new plan; previous server=" + failedMetaServer);
|
||||
metaAssignProcedure = am.createAssignProcedure(hri);
|
||||
} else {
|
||||
// get server carrying meta from zk
|
||||
ServerName metaServer =
|
||||
MetaTableLocator.getMetaRegionState(master.getZooKeeper()).getServerName();
|
||||
LOG.info(this + "; Retaining meta assignment to server=" + metaServer);
|
||||
metaAssignProcedure = am.createAssignProcedure(hri, metaServer);
|
||||
}
|
||||
|
||||
addChildProcedure(metaAssignProcedure);
|
||||
return Flow.NO_MORE_STATE;
|
||||
|
||||
default:
|
||||
throw new UnsupportedOperationException("unhandled state=" + state);
|
||||
}
|
||||
} catch (IOException|KeeperException e) {
|
||||
LOG.warn(this + "; Failed state=" + state + ", retry " + this + "; cycles=" +
|
||||
getCycles(), e);
|
||||
}
|
||||
return Flow.HAS_MORE_STATE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is the region stuck assigning to this failedMetaServer? If so, cancel the call
|
||||
* just as we do over in ServerCrashProcedure#handleRIT except less to do here; less context
|
||||
* to carry.
|
||||
*/
|
||||
// NOTE: Make sure any fix or improvement done here is also done in SCP#handleRIT; the methods
|
||||
// have overlap.
|
||||
private void handleRIT(MasterProcedureEnv env, RegionInfo ri, ServerName crashedServerName) {
|
||||
AssignmentManager am = env.getAssignmentManager();
|
||||
RegionTransitionProcedure rtp = am.getRegionStates().getRegionTransitionProcedure(ri);
|
||||
if (rtp == null) {
|
||||
return; // Nothing to do. Not in RIT.
|
||||
}
|
||||
// Make sure the RIT is against this crashed server. In the case where there are many
|
||||
// processings of a crashed server -- backed up for whatever reason (slow WAL split)
|
||||
// -- then a previous SCP may have already failed an assign, etc., and it may have a
|
||||
// new location target; DO NOT fail these else we make for assign flux.
|
||||
ServerName rtpServerName = rtp.getServer(env);
|
||||
if (rtpServerName == null) {
|
||||
LOG.warn("RIT with ServerName null! " + rtp);
|
||||
} else if (rtpServerName.equals(crashedServerName)) {
|
||||
LOG.info("pid=" + getProcId() + " found RIT " + rtp + "; " +
|
||||
rtp.getRegionState(env).toShortString());
|
||||
rtp.remoteCallFailed(env, crashedServerName,
|
||||
new ServerCrashException(getProcId(), crashedServerName));
|
||||
}
|
||||
return Flow.NO_MORE_STATE;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -241,11 +98,10 @@ public class RecoverMetaProcedure
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void serializeStateData(ProcedureStateSerializer serializer)
|
||||
throws IOException {
|
||||
protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
super.serializeStateData(serializer);
|
||||
MasterProcedureProtos.RecoverMetaStateData.Builder state =
|
||||
MasterProcedureProtos.RecoverMetaStateData.newBuilder().setShouldSplitWal(shouldSplitWal);
|
||||
MasterProcedureProtos.RecoverMetaStateData.newBuilder().setShouldSplitWal(shouldSplitWal);
|
||||
if (failedMetaServer != null) {
|
||||
state.setFailedMetaServer(ProtobufUtil.toServerName(failedMetaServer));
|
||||
}
|
||||
|
@ -254,50 +110,13 @@ public class RecoverMetaProcedure
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void deserializeStateData(ProcedureStateSerializer serializer)
|
||||
throws IOException {
|
||||
protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
super.deserializeStateData(serializer);
|
||||
MasterProcedureProtos.RecoverMetaStateData state =
|
||||
serializer.deserialize(MasterProcedureProtos.RecoverMetaStateData.class);
|
||||
serializer.deserialize(MasterProcedureProtos.RecoverMetaStateData.class);
|
||||
this.shouldSplitWal = state.hasShouldSplitWal() && state.getShouldSplitWal();
|
||||
this.failedMetaServer = state.hasFailedMetaServer() ?
|
||||
ProtobufUtil.toServerName(state.getFailedMetaServer()) : null;
|
||||
this.failedMetaServer =
|
||||
state.hasFailedMetaServer() ? ProtobufUtil.toServerName(state.getFailedMetaServer()) : null;
|
||||
this.replicaId = state.hasReplicaId() ? state.getReplicaId() : RegionInfo.DEFAULT_REPLICA_ID;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected LockState acquireLock(MasterProcedureEnv env) {
|
||||
if (env.getProcedureScheduler().waitMetaExclusiveLock(this)) {
|
||||
return LockState.LOCK_EVENT_WAIT;
|
||||
}
|
||||
return LockState.LOCK_ACQUIRED;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void releaseLock(MasterProcedureEnv env) {
|
||||
env.getProcedureScheduler().wakeMetaExclusiveLock(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void completionCleanup(MasterProcedureEnv env) {
|
||||
ProcedurePrepareLatch.releaseLatch(syncLatch, this);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if failedMetaServer is not null (meta carrying server crashed) or meta is
|
||||
* already initialized
|
||||
*/
|
||||
private boolean isRunRequired() {
|
||||
return failedMetaServer != null || !master.getAssignmentManager().isMetaAssigned();
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare for execution
|
||||
*/
|
||||
private void prepare(MasterProcedureEnv env) {
|
||||
if (master == null) {
|
||||
master = env.getMasterServices();
|
||||
Preconditions.checkArgument(master != null);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,11 +21,10 @@ import java.io.IOException;
|
|||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.hbase.HBaseIOException;
|
||||
import org.apache.hadoop.hbase.HRegionLocation;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.master.RegionPlan;
|
||||
import org.apache.hadoop.hbase.master.assignment.MoveRegionProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
|
||||
|
@ -39,8 +38,6 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.R
|
|||
|
||||
/**
|
||||
* Used for reopening the regions for a table.
|
||||
* <p/>
|
||||
* Currently we use {@link MoveRegionProcedure} to reopen regions.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class ReopenTableRegionsProcedure
|
||||
|
@ -69,16 +66,6 @@ public class ReopenTableRegionsProcedure
|
|||
return TableOperationType.REGION_EDIT;
|
||||
}
|
||||
|
||||
private MoveRegionProcedure createReopenProcedure(MasterProcedureEnv env, HRegionLocation loc) {
|
||||
try {
|
||||
return new MoveRegionProcedure(env,
|
||||
new RegionPlan(loc.getRegion(), loc.getServerName(), loc.getServerName()), false);
|
||||
} catch (HBaseIOException e) {
|
||||
// we skip the checks so this should not happen
|
||||
throw new AssertionError(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState state)
|
||||
throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
|
||||
|
@ -93,8 +80,22 @@ public class ReopenTableRegionsProcedure
|
|||
setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS);
|
||||
return Flow.HAS_MORE_STATE;
|
||||
case REOPEN_TABLE_REGIONS_REOPEN_REGIONS:
|
||||
addChildProcedure(regions.stream().filter(l -> l.getSeqNum() >= 0)
|
||||
.map(l -> createReopenProcedure(env, l)).toArray(MoveRegionProcedure[]::new));
|
||||
for (HRegionLocation loc : regions) {
|
||||
RegionStateNode regionNode = env.getAssignmentManager().getRegionStates()
|
||||
.getOrCreateRegionStateNode(loc.getRegion());
|
||||
TransitRegionStateProcedure proc;
|
||||
regionNode.lock();
|
||||
try {
|
||||
if (regionNode.getProcedure() != null) {
|
||||
continue;
|
||||
}
|
||||
proc = TransitRegionStateProcedure.reopen(env, regionNode.getRegionInfo());
|
||||
regionNode.setProcedure(proc);
|
||||
} finally {
|
||||
regionNode.unlock();
|
||||
}
|
||||
addChildProcedure(proc);
|
||||
}
|
||||
setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_CONFIRM_REOPENED);
|
||||
return Flow.HAS_MORE_STATE;
|
||||
case REOPEN_TABLE_REGIONS_CONFIRM_REOPENED:
|
||||
|
|
|
@ -20,18 +20,17 @@ package org.apache.hadoop.hbase.master.procedure;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||
import org.apache.hadoop.hbase.client.TableState;
|
||||
import org.apache.hadoop.hbase.master.MasterServices;
|
||||
import org.apache.hadoop.hbase.master.MasterWalManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
|
@ -98,7 +97,10 @@ public class ServerCrashProcedure
|
|||
* #deserializeStateData(InputStream). Do not use directly.
|
||||
*/
|
||||
public ServerCrashProcedure() {
|
||||
super();
|
||||
}
|
||||
|
||||
public boolean isInRecoverMetaState() {
|
||||
return getCurrentState() == ServerCrashState.SERVER_CRASH_PROCESS_META;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -128,15 +130,7 @@ public class ServerCrashProcedure
|
|||
setNextState(ServerCrashState.SERVER_CRASH_ASSIGN_META);
|
||||
break;
|
||||
case SERVER_CRASH_ASSIGN_META:
|
||||
handleRIT(env, Arrays.asList(RegionInfoBuilder.FIRST_META_REGIONINFO));
|
||||
addChildProcedure(env.getAssignmentManager()
|
||||
.createAssignProcedure(RegionInfoBuilder.FIRST_META_REGIONINFO));
|
||||
setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
|
||||
break;
|
||||
case SERVER_CRASH_PROCESS_META:
|
||||
// not used any more but still leave it here to keep compatible as there maybe old SCP
|
||||
// which is stored in ProcedureStore which has this state.
|
||||
processMeta(env);
|
||||
assignRegions(env, Arrays.asList(RegionInfoBuilder.FIRST_META_REGIONINFO));
|
||||
setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
|
||||
break;
|
||||
case SERVER_CRASH_GET_REGIONS:
|
||||
|
@ -144,8 +138,8 @@ public class ServerCrashProcedure
|
|||
if (env.getAssignmentManager().waitMetaLoaded(this)) {
|
||||
throw new ProcedureSuspendedException();
|
||||
}
|
||||
this.regionsOnCrashedServer = services.getAssignmentManager().getRegionStates()
|
||||
.getServerRegionInfoSet(serverName);
|
||||
this.regionsOnCrashedServer =
|
||||
services.getAssignmentManager().getRegionStates().getServerRegionInfoSet(serverName);
|
||||
// Where to go next? Depends on whether we should split logs at all or
|
||||
// if we should do distributed log splitting.
|
||||
if (!this.shouldSplitWal) {
|
||||
|
@ -162,26 +156,15 @@ public class ServerCrashProcedure
|
|||
// If no regions to assign, skip assign and skip to the finish.
|
||||
// Filter out meta regions. Those are handled elsewhere in this procedure.
|
||||
// Filter changes this.regionsOnCrashedServer.
|
||||
if (filterDefaultMetaRegions(regionsOnCrashedServer)) {
|
||||
if (filterDefaultMetaRegions()) {
|
||||
if (LOG.isTraceEnabled()) {
|
||||
LOG.trace("Assigning regions " +
|
||||
RegionInfo.getShortNameToLog(regionsOnCrashedServer) + ", " + this +
|
||||
"; cycles=" + getCycles());
|
||||
LOG
|
||||
.trace("Assigning regions " + RegionInfo.getShortNameToLog(regionsOnCrashedServer) +
|
||||
", " + this + "; cycles=" + getCycles());
|
||||
}
|
||||
// Handle RIT against crashed server. Will cancel any ongoing assigns/unassigns.
|
||||
// Returns list of regions we need to reassign.
|
||||
// NOTE: there is nothing to stop a dispatch happening AFTER this point. Check for the
|
||||
// condition if a dispatch RPC fails inside in AssignProcedure/UnassignProcedure.
|
||||
// AssignProcedure just keeps retrying. UnassignProcedure is more complicated. See where
|
||||
// it does the check by calling am#isLogSplittingDone.
|
||||
List<RegionInfo> toAssign = handleRIT(env, regionsOnCrashedServer);
|
||||
AssignmentManager am = env.getAssignmentManager();
|
||||
// CreateAssignProcedure will try to use the old location for the region deploy.
|
||||
addChildProcedure(am.createAssignProcedures(toAssign));
|
||||
setNextState(ServerCrashState.SERVER_CRASH_HANDLE_RIT2);
|
||||
} else {
|
||||
setNextState(ServerCrashState.SERVER_CRASH_FINISH);
|
||||
assignRegions(env, regionsOnCrashedServer);
|
||||
}
|
||||
setNextState(ServerCrashState.SERVER_CRASH_FINISH);
|
||||
break;
|
||||
case SERVER_CRASH_HANDLE_RIT2:
|
||||
// Noop. Left in place because we used to call handleRIT here for a second time
|
||||
|
@ -201,28 +184,16 @@ public class ServerCrashProcedure
|
|||
return Flow.HAS_MORE_STATE;
|
||||
}
|
||||
|
||||
private void processMeta(final MasterProcedureEnv env) throws IOException {
|
||||
LOG.debug("{}; processing hbase:meta", this);
|
||||
|
||||
// Assign meta if still carrying it. Check again: region may be assigned because of RIT timeout
|
||||
final AssignmentManager am = env.getMasterServices().getAssignmentManager();
|
||||
for (RegionInfo hri: am.getRegionStates().getServerRegionInfoSet(serverName)) {
|
||||
if (!isDefaultMetaRegion(hri)) {
|
||||
continue;
|
||||
}
|
||||
addChildProcedure(new RecoverMetaProcedure(serverName, this.shouldSplitWal));
|
||||
private boolean filterDefaultMetaRegions() {
|
||||
if (regionsOnCrashedServer == null) {
|
||||
return false;
|
||||
}
|
||||
regionsOnCrashedServer.removeIf(this::isDefaultMetaRegion);
|
||||
return !regionsOnCrashedServer.isEmpty();
|
||||
}
|
||||
|
||||
private boolean filterDefaultMetaRegions(final List<RegionInfo> regions) {
|
||||
if (regions == null) return false;
|
||||
regions.removeIf(this::isDefaultMetaRegion);
|
||||
return !regions.isEmpty();
|
||||
}
|
||||
|
||||
private boolean isDefaultMetaRegion(final RegionInfo hri) {
|
||||
return hri.getTable().equals(TableName.META_TABLE_NAME) &&
|
||||
RegionReplicaUtil.isDefaultReplica(hri);
|
||||
private boolean isDefaultMetaRegion(RegionInfo hri) {
|
||||
return hri.isMetaRegion() && RegionReplicaUtil.isDefaultReplica(hri);
|
||||
}
|
||||
|
||||
private void splitMetaLogs(MasterProcedureEnv env) throws IOException {
|
||||
|
@ -372,54 +343,37 @@ public class ServerCrashProcedure
|
|||
}
|
||||
|
||||
/**
|
||||
* Handle any outstanding RIT that are up against this.serverName, the crashed server.
|
||||
* Notify them of crash. Remove assign entries from the passed in <code>regions</code>
|
||||
* otherwise we have two assigns going on and they will fight over who has lock.
|
||||
* Notify Unassigns. If unable to unassign because server went away, unassigns block waiting
|
||||
* on the below callback from a ServerCrashProcedure before proceeding.
|
||||
* @param regions Regions on the Crashed Server.
|
||||
* @return List of regions we should assign to new homes (not same as regions on crashed server).
|
||||
* Assign the regions on the crashed RS to other Rses.
|
||||
* <p/>
|
||||
* In this method we will go through all the RegionStateNodes of the give regions to find out
|
||||
* whether there is already an TRSP for the region, if so we interrupt it and let it retry on
|
||||
* other server, otherwise we will schedule a TRSP to bring the region online.
|
||||
* <p/>
|
||||
* We will also check whether the table for a region is enabled, if not, we will skip assigning
|
||||
* it.
|
||||
*/
|
||||
private List<RegionInfo> handleRIT(final MasterProcedureEnv env, List<RegionInfo> regions) {
|
||||
if (regions == null || regions.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
private void assignRegions(MasterProcedureEnv env, List<RegionInfo> regions) throws IOException {
|
||||
AssignmentManager am = env.getMasterServices().getAssignmentManager();
|
||||
List<RegionInfo> toAssign = new ArrayList<RegionInfo>(regions);
|
||||
// Get an iterator so can remove items.
|
||||
final Iterator<RegionInfo> it = toAssign.iterator();
|
||||
ServerCrashException sce = null;
|
||||
while (it.hasNext()) {
|
||||
final RegionInfo hri = it.next();
|
||||
RegionTransitionProcedure rtp = am.getRegionStates().getRegionTransitionProcedure(hri);
|
||||
if (rtp == null) {
|
||||
continue;
|
||||
for (RegionInfo region : regions) {
|
||||
RegionStateNode regionNode = am.getRegionStates().getOrCreateRegionStateNode(region);
|
||||
regionNode.lock();
|
||||
try {
|
||||
if (regionNode.getProcedure() != null) {
|
||||
LOG.info("{} found RIT {}; {}", this, regionNode.getProcedure(), regionNode);
|
||||
regionNode.getProcedure().serverCrashed(env, regionNode, getServerName());
|
||||
} else {
|
||||
if (env.getMasterServices().getTableStateManager().isTableState(regionNode.getTable(),
|
||||
TableState.State.DISABLING, TableState.State.DISABLED)) {
|
||||
continue;
|
||||
}
|
||||
TransitRegionStateProcedure proc = TransitRegionStateProcedure.assign(env, region, null);
|
||||
regionNode.setProcedure(proc);
|
||||
addChildProcedure(proc);
|
||||
}
|
||||
} finally {
|
||||
regionNode.unlock();
|
||||
}
|
||||
// Make sure the RIT is against this crashed server. In the case where there are many
|
||||
// processings of a crashed server -- backed up for whatever reason (slow WAL split) --
|
||||
// then a previous SCP may have already failed an assign, etc., and it may have a new
|
||||
// location target; DO NOT fail these else we make for assign flux.
|
||||
ServerName rtpServerName = rtp.getServer(env);
|
||||
if (rtpServerName == null) {
|
||||
LOG.warn("RIT with ServerName null! " + rtp);
|
||||
continue;
|
||||
}
|
||||
if (!rtpServerName.equals(this.serverName)) continue;
|
||||
LOG.info("pid=" + getProcId() + " found RIT " + rtp + "; " +
|
||||
rtp.getRegionState(env).toShortString());
|
||||
// Notify RIT on server crash.
|
||||
if (sce == null) {
|
||||
sce = new ServerCrashException(getProcId(), getServerName());
|
||||
}
|
||||
rtp.remoteCallFailed(env, this.serverName, sce);
|
||||
// If an assign, remove from passed-in list of regions so we subsequently do not create
|
||||
// a new assign; the exisitng assign after the call to remoteCallFailed will recalibrate
|
||||
// and assign to a server other than the crashed one; no need to create new assign.
|
||||
// If an unassign, do not return this region; the above cancel will wake up the unassign and
|
||||
// it will complete. Done.
|
||||
it.remove();
|
||||
}
|
||||
return toAssign;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -189,7 +189,7 @@ public class TruncateTableProcedure
|
|||
|
||||
@Override
|
||||
protected TruncateTableState getState(final int stateId) {
|
||||
return TruncateTableState.valueOf(stateId);
|
||||
return TruncateTableState.forNumber(stateId);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -202,6 +202,11 @@ public class TruncateTableProcedure
|
|||
return TruncateTableState.TRUNCATE_TABLE_PRE_OPERATION;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean holdLock(MasterProcedureEnv env) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TableName getTableName() {
|
||||
return tableName;
|
||||
|
|
|
@ -1,294 +0,0 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.MiniHBaseCluster;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.Admin;
|
||||
import org.apache.hadoop.hbase.client.Put;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.Table;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegion;
|
||||
import org.apache.hadoop.hbase.regionserver.Region;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.JVMClusterUtil;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.junit.rules.TestName;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@Category({MasterTests.class, MediumTests.class})
|
||||
public class TestAssignmentListener {
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestAssignmentListener.class);
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestAssignmentListener.class);
|
||||
|
||||
private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
|
||||
|
||||
@Rule
|
||||
public TestName name = new TestName();
|
||||
|
||||
static class DummyListener {
|
||||
protected AtomicInteger modified = new AtomicInteger(0);
|
||||
|
||||
public void awaitModifications(int count) throws InterruptedException {
|
||||
while (!modified.compareAndSet(count, 0)) {
|
||||
Thread.sleep(100);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static class DummyAssignmentListener extends DummyListener implements AssignmentListener {
|
||||
private AtomicInteger closeCount = new AtomicInteger(0);
|
||||
private AtomicInteger openCount = new AtomicInteger(0);
|
||||
|
||||
public DummyAssignmentListener() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void regionOpened(final RegionInfo regionInfo, final ServerName serverName) {
|
||||
LOG.info("Assignment open region=" + regionInfo + " server=" + serverName);
|
||||
openCount.incrementAndGet();
|
||||
modified.incrementAndGet();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void regionClosed(final RegionInfo regionInfo) {
|
||||
LOG.info("Assignment close region=" + regionInfo);
|
||||
closeCount.incrementAndGet();
|
||||
modified.incrementAndGet();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
openCount.set(0);
|
||||
closeCount.set(0);
|
||||
}
|
||||
|
||||
public int getLoadCount() {
|
||||
return openCount.get();
|
||||
}
|
||||
|
||||
public int getCloseCount() {
|
||||
return closeCount.get();
|
||||
}
|
||||
}
|
||||
|
||||
static class DummyServerListener extends DummyListener implements ServerListener {
|
||||
private AtomicInteger removedCount = new AtomicInteger(0);
|
||||
private AtomicInteger addedCount = new AtomicInteger(0);
|
||||
|
||||
public DummyServerListener() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void serverAdded(final ServerName serverName) {
|
||||
LOG.info("Server added " + serverName);
|
||||
addedCount.incrementAndGet();
|
||||
modified.incrementAndGet();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void serverRemoved(final ServerName serverName) {
|
||||
LOG.info("Server removed " + serverName);
|
||||
removedCount.incrementAndGet();
|
||||
modified.incrementAndGet();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
addedCount.set(0);
|
||||
removedCount.set(0);
|
||||
}
|
||||
|
||||
public int getAddedCount() {
|
||||
return addedCount.get();
|
||||
}
|
||||
|
||||
public int getRemovedCount() {
|
||||
return removedCount.get();
|
||||
}
|
||||
}
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeAllTests() throws Exception {
|
||||
TEST_UTIL.startMiniCluster(2);
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void afterAllTests() throws Exception {
|
||||
TEST_UTIL.shutdownMiniCluster();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testServerListener() throws IOException, InterruptedException {
|
||||
ServerManager serverManager = TEST_UTIL.getHBaseCluster().getMaster().getServerManager();
|
||||
|
||||
DummyServerListener listener = new DummyServerListener();
|
||||
serverManager.registerListener(listener);
|
||||
try {
|
||||
MiniHBaseCluster miniCluster = TEST_UTIL.getMiniHBaseCluster();
|
||||
|
||||
// Start a new Region Server
|
||||
miniCluster.startRegionServer();
|
||||
listener.awaitModifications(1);
|
||||
assertEquals(1, listener.getAddedCount());
|
||||
assertEquals(0, listener.getRemovedCount());
|
||||
|
||||
// Start another Region Server
|
||||
listener.reset();
|
||||
miniCluster.startRegionServer();
|
||||
listener.awaitModifications(1);
|
||||
assertEquals(1, listener.getAddedCount());
|
||||
assertEquals(0, listener.getRemovedCount());
|
||||
|
||||
int nrs = miniCluster.getRegionServerThreads().size();
|
||||
|
||||
// Stop a Region Server
|
||||
listener.reset();
|
||||
miniCluster.stopRegionServer(nrs - 1);
|
||||
listener.awaitModifications(1);
|
||||
assertEquals(0, listener.getAddedCount());
|
||||
assertEquals(1, listener.getRemovedCount());
|
||||
|
||||
// Stop another Region Server
|
||||
listener.reset();
|
||||
miniCluster.stopRegionServer(nrs - 2);
|
||||
listener.awaitModifications(1);
|
||||
assertEquals(0, listener.getAddedCount());
|
||||
assertEquals(1, listener.getRemovedCount());
|
||||
} finally {
|
||||
serverManager.unregisterListener(listener);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAssignmentListener() throws IOException, InterruptedException {
|
||||
AssignmentManager am = TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
|
||||
Admin admin = TEST_UTIL.getAdmin();
|
||||
|
||||
DummyAssignmentListener listener = new DummyAssignmentListener();
|
||||
am.registerListener(listener);
|
||||
try {
|
||||
final TableName tableName = TableName.valueOf(name.getMethodName());
|
||||
final byte[] FAMILY = Bytes.toBytes("cf");
|
||||
|
||||
// Create a new table, with a single region
|
||||
LOG.info("Create Table");
|
||||
TEST_UTIL.createTable(tableName, FAMILY);
|
||||
listener.awaitModifications(1);
|
||||
assertEquals(1, listener.getLoadCount());
|
||||
assertEquals(0, listener.getCloseCount());
|
||||
|
||||
// Add some data
|
||||
Table table = TEST_UTIL.getConnection().getTable(tableName);
|
||||
try {
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
byte[] key = Bytes.toBytes("row-" + i);
|
||||
Put put = new Put(key);
|
||||
put.addColumn(FAMILY, null, key);
|
||||
table.put(put);
|
||||
}
|
||||
} finally {
|
||||
table.close();
|
||||
}
|
||||
|
||||
// Split the table in two
|
||||
LOG.info("Split Table");
|
||||
listener.reset();
|
||||
admin.split(tableName, Bytes.toBytes("row-3"));
|
||||
listener.awaitModifications(3);
|
||||
assertEquals(2, listener.getLoadCount()); // daughters added
|
||||
assertEquals(1, listener.getCloseCount()); // parent removed
|
||||
|
||||
// Wait for the Regions to be mergeable
|
||||
MiniHBaseCluster miniCluster = TEST_UTIL.getMiniHBaseCluster();
|
||||
int mergeable = 0;
|
||||
while (mergeable < 2) {
|
||||
Thread.sleep(100);
|
||||
admin.majorCompact(tableName);
|
||||
mergeable = 0;
|
||||
for (JVMClusterUtil.RegionServerThread regionThread: miniCluster.getRegionServerThreads()) {
|
||||
for (Region region: regionThread.getRegionServer().getRegions(tableName)) {
|
||||
mergeable += ((HRegion)region).isMergeable() ? 1 : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Merge the two regions
|
||||
LOG.info("Merge Regions");
|
||||
listener.reset();
|
||||
List<RegionInfo> regions = admin.getRegions(tableName);
|
||||
assertEquals(2, regions.size());
|
||||
boolean sameServer = areAllRegionsLocatedOnSameServer(tableName);
|
||||
// If the regions are located by different server, we need to move
|
||||
// regions to same server before merging. So the expected modifications
|
||||
// will increaes to 5. (open + close)
|
||||
final int expectedModifications = sameServer ? 3 : 5;
|
||||
final int expectedLoadCount = sameServer ? 1 : 2;
|
||||
final int expectedCloseCount = sameServer ? 2 : 3;
|
||||
admin.mergeRegionsAsync(regions.get(0).getEncodedNameAsBytes(),
|
||||
regions.get(1).getEncodedNameAsBytes(), true);
|
||||
listener.awaitModifications(expectedModifications);
|
||||
assertEquals(1, admin.getRegions(tableName).size());
|
||||
assertEquals(expectedLoadCount, listener.getLoadCount()); // new merged region added
|
||||
assertEquals(expectedCloseCount, listener.getCloseCount()); // daughters removed
|
||||
|
||||
// Delete the table
|
||||
LOG.info("Drop Table");
|
||||
listener.reset();
|
||||
TEST_UTIL.deleteTable(tableName);
|
||||
listener.awaitModifications(1);
|
||||
assertEquals(0, listener.getLoadCount());
|
||||
assertEquals(1, listener.getCloseCount());
|
||||
} finally {
|
||||
am.unregisterListener(listener);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean areAllRegionsLocatedOnSameServer(TableName TABLE_NAME) {
|
||||
MiniHBaseCluster miniCluster = TEST_UTIL.getMiniHBaseCluster();
|
||||
int serverCount = 0;
|
||||
for (JVMClusterUtil.RegionServerThread regionThread: miniCluster.getRegionServerThreads()) {
|
||||
if (!regionThread.getRegionServer().getRegions(TABLE_NAME).isEmpty()) {
|
||||
++serverCount;
|
||||
}
|
||||
if (serverCount > 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return serverCount == 1;
|
||||
}
|
||||
}
|
|
@ -20,7 +20,6 @@ package org.apache.hadoop.hbase.master;
|
|||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
|
@ -30,12 +29,14 @@ import org.apache.hadoop.hbase.coprocessor.ObserverContext;
|
|||
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
|
||||
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
|
||||
import org.apache.hadoop.hbase.coprocessor.RegionObserver;
|
||||
import org.apache.hadoop.hbase.master.assignment.MoveRegionProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.JVMClusterUtil;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.Assert;
|
||||
import org.junit.BeforeClass;
|
||||
|
@ -45,15 +46,14 @@ import org.junit.experimental.categories.Category;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
||||
@Category({ MasterTests.class, MediumTests.class })
|
||||
public class TestMasterAbortAndRSGotKilled {
|
||||
private static Logger LOG = LoggerFactory
|
||||
.getLogger(TestMasterAbortAndRSGotKilled.class.getName());
|
||||
private static Logger LOG =
|
||||
LoggerFactory.getLogger(TestMasterAbortAndRSGotKilled.class.getName());
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestMasterAbortAndRSGotKilled.class);
|
||||
HBaseClassTestRule.forClass(TestMasterAbortAndRSGotKilled.class);
|
||||
|
||||
private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
|
||||
|
||||
|
@ -61,14 +61,12 @@ public class TestMasterAbortAndRSGotKilled {
|
|||
|
||||
private static CountDownLatch countDownLatch = new CountDownLatch(1);
|
||||
|
||||
|
||||
|
||||
private static byte[] CF = Bytes.toBytes("cf");
|
||||
|
||||
@BeforeClass
|
||||
public static void setUp() throws Exception {
|
||||
UTIL.getConfiguration().setStrings(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY,
|
||||
DelayCloseCP.class.getName());
|
||||
DelayCloseCP.class.getName());
|
||||
UTIL.startMiniCluster(3);
|
||||
UTIL.getAdmin().balancerSwitch(false, true);
|
||||
UTIL.createTable(TABLE_NAME, CF);
|
||||
|
@ -84,48 +82,44 @@ public class TestMasterAbortAndRSGotKilled {
|
|||
public void test() throws Exception {
|
||||
JVMClusterUtil.RegionServerThread rsThread = null;
|
||||
for (JVMClusterUtil.RegionServerThread t : UTIL.getMiniHBaseCluster()
|
||||
.getRegionServerThreads()) {
|
||||
.getRegionServerThreads()) {
|
||||
if (!t.getRegionServer().getRegions(TABLE_NAME).isEmpty()) {
|
||||
rsThread = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
//find the rs and hri of the table
|
||||
// find the rs and hri of the table
|
||||
HRegionServer rs = rsThread.getRegionServer();
|
||||
RegionInfo hri = rs.getRegions(TABLE_NAME).get(0).getRegionInfo();
|
||||
MoveRegionProcedure moveRegionProcedure = new MoveRegionProcedure(
|
||||
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor()
|
||||
.getEnvironment(),
|
||||
new RegionPlan(hri, rs.getServerName(), rs.getServerName()), true);
|
||||
long procID = UTIL.getMiniHBaseCluster().getMaster()
|
||||
.getMasterProcedureExecutor().submitProcedure(moveRegionProcedure);
|
||||
TransitRegionStateProcedure moveRegionProcedure = TransitRegionStateProcedure.reopen(
|
||||
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor().getEnvironment(), hri);
|
||||
RegionStateNode regionNode = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager()
|
||||
.getRegionStates().getOrCreateRegionStateNode(hri);
|
||||
regionNode.setProcedure(moveRegionProcedure);
|
||||
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor()
|
||||
.submitProcedure(moveRegionProcedure);
|
||||
countDownLatch.await();
|
||||
UTIL.getMiniHBaseCluster().stopMaster(0);
|
||||
UTIL.getMiniHBaseCluster().startMaster();
|
||||
//wait until master initialized
|
||||
UTIL.waitFor(30000,
|
||||
() -> UTIL.getMiniHBaseCluster().getMaster() != null && UTIL
|
||||
.getMiniHBaseCluster().getMaster().isInitialized());
|
||||
// wait until master initialized
|
||||
UTIL.waitFor(30000, () -> UTIL.getMiniHBaseCluster().getMaster() != null &&
|
||||
UTIL.getMiniHBaseCluster().getMaster().isInitialized());
|
||||
Assert.assertTrue("Should be 3 RS after master restart",
|
||||
UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size() == 3);
|
||||
UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size() == 3);
|
||||
|
||||
}
|
||||
|
||||
public static class DelayCloseCP implements RegionCoprocessor,
|
||||
RegionObserver {
|
||||
@Override
|
||||
public void preClose(ObserverContext<RegionCoprocessorEnvironment> c,
|
||||
boolean abortRequested) throws IOException {
|
||||
try {
|
||||
if (!c.getEnvironment().getRegion().getRegionInfo().getTable().isSystemTable()) {
|
||||
LOG.error("begin to sleep");
|
||||
countDownLatch.countDown();
|
||||
//Sleep here so we can stuck the RPC call
|
||||
Thread.sleep(10000);
|
||||
LOG.error("finish sleep");
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
public static class DelayCloseCP implements RegionCoprocessor, RegionObserver {
|
||||
|
||||
@Override
|
||||
public void preClose(ObserverContext<RegionCoprocessorEnvironment> c, boolean abortRequested)
|
||||
throws IOException {
|
||||
if (!c.getEnvironment().getRegion().getRegionInfo().getTable().isSystemTable()) {
|
||||
LOG.info("begin to sleep");
|
||||
countDownLatch.countDown();
|
||||
// Sleep here so we can stuck the RPC call
|
||||
Threads.sleep(10000);
|
||||
LOG.info("finish sleep");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -134,5 +128,4 @@ public class TestMasterAbortAndRSGotKilled {
|
|||
return Optional.of(this);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.hadoop.hbase.master;
|
|||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
|
@ -31,7 +30,7 @@ import org.apache.hadoop.hbase.client.ResultScanner;
|
|||
import org.apache.hadoop.hbase.client.Scan;
|
||||
import org.apache.hadoop.hbase.client.Table;
|
||||
import org.apache.hadoop.hbase.master.assignment.MergeTableRegionsProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.UnassignProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
|
@ -103,9 +102,9 @@ public class TestMergeTableRegionsWhileRSCrash {
|
|||
MergeTableRegionsProcedure mergeTableRegionsProcedure = new MergeTableRegionsProcedure(
|
||||
env, regionInfos.get(0), regionInfos.get(1));
|
||||
executor.submitProcedure(mergeTableRegionsProcedure);
|
||||
UTIL.waitFor(30000, () -> executor.getProcedures().stream()
|
||||
.filter(p -> p instanceof UnassignProcedure)
|
||||
.map(p -> (UnassignProcedure) p)
|
||||
UTIL.waitFor(30000,
|
||||
() -> executor.getProcedures().stream().filter(p -> p instanceof TransitRegionStateProcedure)
|
||||
.map(p -> (TransitRegionStateProcedure) p)
|
||||
.anyMatch(p -> TABLE_NAME.equals(p.getTableName())));
|
||||
UTIL.getMiniHBaseCluster().killRegionServer(
|
||||
UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName());
|
||||
|
|
|
@ -26,7 +26,7 @@ import org.apache.hadoop.hbase.client.AsyncAdmin;
|
|||
import org.apache.hadoop.hbase.client.AsyncConnection;
|
||||
import org.apache.hadoop.hbase.client.ConnectionFactory;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||
|
@ -83,9 +83,10 @@ public class TestServerCrashProcedureCarryingMetaStuck {
|
|||
rs.abort("For testing!");
|
||||
|
||||
UTIL.waitFor(30000,
|
||||
() -> executor.getProcedures().stream().filter(p -> p instanceof AssignProcedure)
|
||||
.map(p -> (AssignProcedure) p)
|
||||
.anyMatch(p -> Bytes.equals(hri.getRegionName(), p.getRegionInfo().getRegionName())));
|
||||
() -> executor.getProcedures().stream()
|
||||
.filter(p -> p instanceof TransitRegionStateProcedure)
|
||||
.map(p -> (TransitRegionStateProcedure) p)
|
||||
.anyMatch(p -> Bytes.equals(hri.getRegionName(), p.getRegion().getRegionName())));
|
||||
proc.resume();
|
||||
UTIL.waitFor(30000, () -> executor.isFinished(procId));
|
||||
// see whether the move region procedure can finish properly
|
||||
|
|
|
@ -26,7 +26,7 @@ import org.apache.hadoop.hbase.client.AsyncAdmin;
|
|||
import org.apache.hadoop.hbase.client.AsyncConnection;
|
||||
import org.apache.hadoop.hbase.client.ConnectionFactory;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||
|
@ -92,9 +92,10 @@ public class TestServerCrashProcedureStuck {
|
|||
rs.abort("For testing!");
|
||||
|
||||
UTIL.waitFor(30000,
|
||||
() -> executor.getProcedures().stream().filter(p -> p instanceof AssignProcedure)
|
||||
.map(p -> (AssignProcedure) p)
|
||||
.anyMatch(p -> Bytes.equals(hri.getRegionName(), p.getRegionInfo().getRegionName())));
|
||||
() -> executor.getProcedures().stream()
|
||||
.filter(p -> p instanceof TransitRegionStateProcedure)
|
||||
.map(p -> (TransitRegionStateProcedure) p)
|
||||
.anyMatch(p -> Bytes.equals(hri.getRegionName(), p.getRegion().getRegionName())));
|
||||
proc.resume();
|
||||
UTIL.waitFor(30000, () -> executor.isFinished(procId));
|
||||
// see whether the move region procedure can finish properly
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.hadoop.hbase.master;
|
|||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
|
@ -31,7 +30,7 @@ import org.apache.hadoop.hbase.client.ResultScanner;
|
|||
import org.apache.hadoop.hbase.client.Scan;
|
||||
import org.apache.hadoop.hbase.client.Table;
|
||||
import org.apache.hadoop.hbase.master.assignment.SplitTableRegionProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.UnassignProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
|
@ -103,8 +102,8 @@ public class TestSplitRegionWhileRSCrash {
|
|||
executor.submitProcedure(splitProcedure);
|
||||
LOG.info("SplitProcedure submitted");
|
||||
UTIL.waitFor(30000, () -> executor.getProcedures().stream()
|
||||
.filter(p -> p instanceof UnassignProcedure)
|
||||
.map(p -> (UnassignProcedure) p)
|
||||
.filter(p -> p instanceof TransitRegionStateProcedure)
|
||||
.map(p -> (TransitRegionStateProcedure) p)
|
||||
.anyMatch(p -> TABLE_NAME.equals(p.getTableName())));
|
||||
UTIL.getMiniHBaseCluster().killRegionServer(
|
||||
UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName());
|
||||
|
|
|
@ -319,7 +319,7 @@ public class MockMasterServices extends MockNoopMasterServices {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void updateRegionLocation(RegionStates.RegionStateNode regionNode) throws IOException {
|
||||
public void updateRegionLocation(RegionStateNode regionNode) throws IOException {
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@Category({ MasterTests.class, LargeTests.class })
|
||||
public class TestAMAssignWithRandExec extends TestAssignmentManagerBase {
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestAMAssignWithRandExec.class);
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestAMAssignWithRandExec.class);
|
||||
|
||||
@Test
|
||||
public void testAssignWithRandExec() throws Exception {
|
||||
TableName tableName = TableName.valueOf("testAssignWithRandExec");
|
||||
RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
|
||||
rsDispatcher.setMockRsExecutor(new RandRsExecutor());
|
||||
// Loop a bunch of times so we hit various combos of exceptions.
|
||||
for (int i = 0; i < 10; i++) {
|
||||
LOG.info("ROUND=" + i);
|
||||
TransitRegionStateProcedure proc = createAssignProcedure(hri);
|
||||
waitOnFuture(submitProcedure(proc));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.hadoop.hbase.DoNotRetryIOException;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@Category({ MasterTests.class, MediumTests.class })
|
||||
public class TestAMServerFailedOpen extends TestAssignmentManagerBase {
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestAMServerFailedOpen.class);
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestAMServerFailedOpen.class);
|
||||
|
||||
@Override
|
||||
protected int getAssignMaxAttempts() {
|
||||
// do not need to retry so many times as we will finally fail...
|
||||
return 10;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testServerNotYetRunning() throws Exception {
|
||||
testRetriesExhaustedFailure(TableName.valueOf(this.name.getMethodName()),
|
||||
new ServerNotYetRunningRsExecutor());
|
||||
}
|
||||
|
||||
private void testRetriesExhaustedFailure(final TableName tableName, final MockRSExecutor executor)
|
||||
throws Exception {
|
||||
RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
|
||||
// collect AM metrics before test
|
||||
collectAssignmentManagerMetrics();
|
||||
|
||||
// Test Assign operation failure
|
||||
rsDispatcher.setMockRsExecutor(executor);
|
||||
try {
|
||||
waitOnFuture(submitProcedure(createAssignProcedure(hri)));
|
||||
fail("unexpected assign completion");
|
||||
} catch (RetriesExhaustedException e) {
|
||||
// expected exception
|
||||
LOG.info("expected exception from assign operation: " + e.getMessage(), e);
|
||||
}
|
||||
|
||||
// Assign the region (without problems)
|
||||
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
|
||||
waitOnFuture(submitProcedure(createAssignProcedure(hri)));
|
||||
|
||||
// TODO: Currently unassign just keeps trying until it sees a server crash.
|
||||
// There is no count on unassign.
|
||||
/*
|
||||
* // Test Unassign operation failure rsDispatcher.setMockRsExecutor(executor);
|
||||
* waitOnFuture(submitProcedure(createUnassignProcedure(hri)));
|
||||
* assertEquals(assignSubmittedCount + 2, assignProcMetrics.getSubmittedCounter().getCount());
|
||||
* assertEquals(assignFailedCount + 1, assignProcMetrics.getFailedCounter().getCount());
|
||||
* assertEquals(unassignSubmittedCount + 1,
|
||||
* unassignProcMetrics.getSubmittedCounter().getCount()); // TODO: We supposed to have 1 failed
|
||||
* assign, 1 successful assign and a failed unassign // operation. But ProcV2 framework marks
|
||||
* aborted unassign operation as success. Fix it! assertEquals(unassignFailedCount,
|
||||
* unassignProcMetrics.getFailedCounter().getCount());
|
||||
*/
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIOExceptionOnAssignment() throws Exception {
|
||||
// collect AM metrics before test
|
||||
collectAssignmentManagerMetrics();
|
||||
|
||||
testFailedOpen(TableName.valueOf("testExceptionOnAssignment"),
|
||||
new FaultyRsExecutor(new IOException("test fault")));
|
||||
|
||||
assertEquals(assignSubmittedCount + 1, assignProcMetrics.getSubmittedCounter().getCount());
|
||||
assertEquals(assignFailedCount + 1, assignProcMetrics.getFailedCounter().getCount());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoNotRetryExceptionOnAssignment() throws Exception {
|
||||
// collect AM metrics before test
|
||||
collectAssignmentManagerMetrics();
|
||||
|
||||
testFailedOpen(TableName.valueOf("testDoNotRetryExceptionOnAssignment"),
|
||||
new FaultyRsExecutor(new DoNotRetryIOException("test do not retry fault")));
|
||||
|
||||
assertEquals(assignSubmittedCount + 1, assignProcMetrics.getSubmittedCounter().getCount());
|
||||
assertEquals(assignFailedCount + 1, assignProcMetrics.getFailedCounter().getCount());
|
||||
}
|
||||
|
||||
private void testFailedOpen(final TableName tableName, final MockRSExecutor executor)
|
||||
throws Exception {
|
||||
final RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
|
||||
// Test Assign operation failure
|
||||
rsDispatcher.setMockRsExecutor(executor);
|
||||
try {
|
||||
waitOnFuture(submitProcedure(createAssignProcedure(hri)));
|
||||
fail("unexpected assign completion");
|
||||
} catch (RetriesExhaustedException e) {
|
||||
// expected exception
|
||||
LOG.info("REGION STATE " + am.getRegionStates().getRegionStateNode(hri));
|
||||
LOG.info("expected exception from assign operation: " + e.getMessage(), e);
|
||||
assertEquals(true, am.getRegionStates().getRegionState(hri).isFailedOpen());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -18,151 +18,43 @@
|
|||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InterruptedIOException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.util.NavigableMap;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.concurrent.ConcurrentSkipListMap;
|
||||
import java.util.concurrent.ConcurrentSkipListSet;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.DoNotRetryIOException;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.NotServingRegionException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
|
||||
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
||||
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
|
||||
import org.apache.hadoop.hbase.master.MasterServices;
|
||||
import org.apache.hadoop.hbase.master.RegionState.State;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
|
||||
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
|
||||
import org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore;
|
||||
import org.apache.hadoop.hbase.procedure2.util.StringUtils;
|
||||
import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException;
|
||||
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
|
||||
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.FSUtils;
|
||||
import org.apache.hadoop.ipc.RemoteException;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.junit.rules.ExpectedException;
|
||||
import org.junit.rules.TestName;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.CloseRegionRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.CloseRegionResponse;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ExecuteProceduresRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ExecuteProceduresResponse;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.OpenRegionRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.OpenRegionRequest.RegionOpenInfo;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.OpenRegionResponse;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.OpenRegionResponse.RegionOpeningState;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
|
||||
|
||||
@Category({MasterTests.class, LargeTests.class})
|
||||
public class TestAssignmentManager {
|
||||
@Category({ MasterTests.class, LargeTests.class })
|
||||
public class TestAssignmentManager extends TestAssignmentManagerBase {
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestAssignmentManager.class);
|
||||
HBaseClassTestRule.forClass(TestAssignmentManager.class);
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestAssignmentManager.class);
|
||||
|
||||
@Rule public TestName name = new TestName();
|
||||
@Rule public final ExpectedException exception = ExpectedException.none();
|
||||
|
||||
private static final int PROC_NTHREADS = 64;
|
||||
private static final int NREGIONS = 1 * 1000;
|
||||
private static final int NSERVERS = Math.max(1, NREGIONS / 100);
|
||||
|
||||
private HBaseTestingUtility UTIL;
|
||||
private MockRSProcedureDispatcher rsDispatcher;
|
||||
private MockMasterServices master;
|
||||
private AssignmentManager am;
|
||||
private NavigableMap<ServerName, SortedSet<byte []>> regionsToRegionServers =
|
||||
new ConcurrentSkipListMap<ServerName, SortedSet<byte []>>();
|
||||
// Simple executor to run some simple tasks.
|
||||
private ScheduledExecutorService executor;
|
||||
|
||||
private ProcedureMetrics assignProcMetrics;
|
||||
private ProcedureMetrics unassignProcMetrics;
|
||||
|
||||
private long assignSubmittedCount = 0;
|
||||
private long assignFailedCount = 0;
|
||||
private long unassignSubmittedCount = 0;
|
||||
private long unassignFailedCount = 0;
|
||||
|
||||
private void setupConfiguration(Configuration conf) throws Exception {
|
||||
FSUtils.setRootDir(conf, UTIL.getDataTestDir());
|
||||
conf.setBoolean(WALProcedureStore.USE_HSYNC_CONF_KEY, false);
|
||||
conf.setInt(WALProcedureStore.SYNC_WAIT_MSEC_CONF_KEY, 10);
|
||||
conf.setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, PROC_NTHREADS);
|
||||
conf.setInt(RSProcedureDispatcher.RS_RPC_STARTUP_WAIT_TIME_CONF_KEY, 1000);
|
||||
conf.setInt(AssignmentManager.ASSIGN_MAX_ATTEMPTS, 100); // Have many so we succeed eventually.
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
UTIL = new HBaseTestingUtility();
|
||||
this.executor = Executors.newSingleThreadScheduledExecutor();
|
||||
setupConfiguration(UTIL.getConfiguration());
|
||||
master = new MockMasterServices(UTIL.getConfiguration(), this.regionsToRegionServers);
|
||||
rsDispatcher = new MockRSProcedureDispatcher(master);
|
||||
master.start(NSERVERS, rsDispatcher);
|
||||
am = master.getAssignmentManager();
|
||||
assignProcMetrics = am.getAssignmentManagerMetrics().getAssignProcMetrics();
|
||||
unassignProcMetrics = am.getAssignmentManagerMetrics().getUnassignProcMetrics();
|
||||
setUpMeta();
|
||||
}
|
||||
|
||||
private void setUpMeta() throws Exception {
|
||||
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
|
||||
am.assign(RegionInfoBuilder.FIRST_META_REGIONINFO);
|
||||
am.wakeMetaLoadedEvent();
|
||||
}
|
||||
|
||||
@After
|
||||
public void tearDown() throws Exception {
|
||||
master.stop("tearDown");
|
||||
this.executor.shutdownNow();
|
||||
}
|
||||
|
||||
@Test (expected=NullPointerException.class)
|
||||
@Test(expected = NullPointerException.class)
|
||||
public void testWaitServerReportEventWithNullServer() throws UnexpectedStateException {
|
||||
// Test what happens if we pass in null server. I'd expect it throws NPE.
|
||||
if (this.am.waitServerReportEvent(null, null)) throw new UnexpectedStateException();
|
||||
if (this.am.waitServerReportEvent(null, null)) {
|
||||
throw new UnexpectedStateException();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -173,61 +65,48 @@ public class TestAssignmentManager {
|
|||
testAssign(new GoodRsExecutor());
|
||||
|
||||
assertEquals(assignSubmittedCount + NREGIONS,
|
||||
assignProcMetrics.getSubmittedCounter().getCount());
|
||||
assignProcMetrics.getSubmittedCounter().getCount());
|
||||
assertEquals(assignFailedCount, assignProcMetrics.getFailedCounter().getCount());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAssignAndCrashBeforeResponse() throws Exception {
|
||||
final TableName tableName = TableName.valueOf("testAssignAndCrashBeforeResponse");
|
||||
final RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
TableName tableName = TableName.valueOf("testAssignAndCrashBeforeResponse");
|
||||
RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
rsDispatcher.setMockRsExecutor(new HangThenRSCrashExecutor());
|
||||
AssignProcedure proc = am.createAssignProcedure(hri);
|
||||
TransitRegionStateProcedure proc = createAssignProcedure(hri);
|
||||
waitOnFuture(submitProcedure(proc));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnassignAndCrashBeforeResponse() throws Exception {
|
||||
final TableName tableName = TableName.valueOf("testAssignAndCrashBeforeResponse");
|
||||
final RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
TableName tableName = TableName.valueOf("testAssignAndCrashBeforeResponse");
|
||||
RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
rsDispatcher.setMockRsExecutor(new HangOnCloseThenRSCrashExecutor());
|
||||
for (int i = 0; i < HangOnCloseThenRSCrashExecutor.TYPES_OF_FAILURE; i++) {
|
||||
AssignProcedure assign = am.createAssignProcedure(hri);
|
||||
TransitRegionStateProcedure assign = createAssignProcedure(hri);
|
||||
waitOnFuture(submitProcedure(assign));
|
||||
UnassignProcedure unassign = am.createUnassignProcedure(hri,
|
||||
am.getRegionStates().getRegionServerOfRegion(hri), false);
|
||||
TransitRegionStateProcedure unassign = createUnassignProcedure(hri);
|
||||
waitOnFuture(submitProcedure(unassign));
|
||||
}
|
||||
}
|
||||
|
||||
// Disabled for now. Since HBASE-18551, this mock is insufficient.
|
||||
@Ignore
|
||||
@Test
|
||||
public void testAssignWithRandExec() throws Exception {
|
||||
final TableName tableName = TableName.valueOf("testAssignWithRandExec");
|
||||
final RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
|
||||
rsDispatcher.setMockRsExecutor(new RandRsExecutor());
|
||||
// Loop a bunch of times so we hit various combos of exceptions.
|
||||
for (int i = 0; i < 10; i++) {
|
||||
LOG.info("ROUND=" + i);
|
||||
AssignProcedure proc = am.createAssignProcedure(hri);
|
||||
waitOnFuture(submitProcedure(proc));
|
||||
}
|
||||
}
|
||||
|
||||
@Ignore @Test // Disabled for now. Since HBASE-18551, this mock is insufficient.
|
||||
public void testSocketTimeout() throws Exception {
|
||||
final TableName tableName = TableName.valueOf(this.name.getMethodName());
|
||||
final RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
TableName tableName = TableName.valueOf(this.name.getMethodName());
|
||||
RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
|
||||
// collect AM metrics before test
|
||||
collectAssignmentManagerMetrics();
|
||||
|
||||
rsDispatcher.setMockRsExecutor(new SocketTimeoutRsExecutor(20, 3));
|
||||
waitOnFuture(submitProcedure(am.createAssignProcedure(hri)));
|
||||
waitOnFuture(submitProcedure(createAssignProcedure(hri)));
|
||||
|
||||
rsDispatcher.setMockRsExecutor(new SocketTimeoutRsExecutor(20, 1));
|
||||
// exception.expect(ServerCrashException.class);
|
||||
waitOnFuture(submitProcedure(am.createUnassignProcedure(hri, null, false)));
|
||||
waitOnFuture(submitProcedure(createUnassignProcedure(hri)));
|
||||
|
||||
assertEquals(assignSubmittedCount + 1, assignProcMetrics.getSubmittedCounter().getCount());
|
||||
assertEquals(assignFailedCount, assignProcMetrics.getFailedCounter().getCount());
|
||||
|
@ -235,113 +114,26 @@ public class TestAssignmentManager {
|
|||
assertEquals(unassignFailedCount + 1, unassignProcMetrics.getFailedCounter().getCount());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testServerNotYetRunning() throws Exception {
|
||||
testRetriesExhaustedFailure(TableName.valueOf(this.name.getMethodName()),
|
||||
new ServerNotYetRunningRsExecutor());
|
||||
}
|
||||
|
||||
private void testRetriesExhaustedFailure(final TableName tableName,
|
||||
final MockRSExecutor executor) throws Exception {
|
||||
final RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
|
||||
// collect AM metrics before test
|
||||
collectAssignmentManagerMetrics();
|
||||
|
||||
// Test Assign operation failure
|
||||
rsDispatcher.setMockRsExecutor(executor);
|
||||
try {
|
||||
waitOnFuture(submitProcedure(am.createAssignProcedure(hri)));
|
||||
fail("unexpected assign completion");
|
||||
} catch (RetriesExhaustedException e) {
|
||||
// expected exception
|
||||
LOG.info("expected exception from assign operation: " + e.getMessage(), e);
|
||||
}
|
||||
|
||||
// Assign the region (without problems)
|
||||
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
|
||||
waitOnFuture(submitProcedure(am.createAssignProcedure(hri)));
|
||||
|
||||
// TODO: Currently unassign just keeps trying until it sees a server crash.
|
||||
// There is no count on unassign.
|
||||
/*
|
||||
// Test Unassign operation failure
|
||||
rsDispatcher.setMockRsExecutor(executor);
|
||||
waitOnFuture(submitProcedure(am.createUnassignProcedure(hri, null, false)));
|
||||
|
||||
assertEquals(assignSubmittedCount + 2, assignProcMetrics.getSubmittedCounter().getCount());
|
||||
assertEquals(assignFailedCount + 1, assignProcMetrics.getFailedCounter().getCount());
|
||||
assertEquals(unassignSubmittedCount + 1, unassignProcMetrics.getSubmittedCounter().getCount());
|
||||
|
||||
// TODO: We supposed to have 1 failed assign, 1 successful assign and a failed unassign
|
||||
// operation. But ProcV2 framework marks aborted unassign operation as success. Fix it!
|
||||
assertEquals(unassignFailedCount, unassignProcMetrics.getFailedCounter().getCount());
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testIOExceptionOnAssignment() throws Exception {
|
||||
// collect AM metrics before test
|
||||
collectAssignmentManagerMetrics();
|
||||
|
||||
testFailedOpen(TableName.valueOf("testExceptionOnAssignment"),
|
||||
new FaultyRsExecutor(new IOException("test fault")));
|
||||
|
||||
assertEquals(assignSubmittedCount + 1, assignProcMetrics.getSubmittedCounter().getCount());
|
||||
assertEquals(assignFailedCount + 1, assignProcMetrics.getFailedCounter().getCount());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoNotRetryExceptionOnAssignment() throws Exception {
|
||||
// collect AM metrics before test
|
||||
collectAssignmentManagerMetrics();
|
||||
|
||||
testFailedOpen(TableName.valueOf("testDoNotRetryExceptionOnAssignment"),
|
||||
new FaultyRsExecutor(new DoNotRetryIOException("test do not retry fault")));
|
||||
|
||||
assertEquals(assignSubmittedCount + 1, assignProcMetrics.getSubmittedCounter().getCount());
|
||||
assertEquals(assignFailedCount + 1, assignProcMetrics.getFailedCounter().getCount());
|
||||
}
|
||||
|
||||
private void testFailedOpen(final TableName tableName,
|
||||
final MockRSExecutor executor) throws Exception {
|
||||
final RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
|
||||
// Test Assign operation failure
|
||||
rsDispatcher.setMockRsExecutor(executor);
|
||||
try {
|
||||
waitOnFuture(submitProcedure(am.createAssignProcedure(hri)));
|
||||
fail("unexpected assign completion");
|
||||
} catch (RetriesExhaustedException e) {
|
||||
// expected exception
|
||||
LOG.info("REGION STATE " + am.getRegionStates().getRegionStateNode(hri));
|
||||
LOG.info("expected exception from assign operation: " + e.getMessage(), e);
|
||||
assertEquals(true, am.getRegionStates().getRegionState(hri).isFailedOpen());
|
||||
}
|
||||
}
|
||||
|
||||
private void testAssign(final MockRSExecutor executor) throws Exception {
|
||||
testAssign(executor, NREGIONS);
|
||||
}
|
||||
|
||||
private void testAssign(final MockRSExecutor executor, final int nregions) throws Exception {
|
||||
private void testAssign(MockRSExecutor executor, int nRegions) throws Exception {
|
||||
rsDispatcher.setMockRsExecutor(executor);
|
||||
|
||||
AssignProcedure[] assignments = new AssignProcedure[nregions];
|
||||
TransitRegionStateProcedure[] assignments = new TransitRegionStateProcedure[nRegions];
|
||||
|
||||
long st = System.currentTimeMillis();
|
||||
bulkSubmit(assignments);
|
||||
|
||||
for (int i = 0; i < assignments.length; ++i) {
|
||||
ProcedureTestingUtility.waitProcedure(
|
||||
master.getMasterProcedureExecutor(), assignments[i]);
|
||||
ProcedureTestingUtility.waitProcedure(master.getMasterProcedureExecutor(), assignments[i]);
|
||||
assertTrue(assignments[i].toString(), assignments[i].isSuccess());
|
||||
}
|
||||
long et = System.currentTimeMillis();
|
||||
float sec = ((et - st) / 1000.0f);
|
||||
LOG.info(String.format("[T] Assigning %dprocs in %s (%.2fproc/sec)",
|
||||
assignments.length, StringUtils.humanTimeDiff(et - st), assignments.length / sec));
|
||||
LOG.info(String.format("[T] Assigning %dprocs in %s (%.2fproc/sec)", assignments.length,
|
||||
StringUtils.humanTimeDiff(et - st), assignments.length / sec));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -354,7 +146,7 @@ public class TestAssignmentManager {
|
|||
|
||||
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
|
||||
|
||||
final Future<byte[]> futureA = submitProcedure(am.createAssignProcedure(hri));
|
||||
Future<byte[]> futureA = submitProcedure(createAssignProcedure(hri));
|
||||
|
||||
// wait first assign
|
||||
waitOnFuture(futureA);
|
||||
|
@ -362,7 +154,7 @@ public class TestAssignmentManager {
|
|||
// Second should be a noop. We should recognize region is already OPEN internally
|
||||
// and skip out doing nothing.
|
||||
// wait second assign
|
||||
final Future<byte[]> futureB = submitProcedure(am.createAssignProcedure(hri));
|
||||
Future<byte[]> futureB = submitProcedure(createAssignProcedure(hri));
|
||||
waitOnFuture(futureB);
|
||||
am.getRegionStates().isRegionInState(hri, State.OPEN);
|
||||
// TODO: What else can we do to ensure just a noop.
|
||||
|
@ -371,7 +163,6 @@ public class TestAssignmentManager {
|
|||
// better way?
|
||||
assertEquals(assignSubmittedCount + 2, assignProcMetrics.getSubmittedCounter().getCount());
|
||||
assertEquals(assignFailedCount, assignProcMetrics.getFailedCounter().getCount());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -385,18 +176,16 @@ public class TestAssignmentManager {
|
|||
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
|
||||
|
||||
// assign the region first
|
||||
waitOnFuture(submitProcedure(am.createAssignProcedure(hri)));
|
||||
waitOnFuture(submitProcedure(createAssignProcedure(hri)));
|
||||
|
||||
final Future<byte[]> futureA = submitProcedure(am.createUnassignProcedure(hri, null, false));
|
||||
final Future<byte[]> futureA = submitProcedure(createUnassignProcedure(hri));
|
||||
|
||||
// Wait first unassign.
|
||||
waitOnFuture(futureA);
|
||||
am.getRegionStates().isRegionInState(hri, State.CLOSED);
|
||||
// Second should be a noop. We should recognize region is already CLOSED internally
|
||||
// and skip out doing nothing.
|
||||
final Future<byte[]> futureB =
|
||||
submitProcedure(am.createUnassignProcedure(hri,
|
||||
ServerName.valueOf("example.org,1234,1"), false));
|
||||
final Future<byte[]> futureB = submitProcedure(createUnassignProcedure(hri));
|
||||
waitOnFuture(futureB);
|
||||
// Ensure we are still CLOSED.
|
||||
am.getRegionStates().isRegionInState(hri, State.CLOSED);
|
||||
|
@ -411,17 +200,17 @@ public class TestAssignmentManager {
|
|||
}
|
||||
|
||||
/**
|
||||
* It is possible that when AM send assign meta request to a RS successfully,
|
||||
* but RS can not send back any response, which cause master startup hangs forever
|
||||
* It is possible that when AM send assign meta request to a RS successfully, but RS can not send
|
||||
* back any response, which cause master startup hangs forever
|
||||
*/
|
||||
@Test
|
||||
public void testAssignMetaAndCrashBeforeResponse() throws Exception {
|
||||
tearDown();
|
||||
// See setUp(), start HBase until set up meta
|
||||
UTIL = new HBaseTestingUtility();
|
||||
util = new HBaseTestingUtility();
|
||||
this.executor = Executors.newSingleThreadScheduledExecutor();
|
||||
setupConfiguration(UTIL.getConfiguration());
|
||||
master = new MockMasterServices(UTIL.getConfiguration(), this.regionsToRegionServers);
|
||||
setupConfiguration(util.getConfiguration());
|
||||
master = new MockMasterServices(util.getConfiguration(), this.regionsToRegionServers);
|
||||
rsDispatcher = new MockRSProcedureDispatcher(master);
|
||||
master.start(NSERVERS, rsDispatcher);
|
||||
am = master.getAssignmentManager();
|
||||
|
@ -434,418 +223,4 @@ public class TestAssignmentManager {
|
|||
// set it back as default, see setUpMeta()
|
||||
am.wakeMetaLoadedEvent();
|
||||
}
|
||||
|
||||
private Future<byte[]> submitProcedure(final Procedure<MasterProcedureEnv> proc) {
|
||||
return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
|
||||
}
|
||||
|
||||
private byte[] waitOnFuture(final Future<byte[]> future) throws Exception {
|
||||
try {
|
||||
return future.get(5, TimeUnit.SECONDS);
|
||||
} catch (ExecutionException e) {
|
||||
LOG.info("ExecutionException", e);
|
||||
Exception ee = (Exception)e.getCause();
|
||||
if (ee instanceof InterruptedIOException) {
|
||||
for (Procedure<?> p: this.master.getMasterProcedureExecutor().getProcedures()) {
|
||||
LOG.info(p.toStringDetails());
|
||||
}
|
||||
}
|
||||
throw (Exception)e.getCause();
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================================
|
||||
// Helpers
|
||||
// ============================================================================================
|
||||
private void bulkSubmit(final AssignProcedure[] procs) throws Exception {
|
||||
final Thread[] threads = new Thread[PROC_NTHREADS];
|
||||
for (int i = 0; i < threads.length; ++i) {
|
||||
final int threadId = i;
|
||||
threads[i] = new Thread() {
|
||||
@Override
|
||||
public void run() {
|
||||
TableName tableName = TableName.valueOf("table-" + threadId);
|
||||
int n = (procs.length / threads.length);
|
||||
int start = threadId * n;
|
||||
int stop = start + n;
|
||||
for (int j = start; j < stop; ++j) {
|
||||
procs[j] = createAndSubmitAssign(tableName, j);
|
||||
}
|
||||
}
|
||||
};
|
||||
threads[i].start();
|
||||
}
|
||||
for (int i = 0; i < threads.length; ++i) {
|
||||
threads[i].join();
|
||||
}
|
||||
for (int i = procs.length - 1; i >= 0 && procs[i] == null; --i) {
|
||||
procs[i] = createAndSubmitAssign(TableName.valueOf("table-sync"), i);
|
||||
}
|
||||
}
|
||||
|
||||
private AssignProcedure createAndSubmitAssign(TableName tableName, int regionId) {
|
||||
RegionInfo hri = createRegionInfo(tableName, regionId);
|
||||
AssignProcedure proc = am.createAssignProcedure(hri);
|
||||
master.getMasterProcedureExecutor().submitProcedure(proc);
|
||||
return proc;
|
||||
}
|
||||
|
||||
private RegionInfo createRegionInfo(final TableName tableName, final long regionId) {
|
||||
return RegionInfoBuilder.newBuilder(tableName)
|
||||
.setStartKey(Bytes.toBytes(regionId))
|
||||
.setEndKey(Bytes.toBytes(regionId + 1))
|
||||
.setSplit(false)
|
||||
.setRegionId(0)
|
||||
.build();
|
||||
}
|
||||
|
||||
private void sendTransitionReport(final ServerName serverName,
|
||||
final org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.RegionInfo regionInfo,
|
||||
final TransitionCode state) throws IOException {
|
||||
ReportRegionStateTransitionRequest.Builder req =
|
||||
ReportRegionStateTransitionRequest.newBuilder();
|
||||
req.setServer(ProtobufUtil.toServerName(serverName));
|
||||
req.addTransition(RegionStateTransition.newBuilder()
|
||||
.addRegionInfo(regionInfo)
|
||||
.setTransitionCode(state)
|
||||
.setOpenSeqNum(1)
|
||||
.build());
|
||||
am.reportRegionStateTransition(req.build());
|
||||
}
|
||||
|
||||
private void doCrash(final ServerName serverName) {
|
||||
this.am.submitServerCrash(serverName, false/*No WALs here*/);
|
||||
}
|
||||
|
||||
private void doRestart(final ServerName serverName) {
|
||||
try {
|
||||
this.master.restartRegionServer(serverName);
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Can not restart RS with new startcode");
|
||||
}
|
||||
}
|
||||
|
||||
private class NoopRsExecutor implements MockRSExecutor {
|
||||
@Override
|
||||
public ExecuteProceduresResponse sendRequest(ServerName server,
|
||||
ExecuteProceduresRequest request) throws IOException {
|
||||
if (request.getOpenRegionCount() > 0) {
|
||||
for (OpenRegionRequest req : request.getOpenRegionList()) {
|
||||
for (RegionOpenInfo openReq : req.getOpenInfoList()) {
|
||||
execOpenRegion(server, openReq);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (request.getCloseRegionCount() > 0) {
|
||||
for (CloseRegionRequest req : request.getCloseRegionList()) {
|
||||
execCloseRegion(server, req.getRegion().getValue().toByteArray());
|
||||
}
|
||||
}
|
||||
return ExecuteProceduresResponse.newBuilder().build();
|
||||
}
|
||||
|
||||
protected RegionOpeningState execOpenRegion(ServerName server, RegionOpenInfo regionInfo)
|
||||
throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
protected CloseRegionResponse execCloseRegion(ServerName server, byte[] regionName)
|
||||
throws IOException {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private class GoodRsExecutor extends NoopRsExecutor {
|
||||
@Override
|
||||
protected RegionOpeningState execOpenRegion(ServerName server, RegionOpenInfo openReq)
|
||||
throws IOException {
|
||||
sendTransitionReport(server, openReq.getRegion(), TransitionCode.OPENED);
|
||||
// Concurrency?
|
||||
// Now update the state of our cluster in regionsToRegionServers.
|
||||
SortedSet<byte []> regions = regionsToRegionServers.get(server);
|
||||
if (regions == null) {
|
||||
regions = new ConcurrentSkipListSet<byte[]>(Bytes.BYTES_COMPARATOR);
|
||||
regionsToRegionServers.put(server, regions);
|
||||
}
|
||||
RegionInfo hri = ProtobufUtil.toRegionInfo(openReq.getRegion());
|
||||
if (regions.contains(hri.getRegionName())) {
|
||||
throw new UnsupportedOperationException(hri.getRegionNameAsString());
|
||||
}
|
||||
regions.add(hri.getRegionName());
|
||||
return RegionOpeningState.OPENED;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected CloseRegionResponse execCloseRegion(ServerName server, byte[] regionName)
|
||||
throws IOException {
|
||||
RegionInfo hri = am.getRegionInfo(regionName);
|
||||
sendTransitionReport(server, ProtobufUtil.toRegionInfo(hri), TransitionCode.CLOSED);
|
||||
return CloseRegionResponse.newBuilder().setClosed(true).build();
|
||||
}
|
||||
}
|
||||
|
||||
private static class ServerNotYetRunningRsExecutor implements MockRSExecutor {
|
||||
@Override
|
||||
public ExecuteProceduresResponse sendRequest(ServerName server, ExecuteProceduresRequest req)
|
||||
throws IOException {
|
||||
throw new ServerNotRunningYetException("wait on server startup");
|
||||
}
|
||||
}
|
||||
|
||||
private static class FaultyRsExecutor implements MockRSExecutor {
|
||||
private final IOException exception;
|
||||
|
||||
public FaultyRsExecutor(final IOException exception) {
|
||||
this.exception = exception;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExecuteProceduresResponse sendRequest(ServerName server, ExecuteProceduresRequest req)
|
||||
throws IOException {
|
||||
throw exception;
|
||||
}
|
||||
}
|
||||
|
||||
private class SocketTimeoutRsExecutor extends GoodRsExecutor {
|
||||
private final int maxSocketTimeoutRetries;
|
||||
private final int maxServerRetries;
|
||||
|
||||
private ServerName lastServer;
|
||||
private int sockTimeoutRetries;
|
||||
private int serverRetries;
|
||||
|
||||
public SocketTimeoutRsExecutor(int maxSocketTimeoutRetries, int maxServerRetries) {
|
||||
this.maxServerRetries = maxServerRetries;
|
||||
this.maxSocketTimeoutRetries = maxSocketTimeoutRetries;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExecuteProceduresResponse sendRequest(ServerName server, ExecuteProceduresRequest req)
|
||||
throws IOException {
|
||||
// SocketTimeoutException should be a temporary problem
|
||||
// unless the server will be declared dead.
|
||||
if (sockTimeoutRetries++ < maxSocketTimeoutRetries) {
|
||||
if (sockTimeoutRetries == 1) assertNotEquals(lastServer, server);
|
||||
lastServer = server;
|
||||
LOG.debug("Socket timeout for server=" + server + " retries=" + sockTimeoutRetries);
|
||||
throw new SocketTimeoutException("simulate socket timeout");
|
||||
} else if (serverRetries++ < maxServerRetries) {
|
||||
LOG.info("Mark server=" + server + " as dead. serverRetries=" + serverRetries);
|
||||
master.getServerManager().moveFromOnlineToDeadServers(server);
|
||||
sockTimeoutRetries = 0;
|
||||
throw new SocketTimeoutException("simulate socket timeout");
|
||||
} else {
|
||||
return super.sendRequest(server, req);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes open request and then returns nothing so acts like a RS that went zombie.
|
||||
* No response (so proc is stuck/suspended on the Master and won't wake up.). We
|
||||
* then send in a crash for this server after a few seconds; crash is supposed to
|
||||
* take care of the suspended procedures.
|
||||
*/
|
||||
private class HangThenRSCrashExecutor extends GoodRsExecutor {
|
||||
private int invocations;
|
||||
|
||||
@Override
|
||||
protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
|
||||
throws IOException {
|
||||
if (this.invocations++ > 0) {
|
||||
// Return w/o problem the second time through here.
|
||||
return super.execOpenRegion(server, openReq);
|
||||
}
|
||||
// The procedure on master will just hang forever because nothing comes back
|
||||
// from the RS in this case.
|
||||
LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
|
||||
executor.schedule(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
LOG.info("Sending in CRASH of " + server);
|
||||
doCrash(server);
|
||||
}
|
||||
}, 1, TimeUnit.SECONDS);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes open request and then returns nothing so acts like a RS that went zombie.
|
||||
* No response (so proc is stuck/suspended on the Master and won't wake up.).
|
||||
* Different with HangThenRSCrashExecutor, HangThenRSCrashExecutor will create
|
||||
* ServerCrashProcedure to handle the server crash. However, this HangThenRSRestartExecutor
|
||||
* will restart RS directly, situation for RS crashed when SCP is not enabled.
|
||||
*/
|
||||
private class HangThenRSRestartExecutor extends GoodRsExecutor {
|
||||
private int invocations;
|
||||
|
||||
@Override
|
||||
protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
|
||||
throws IOException {
|
||||
if (this.invocations++ > 0) {
|
||||
// Return w/o problem the second time through here.
|
||||
return super.execOpenRegion(server, openReq);
|
||||
}
|
||||
// The procedure on master will just hang forever because nothing comes back
|
||||
// from the RS in this case.
|
||||
LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
|
||||
executor.schedule(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
LOG.info("Restarting RS of " + server);
|
||||
doRestart(server);
|
||||
}
|
||||
}, 1, TimeUnit.SECONDS);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
|
||||
public static final int TYPES_OF_FAILURE = 6;
|
||||
private int invocations;
|
||||
|
||||
@Override
|
||||
protected CloseRegionResponse execCloseRegion(ServerName server, byte[] regionName)
|
||||
throws IOException {
|
||||
switch (this.invocations++) {
|
||||
case 0: throw new NotServingRegionException("Fake");
|
||||
case 1:
|
||||
executor.schedule(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
LOG.info("Sending in CRASH of " + server);
|
||||
doCrash(server);
|
||||
}
|
||||
}, 1, TimeUnit.SECONDS);
|
||||
throw new RegionServerAbortedException("Fake!");
|
||||
case 2:
|
||||
executor.schedule(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
LOG.info("Sending in CRASH of " + server);
|
||||
doCrash(server);
|
||||
}
|
||||
}, 1, TimeUnit.SECONDS);
|
||||
throw new RegionServerStoppedException("Fake!");
|
||||
case 3: throw new ServerNotRunningYetException("Fake!");
|
||||
case 4:
|
||||
LOG.info("Returned null from serverName={}; means STUCK...TODO timeout", server);
|
||||
executor.schedule(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
LOG.info("Sending in CRASH of " + server);
|
||||
doCrash(server);
|
||||
}
|
||||
}, 1, TimeUnit.SECONDS);
|
||||
return null;
|
||||
default:
|
||||
return super.execCloseRegion(server, regionName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class RandRsExecutor extends NoopRsExecutor {
|
||||
private final Random rand = new Random();
|
||||
|
||||
@Override
|
||||
public ExecuteProceduresResponse sendRequest(ServerName server, ExecuteProceduresRequest req)
|
||||
throws IOException {
|
||||
switch (rand.nextInt(5)) {
|
||||
case 0: throw new ServerNotRunningYetException("wait on server startup");
|
||||
case 1: throw new SocketTimeoutException("simulate socket timeout");
|
||||
case 2: throw new RemoteException("java.io.IOException", "unexpected exception");
|
||||
default:
|
||||
// fall out
|
||||
}
|
||||
return super.sendRequest(server, req);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
|
||||
throws IOException {
|
||||
switch (rand.nextInt(6)) {
|
||||
case 0:
|
||||
LOG.info("Return OPENED response");
|
||||
sendTransitionReport(server, openReq.getRegion(), TransitionCode.OPENED);
|
||||
return OpenRegionResponse.RegionOpeningState.OPENED;
|
||||
case 1:
|
||||
LOG.info("Return transition report that OPENED/ALREADY_OPENED response");
|
||||
sendTransitionReport(server, openReq.getRegion(), TransitionCode.OPENED);
|
||||
return OpenRegionResponse.RegionOpeningState.ALREADY_OPENED;
|
||||
case 2:
|
||||
LOG.info("Return transition report that FAILED_OPEN/FAILED_OPENING response");
|
||||
sendTransitionReport(server, openReq.getRegion(), TransitionCode.FAILED_OPEN);
|
||||
return OpenRegionResponse.RegionOpeningState.FAILED_OPENING;
|
||||
default:
|
||||
// fall out
|
||||
}
|
||||
// The procedure on master will just hang forever because nothing comes back
|
||||
// from the RS in this case.
|
||||
LOG.info("Return null as response; means proc stuck so we send in a crash report after a few seconds...");
|
||||
executor.schedule(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
LOG.info("Delayed CRASHING of " + server);
|
||||
doCrash(server);
|
||||
}
|
||||
}, 5, TimeUnit.SECONDS);
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected CloseRegionResponse execCloseRegion(ServerName server, byte[] regionName)
|
||||
throws IOException {
|
||||
CloseRegionResponse.Builder resp = CloseRegionResponse.newBuilder();
|
||||
boolean closed = rand.nextBoolean();
|
||||
if (closed) {
|
||||
RegionInfo hri = am.getRegionInfo(regionName);
|
||||
sendTransitionReport(server, ProtobufUtil.toRegionInfo(hri), TransitionCode.CLOSED);
|
||||
}
|
||||
resp.setClosed(closed);
|
||||
return resp.build();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Hook that lets individual tests decide how a fake "region server" answers an
 * ExecuteProcedures RPC issued by the master.
 */
private interface MockRSExecutor {
  ExecuteProceduresResponse sendRequest(ServerName server, ExecuteProceduresRequest req)
      throws IOException;
}
|
||||
|
||||
/**
 * RSProcedureDispatcher whose outbound RPC is routed through the currently configured
 * {@link MockRSExecutor} instead of a real region server.
 */
private class MockRSProcedureDispatcher extends RSProcedureDispatcher {
  // Pluggable fake region-server behavior; swapped per test via setMockRsExecutor().
  private MockRSExecutor mockRsExec;

  public MockRSProcedureDispatcher(final MasterServices master) {
    super(master);
  }

  public void setMockRsExecutor(final MockRSExecutor mockRsExec) {
    this.mockRsExec = mockRsExec;
  }

  @Override
  protected void remoteDispatch(ServerName serverName, Set<RemoteProcedure> remoteProcedures) {
    submitTask(new MockRemoteCall(serverName, remoteProcedures));
  }

  /** Remote call whose sendRequest is redirected to the mock executor. */
  private class MockRemoteCall extends ExecuteProceduresRemoteCall {
    public MockRemoteCall(final ServerName serverName,
        final Set<RemoteProcedure> operations) {
      super(serverName, operations);
    }

    @Override
    protected ExecuteProceduresResponse sendRequest(final ServerName serverName,
        final ExecuteProceduresRequest request) throws IOException {
      return mockRsExec.sendRequest(serverName, request);
    }
  }
}
|
||||
|
||||
private void collectAssignmentManagerMetrics() {
|
||||
assignSubmittedCount = assignProcMetrics.getSubmittedCounter().getCount();
|
||||
assignFailedCount = assignProcMetrics.getFailedCounter().getCount();
|
||||
unassignSubmittedCount = unassignProcMetrics.getSubmittedCounter().getCount();
|
||||
unassignFailedCount = unassignProcMetrics.getFailedCounter().getCount();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,586 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertNotEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InterruptedIOException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.util.Arrays;
|
||||
import java.util.NavigableMap;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.concurrent.ConcurrentSkipListMap;
|
||||
import java.util.concurrent.ConcurrentSkipListSet;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.NotServingRegionException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
|
||||
import org.apache.hadoop.hbase.master.MasterServices;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
|
||||
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
|
||||
import org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore;
|
||||
import org.apache.hadoop.hbase.regionserver.RegionServerAbortedException;
|
||||
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.FSUtils;
|
||||
import org.apache.hadoop.ipc.RemoteException;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Rule;
|
||||
import org.junit.rules.ExpectedException;
|
||||
import org.junit.rules.TestName;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.CloseRegionRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.CloseRegionResponse;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ExecuteProceduresRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ExecuteProceduresResponse;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.OpenRegionRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.OpenRegionRequest.RegionOpenInfo;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.OpenRegionResponse;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.OpenRegionResponse.RegionOpeningState;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
|
||||
|
||||
/**
|
||||
* Base class for AM test.
|
||||
*/
|
||||
public abstract class TestAssignmentManagerBase {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestAssignmentManagerBase.class);
|
||||
|
||||
@Rule
|
||||
public TestName name = new TestName();
|
||||
@Rule
|
||||
public final ExpectedException exception = ExpectedException.none();
|
||||
|
||||
protected static final int PROC_NTHREADS = 64;
|
||||
protected static final int NREGIONS = 1 * 1000;
|
||||
protected static final int NSERVERS = Math.max(1, NREGIONS / 100);
|
||||
|
||||
protected HBaseTestingUtility util;
|
||||
protected MockRSProcedureDispatcher rsDispatcher;
|
||||
protected MockMasterServices master;
|
||||
protected AssignmentManager am;
|
||||
protected NavigableMap<ServerName, SortedSet<byte[]>> regionsToRegionServers =
|
||||
new ConcurrentSkipListMap<ServerName, SortedSet<byte[]>>();
|
||||
// Simple executor to run some simple tasks.
|
||||
protected ScheduledExecutorService executor;
|
||||
|
||||
protected ProcedureMetrics assignProcMetrics;
|
||||
protected ProcedureMetrics unassignProcMetrics;
|
||||
|
||||
protected long assignSubmittedCount = 0;
|
||||
protected long assignFailedCount = 0;
|
||||
protected long unassignSubmittedCount = 0;
|
||||
protected long unassignFailedCount = 0;
|
||||
|
||||
/**
 * Maximum number of attempts for an assign before it gives up. Subclasses override this to
 * make assignment fail fast; the default is large so assigns eventually succeed despite the
 * faulty executors used in these tests.
 */
protected int getAssignMaxAttempts() {
  // Have many so we succeed eventually.
  return 100;
}
|
||||
|
||||
/**
 * Tunes the configuration for fast local procedure execution: root dir under the test data
 * dir, no hsync for the procedure WAL, short WAL sync wait, many procedure threads, short
 * RS RPC startup wait, and the (subclass-overridable) max assign attempts.
 */
protected void setupConfiguration(Configuration conf) throws Exception {
  FSUtils.setRootDir(conf, util.getDataTestDir());
  conf.setBoolean(WALProcedureStore.USE_HSYNC_CONF_KEY, false);
  conf.setInt(WALProcedureStore.SYNC_WAIT_MSEC_CONF_KEY, 10);
  conf.setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, PROC_NTHREADS);
  conf.setInt(RSProcedureDispatcher.RS_RPC_STARTUP_WAIT_TIME_CONF_KEY, 1000);
  conf.setInt(AssignmentManager.ASSIGN_MAX_ATTEMPTS, getAssignMaxAttempts());
}
|
||||
|
||||
/**
 * Builds the mock master with the mock RS dispatcher, starts NSERVERS fake region servers,
 * grabs the AssignmentManager and its assign/unassign metrics, then brings meta online.
 * Order matters: configuration must be set before the master is constructed, and meta is
 * set up last.
 */
@Before
public void setUp() throws Exception {
  util = new HBaseTestingUtility();
  // Single-threaded scheduler used to deliver delayed crash/restart events.
  this.executor = Executors.newSingleThreadScheduledExecutor();
  setupConfiguration(util.getConfiguration());
  master = new MockMasterServices(util.getConfiguration(), this.regionsToRegionServers);
  rsDispatcher = new MockRSProcedureDispatcher(master);
  master.start(NSERVERS, rsDispatcher);
  am = master.getAssignmentManager();
  assignProcMetrics = am.getAssignmentManagerMetrics().getAssignProcMetrics();
  unassignProcMetrics = am.getAssignmentManagerMetrics().getUnassignProcMetrics();
  setUpMeta();
}
|
||||
|
||||
/**
 * Assigns the first meta region through a well-behaved RS executor and signals that meta
 * has been loaded so procedures waiting on the meta-loaded event can proceed.
 */
protected void setUpMeta() throws Exception {
  rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
  am.assign(RegionInfoBuilder.FIRST_META_REGIONINFO);
  am.wakeMetaLoadedEvent();
}
|
||||
|
||||
/** Stops the mock master and kills any pending delayed crash/restart tasks. */
@After
public void tearDown() throws Exception {
  master.stop("tearDown");
  // Cancel delayed tasks scheduled by the hang-then-crash/restart executors.
  this.executor.shutdownNow();
}
|
||||
|
||||
protected class NoopRsExecutor implements MockRSExecutor {
|
||||
@Override
|
||||
public ExecuteProceduresResponse sendRequest(ServerName server,
|
||||
ExecuteProceduresRequest request) throws IOException {
|
||||
if (request.getOpenRegionCount() > 0) {
|
||||
for (OpenRegionRequest req : request.getOpenRegionList()) {
|
||||
for (RegionOpenInfo openReq : req.getOpenInfoList()) {
|
||||
execOpenRegion(server, openReq);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (request.getCloseRegionCount() > 0) {
|
||||
for (CloseRegionRequest req : request.getCloseRegionList()) {
|
||||
execCloseRegion(server, req.getRegion().getValue().toByteArray());
|
||||
}
|
||||
}
|
||||
return ExecuteProceduresResponse.newBuilder().build();
|
||||
}
|
||||
|
||||
protected RegionOpeningState execOpenRegion(ServerName server, RegionOpenInfo regionInfo)
|
||||
throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
protected CloseRegionResponse execCloseRegion(ServerName server, byte[] regionName)
|
||||
throws IOException {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/** Submits the procedure to the master's executor and returns a future for its result. */
protected Future<byte[]> submitProcedure(final Procedure<MasterProcedureEnv> proc) {
  return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
}
|
||||
|
||||
protected byte[] waitOnFuture(final Future<byte[]> future) throws Exception {
|
||||
try {
|
||||
return future.get(5, TimeUnit.SECONDS);
|
||||
} catch (ExecutionException e) {
|
||||
LOG.info("ExecutionException", e);
|
||||
Exception ee = (Exception) e.getCause();
|
||||
if (ee instanceof InterruptedIOException) {
|
||||
for (Procedure<?> p : this.master.getMasterProcedureExecutor().getProcedures()) {
|
||||
LOG.info(p.toStringDetails());
|
||||
}
|
||||
}
|
||||
throw (Exception) e.getCause();
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================================
|
||||
// Helpers
|
||||
// ============================================================================================
|
||||
/**
 * Creates and submits assign procedures for {@code procs.length} regions using
 * PROC_NTHREADS submitter threads. Thread t fills procs[t*n, t*n + n) where
 * n = procs.length / PROC_NTHREADS; after all threads join, the trailing slots left null by
 * the integer division are filled synchronously on the calling thread.
 */
protected void bulkSubmit(TransitRegionStateProcedure[] procs) throws Exception {
  Thread[] threads = new Thread[PROC_NTHREADS];
  for (int i = 0; i < threads.length; ++i) {
    final int threadId = i;
    threads[i] = new Thread() {
      @Override
      public void run() {
        // Each thread submits regions for its own table so names do not collide.
        TableName tableName = TableName.valueOf("table-" + threadId);
        int n = (procs.length / threads.length);
        int start = threadId * n;
        int stop = start + n;
        for (int j = start; j < stop; ++j) {
          procs[j] = createAndSubmitAssign(tableName, j);
        }
      }
    };
    threads[i].start();
  }
  for (int i = 0; i < threads.length; ++i) {
    threads[i].join();
  }
  // Handle the remainder (procs.length % PROC_NTHREADS trailing nulls).
  for (int i = procs.length - 1; i >= 0 && procs[i] == null; --i) {
    procs[i] = createAndSubmitAssign(TableName.valueOf("table-sync"), i);
  }
}
|
||||
|
||||
protected TransitRegionStateProcedure createAndSubmitAssign(TableName tableName, int regionId) {
|
||||
RegionInfo hri = createRegionInfo(tableName, regionId);
|
||||
TransitRegionStateProcedure proc = createAssignProcedure(hri);
|
||||
master.getMasterProcedureExecutor().submitProcedure(proc);
|
||||
return proc;
|
||||
}
|
||||
|
||||
protected RegionInfo createRegionInfo(final TableName tableName, final long regionId) {
|
||||
return RegionInfoBuilder.newBuilder(tableName).setStartKey(Bytes.toBytes(regionId))
|
||||
.setEndKey(Bytes.toBytes(regionId + 1)).setSplit(false).setRegionId(0).build();
|
||||
}
|
||||
|
||||
/** Creates (but does not submit) an assign procedure for the given region. */
protected TransitRegionStateProcedure createAssignProcedure(RegionInfo hri) {
  return am.createAssignProcedures(Arrays.asList(hri))[0];
}
|
||||
|
||||
/**
 * Creates (but does not submit) an unassign procedure for the given region and attaches it
 * to the region's state node while holding the node's lock. Asserts the region is not
 * already in transition.
 */
protected TransitRegionStateProcedure createUnassignProcedure(RegionInfo hri) {
  RegionStateNode regionNode = am.getRegionStates().getRegionStateNode(hri);
  TransitRegionStateProcedure proc;
  // Attach the procedure under the node lock so no concurrent transition can sneak in.
  regionNode.lock();
  try {
    assertFalse(regionNode.isInTransition());
    proc = TransitRegionStateProcedure
      .unassign(master.getMasterProcedureExecutor().getEnvironment(), hri);
    regionNode.setProcedure(proc);
  } finally {
    regionNode.unlock();
  }
  return proc;
}
|
||||
|
||||
protected void sendTransitionReport(final ServerName serverName,
|
||||
final org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.RegionInfo regionInfo,
|
||||
final TransitionCode state) throws IOException {
|
||||
ReportRegionStateTransitionRequest.Builder req =
|
||||
ReportRegionStateTransitionRequest.newBuilder();
|
||||
req.setServer(ProtobufUtil.toServerName(serverName));
|
||||
req.addTransition(RegionStateTransition.newBuilder().addRegionInfo(regionInfo)
|
||||
.setTransitionCode(state).setOpenSeqNum(1).build());
|
||||
am.reportRegionStateTransition(req.build());
|
||||
}
|
||||
|
||||
/** Reports the given server as crashed to the AssignmentManager (with no WALs to split). */
protected void doCrash(final ServerName serverName) {
  this.am.submitServerCrash(serverName, false/* No WALs here */);
}
|
||||
|
||||
protected void doRestart(final ServerName serverName) {
|
||||
try {
|
||||
this.master.restartRegionServer(serverName);
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Can not restart RS with new startcode");
|
||||
}
|
||||
}
|
||||
|
||||
protected class GoodRsExecutor extends NoopRsExecutor {
|
||||
@Override
|
||||
protected RegionOpeningState execOpenRegion(ServerName server, RegionOpenInfo openReq)
|
||||
throws IOException {
|
||||
sendTransitionReport(server, openReq.getRegion(), TransitionCode.OPENED);
|
||||
// Concurrency?
|
||||
// Now update the state of our cluster in regionsToRegionServers.
|
||||
SortedSet<byte[]> regions = regionsToRegionServers.get(server);
|
||||
if (regions == null) {
|
||||
regions = new ConcurrentSkipListSet<byte[]>(Bytes.BYTES_COMPARATOR);
|
||||
regionsToRegionServers.put(server, regions);
|
||||
}
|
||||
RegionInfo hri = ProtobufUtil.toRegionInfo(openReq.getRegion());
|
||||
if (regions.contains(hri.getRegionName())) {
|
||||
throw new UnsupportedOperationException(hri.getRegionNameAsString());
|
||||
}
|
||||
regions.add(hri.getRegionName());
|
||||
return RegionOpeningState.OPENED;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected CloseRegionResponse execCloseRegion(ServerName server, byte[] regionName)
|
||||
throws IOException {
|
||||
RegionInfo hri = am.getRegionInfo(regionName);
|
||||
sendTransitionReport(server, ProtobufUtil.toRegionInfo(hri), TransitionCode.CLOSED);
|
||||
return CloseRegionResponse.newBuilder().setClosed(true).build();
|
||||
}
|
||||
}
|
||||
|
||||
/** A region server that always answers "not running yet", whatever the request. */
protected static class ServerNotYetRunningRsExecutor implements MockRSExecutor {
  @Override
  public ExecuteProceduresResponse sendRequest(ServerName server, ExecuteProceduresRequest req)
      throws IOException {
    throw new ServerNotRunningYetException("wait on server startup");
  }
}
|
||||
|
||||
/** A region server that fails every request with the fixed exception it was built with. */
protected static class FaultyRsExecutor implements MockRSExecutor {
  // Exception thrown on every sendRequest call.
  private final IOException exception;

  public FaultyRsExecutor(final IOException exception) {
    this.exception = exception;
  }

  @Override
  public ExecuteProceduresResponse sendRequest(ServerName server, ExecuteProceduresRequest req)
      throws IOException {
    throw exception;
  }
}
|
||||
|
||||
/**
 * Simulates repeated socket timeouts: up to maxSocketTimeoutRetries timeouts in a row, then
 * the server is marked dead (up to maxServerRetries times, resetting the timeout counter
 * each time); once both budgets are exhausted the request is served normally.
 */
protected class SocketTimeoutRsExecutor extends GoodRsExecutor {
  private final int maxSocketTimeoutRetries;
  private final int maxServerRetries;

  // Server seen when the previous timeout round started; used to assert that after a
  // server was declared dead the retry moved on to a different server.
  private ServerName lastServer;
  private int sockTimeoutRetries;
  private int serverRetries;

  public SocketTimeoutRsExecutor(int maxSocketTimeoutRetries, int maxServerRetries) {
    this.maxServerRetries = maxServerRetries;
    this.maxSocketTimeoutRetries = maxSocketTimeoutRetries;
  }

  @Override
  public ExecuteProceduresResponse sendRequest(ServerName server, ExecuteProceduresRequest req)
      throws IOException {
    // SocketTimeoutException should be a temporary problem
    // unless the server will be declared dead.
    if (sockTimeoutRetries++ < maxSocketTimeoutRetries) {
      if (sockTimeoutRetries == 1) {
        // First timeout of a fresh round: we must be talking to a new server now.
        assertNotEquals(lastServer, server);
      }
      lastServer = server;
      LOG.debug("Socket timeout for server=" + server + " retries=" + sockTimeoutRetries);
      throw new SocketTimeoutException("simulate socket timeout");
    } else if (serverRetries++ < maxServerRetries) {
      LOG.info("Mark server=" + server + " as dead. serverRetries=" + serverRetries);
      master.getServerManager().moveFromOnlineToDeadServers(server);
      sockTimeoutRetries = 0;
      throw new SocketTimeoutException("simulate socket timeout");
    } else {
      return super.sendRequest(server, req);
    }
  }
}
|
||||
|
||||
/**
 * Takes open request and then returns nothing so acts like a RS that went zombie. No response (so
 * proc is stuck/suspended on the Master and won't wake up.). We then send in a crash for this
 * server after a few seconds; crash is supposed to take care of the suspended procedures.
 */
protected class HangThenRSCrashExecutor extends GoodRsExecutor {
  // Number of execOpenRegion calls so far; only the very first call hangs.
  private int invocations;

  @Override
  protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
      throws IOException {
    if (this.invocations++ > 0) {
      // Return w/o problem the second time through here.
      return super.execOpenRegion(server, openReq);
    }
    // The procedure on master will just hang forever because nothing comes back
    // from the RS in this case.
    LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
    // Schedule the crash report that is expected to wake the stuck procedure.
    executor.schedule(new Runnable() {
      @Override
      public void run() {
        LOG.info("Sending in CRASH of " + server);
        doCrash(server);
      }
    }, 1, TimeUnit.SECONDS);
    return null;
  }
}
|
||||
|
||||
/**
 * Takes an open request and then returns nothing, acting like a RS that went zombie: no
 * response, so the procedure is stuck/suspended on the master and won't wake up. Unlike
 * {@link HangThenRSCrashExecutor}, which submits a server crash to be handled by a
 * ServerCrashProcedure, this executor restarts the RS directly — the situation where an RS
 * crashes while SCP is not enabled.
 */
protected class HangThenRSRestartExecutor extends GoodRsExecutor {
  // Number of execOpenRegion calls so far; only the very first call hangs.
  private int invocations;

  @Override
  protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
      throws IOException {
    if (this.invocations++ > 0) {
      // Return w/o problem the second time through here.
      return super.execOpenRegion(server, openReq);
    }
    // The procedure on master will just hang forever because nothing comes back
    // from the RS in this case.
    LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
    // Schedule the restart that is expected to unblock the stuck procedure.
    executor.schedule(new Runnable() {
      @Override
      public void run() {
        LOG.info("Restarting RS of " + server);
        doRestart(server);
      }
    }, 1, TimeUnit.SECONDS);
    return null;
  }
}
|
||||
|
||||
protected class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
|
||||
public static final int TYPES_OF_FAILURE = 6;
|
||||
private int invocations;
|
||||
|
||||
@Override
|
||||
protected CloseRegionResponse execCloseRegion(ServerName server, byte[] regionName)
|
||||
throws IOException {
|
||||
switch (this.invocations++) {
|
||||
case 0:
|
||||
throw new NotServingRegionException("Fake");
|
||||
case 1:
|
||||
executor.schedule(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
LOG.info("Sending in CRASH of " + server);
|
||||
doCrash(server);
|
||||
}
|
||||
}, 1, TimeUnit.SECONDS);
|
||||
throw new RegionServerAbortedException("Fake!");
|
||||
case 2:
|
||||
executor.schedule(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
LOG.info("Sending in CRASH of " + server);
|
||||
doCrash(server);
|
||||
}
|
||||
}, 1, TimeUnit.SECONDS);
|
||||
throw new RegionServerStoppedException("Fake!");
|
||||
case 3:
|
||||
throw new ServerNotRunningYetException("Fake!");
|
||||
case 4:
|
||||
LOG.info("Returned null from serverName={}; means STUCK...TODO timeout", server);
|
||||
executor.schedule(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
LOG.info("Sending in CRASH of " + server);
|
||||
doCrash(server);
|
||||
}
|
||||
}, 1, TimeUnit.SECONDS);
|
||||
return null;
|
||||
default:
|
||||
return super.execCloseRegion(server, regionName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * A region server with random behavior: the dispatch itself may fail with one of several
 * exceptions, opens may succeed / report already-opened / fail / hang (with a delayed crash
 * report), and closes succeed half the time.
 */
protected class RandRsExecutor extends NoopRsExecutor {
  private final Random rand = new Random();

  @Override
  public ExecuteProceduresResponse sendRequest(ServerName server, ExecuteProceduresRequest req)
      throws IOException {
    // 1/5 chance each of three different dispatch failures; otherwise behave normally.
    switch (rand.nextInt(5)) {
      case 0:
        throw new ServerNotRunningYetException("wait on server startup");
      case 1:
        throw new SocketTimeoutException("simulate socket timeout");
      case 2:
        throw new RemoteException("java.io.IOException", "unexpected exception");
      default:
        // fall out
    }
    return super.sendRequest(server, req);
  }

  @Override
  protected RegionOpeningState execOpenRegion(final ServerName server, RegionOpenInfo openReq)
      throws IOException {
    // 1/6 chance each of the three explicit outcomes below; otherwise hang (return null).
    switch (rand.nextInt(6)) {
      case 0:
        LOG.info("Return OPENED response");
        sendTransitionReport(server, openReq.getRegion(), TransitionCode.OPENED);
        return OpenRegionResponse.RegionOpeningState.OPENED;
      case 1:
        LOG.info("Return transition report that OPENED/ALREADY_OPENED response");
        sendTransitionReport(server, openReq.getRegion(), TransitionCode.OPENED);
        return OpenRegionResponse.RegionOpeningState.ALREADY_OPENED;
      case 2:
        LOG.info("Return transition report that FAILED_OPEN/FAILED_OPENING response");
        sendTransitionReport(server, openReq.getRegion(), TransitionCode.FAILED_OPEN);
        return OpenRegionResponse.RegionOpeningState.FAILED_OPENING;
      default:
        // fall out
    }
    // The procedure on master will just hang forever because nothing comes back
    // from the RS in this case.
    LOG.info("Return null as response; means proc stuck so we send in a crash report after" +
      " a few seconds...");
    executor.schedule(new Runnable() {
      @Override
      public void run() {
        LOG.info("Delayed CRASHING of " + server);
        doCrash(server);
      }
    }, 5, TimeUnit.SECONDS);
    return null;
  }

  @Override
  protected CloseRegionResponse execCloseRegion(ServerName server, byte[] regionName)
      throws IOException {
    CloseRegionResponse.Builder resp = CloseRegionResponse.newBuilder();
    // Close succeeds half the time; only a successful close is reported to the master.
    boolean closed = rand.nextBoolean();
    if (closed) {
      RegionInfo hri = am.getRegionInfo(regionName);
      sendTransitionReport(server, ProtobufUtil.toRegionInfo(hri), TransitionCode.CLOSED);
    }
    resp.setClosed(closed);
    return resp.build();
  }
}
|
||||
|
||||
/**
 * Hook that lets individual tests decide how a fake "region server" answers an
 * ExecuteProcedures RPC issued by the master.
 */
protected interface MockRSExecutor {
  ExecuteProceduresResponse sendRequest(ServerName server, ExecuteProceduresRequest req)
      throws IOException;
}
|
||||
|
||||
/**
 * RSProcedureDispatcher whose outbound RPC is routed through the currently configured
 * {@link MockRSExecutor} instead of a real region server.
 */
protected class MockRSProcedureDispatcher extends RSProcedureDispatcher {
  // Pluggable fake region-server behavior; swapped per test via setMockRsExecutor().
  private MockRSExecutor mockRsExec;

  public MockRSProcedureDispatcher(final MasterServices master) {
    super(master);
  }

  public void setMockRsExecutor(final MockRSExecutor mockRsExec) {
    this.mockRsExec = mockRsExec;
  }

  @Override
  protected void remoteDispatch(ServerName serverName,
      @SuppressWarnings("rawtypes") Set<RemoteProcedure> remoteProcedures) {
    submitTask(new MockRemoteCall(serverName, remoteProcedures));
  }

  /** Remote call whose sendRequest is redirected to the mock executor. */
  private class MockRemoteCall extends ExecuteProceduresRemoteCall {
    public MockRemoteCall(final ServerName serverName,
        @SuppressWarnings("rawtypes") final Set<RemoteProcedure> operations) {
      super(serverName, operations);
    }

    @Override
    protected ExecuteProceduresResponse sendRequest(final ServerName serverName,
        final ExecuteProceduresRequest request) throws IOException {
      return mockRsExec.sendRequest(serverName, request);
    }
  }
}
|
||||
|
||||
protected void collectAssignmentManagerMetrics() {
|
||||
assignSubmittedCount = assignProcMetrics.getSubmittedCounter().getCount();
|
||||
assignFailedCount = assignProcMetrics.getFailedCounter().getCount();
|
||||
unassignSubmittedCount = unassignProcMetrics.getSubmittedCounter().getCount();
|
||||
unassignFailedCount = unassignProcMetrics.getFailedCounter().getCount();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseIOException;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.junit.After;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
|
||||
@Category({ MasterTests.class, MediumTests.class })
|
||||
public class TestAssignmentManagerUtil {
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestAssignmentManagerUtil.class);
|
||||
|
||||
private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
|
||||
|
||||
private static TableName TABLE_NAME = TableName.valueOf("AM");
|
||||
|
||||
private static MasterProcedureEnv ENV;
|
||||
|
||||
private static AssignmentManager AM;
|
||||
|
||||
private static int REGION_REPLICATION = 3;
|
||||
|
||||
@BeforeClass
|
||||
public static void setUp() throws Exception {
|
||||
UTIL.startMiniCluster(1);
|
||||
UTIL.getAdmin().balancerSwitch(false, true);
|
||||
UTIL.createTable(TableDescriptorBuilder.newBuilder(TABLE_NAME)
|
||||
.setColumnFamily(ColumnFamilyDescriptorBuilder.of("cf"))
|
||||
.setRegionReplication(REGION_REPLICATION).build(), new byte[][] { Bytes.toBytes(0) });
|
||||
UTIL.waitTableAvailable(TABLE_NAME);
|
||||
HMaster master = UTIL.getMiniHBaseCluster().getMaster();
|
||||
ENV = master.getMasterProcedureExecutor().getEnvironment();
|
||||
AM = master.getAssignmentManager();
|
||||
}
|
||||
|
||||
@After
|
||||
public void tearDownAfterTest() throws IOException {
|
||||
for (RegionInfo region : UTIL.getAdmin().getRegions(TABLE_NAME)) {
|
||||
RegionStateNode regionNode = AM.getRegionStates().getRegionStateNode(region);
|
||||
// confirm that we have released the lock
|
||||
assertFalse(((ReentrantLock) regionNode.lock).isLocked());
|
||||
TransitRegionStateProcedure proc = regionNode.getProcedure();
|
||||
if (proc != null) {
|
||||
regionNode.unsetProcedure(proc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void tearDown() throws Exception {
|
||||
UTIL.shutdownMiniCluster();
|
||||
}
|
||||
|
||||
private List<RegionInfo> getPrimaryRegions() throws IOException {
|
||||
return UTIL.getAdmin().getRegions(TABLE_NAME).stream()
|
||||
.filter(r -> RegionReplicaUtil.isDefaultReplica(r)).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreateUnassignProcedureForSplitFail() throws IOException {
|
||||
RegionInfo region = getPrimaryRegions().get(0);
|
||||
AM.getRegionStates().getRegionStateNode(region)
|
||||
.setProcedure(TransitRegionStateProcedure.unassign(ENV, region));
|
||||
try {
|
||||
AssignmentManagerUtil.createUnassignProceduresForSplitOrMerge(ENV, Stream.of(region),
|
||||
REGION_REPLICATION);
|
||||
fail("Should fail as the region is in transition");
|
||||
} catch (HBaseIOException e) {
|
||||
// expected
|
||||
}
|
||||
}
|
||||
|
||||
@Test
public void testCreateUnassignProceduresForMergeFail() throws IOException {
  List<RegionInfo> regions = getPrimaryRegions();
  RegionInfo regionA = regions.get(0);
  RegionInfo regionB = regions.get(1);
  // Only regionB is put in transition, so procedure creation for the merge
  // must fail part-way through (after handling regionA).
  AM.getRegionStates().getRegionStateNode(regionB)
    .setProcedure(TransitRegionStateProcedure.unassign(ENV, regionB));
  try {
    AssignmentManagerUtil.createUnassignProceduresForSplitOrMerge(ENV,
      Stream.of(regionA, regionB), REGION_REPLICATION);
    fail("Should fail as the region is in transition");
  } catch (HBaseIOException e) {
    // expected
  }
  // On failure, regionA and all of its replicas must not be left in transition,
  // i.e. any procedure attached before the failure must have been unset.
  IntStream.range(0, REGION_REPLICATION)
    .mapToObj(i -> RegionReplicaUtil.getRegionInfoForReplica(regionA, i))
    .map(AM.getRegionStates()::getRegionStateNode).forEachOrdered(
      rn -> assertFalse("Should have unset the proc for " + rn, rn.isInTransition()));
}
|
||||
}
|
|
@ -0,0 +1,237 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.Put;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.Table;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.gson.JsonArray;
|
||||
import org.apache.hbase.thirdparty.com.google.gson.JsonElement;
|
||||
import org.apache.hbase.thirdparty.com.google.gson.JsonObject;
|
||||
import org.apache.hbase.thirdparty.com.google.gson.JsonParser;
|
||||
|
||||
/**
|
||||
* Confirm that we will do backoff when retrying on closing a region, to avoid consuming all the
|
||||
* CPUs.
|
||||
*/
|
||||
@Category({ MasterTests.class, MediumTests.class })
|
||||
public class TestCloseRegionWhileRSCrash {
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestCloseRegionWhileRSCrash.class);
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestCloseRegionWhileRSCrash.class);
|
||||
|
||||
private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
|
||||
|
||||
private static TableName TABLE_NAME = TableName.valueOf("Backoff");
|
||||
|
||||
private static byte[] CF = Bytes.toBytes("cf");
|
||||
|
||||
private static CountDownLatch ARRIVE = new CountDownLatch(1);
|
||||
|
||||
private static CountDownLatch RESUME = new CountDownLatch(1);
|
||||
|
||||
/**
 * A stand-in server procedure that holds the server's exclusive lock and parks
 * until the test releases {@code RESUME}, so procedures queued behind it for the
 * same server (e.g. the ServerCrashProcedure) cannot make progress.
 */
public static final class DummyServerProcedure extends Procedure<MasterProcedureEnv>
    implements ServerProcedureInterface {

  private ServerName serverName;

  // Required by the procedure framework to re-create the procedure on replay.
  public DummyServerProcedure() {
  }

  public DummyServerProcedure(ServerName serverName) {
    this.serverName = serverName;
  }

  @Override
  public ServerName getServerName() {
    return serverName;
  }

  @Override
  public boolean hasMetaTableRegion() {
    return false;
  }

  @Override
  public ServerOperationType getServerOperationType() {
    return ServerOperationType.CRASH_HANDLER;
  }

  @Override
  protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
      throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
    // Signal the test that we are running, then park until the test lets us finish.
    ARRIVE.countDown();
    RESUME.await();
    return null;
  }

  @Override
  protected LockState acquireLock(final MasterProcedureEnv env) {
    if (env.getProcedureScheduler().waitServerExclusiveLock(this, getServerName())) {
      return LockState.LOCK_EVENT_WAIT;
    }
    return LockState.LOCK_ACQUIRED;
  }

  @Override
  protected void releaseLock(final MasterProcedureEnv env) {
    env.getProcedureScheduler().wakeServerExclusiveLock(this, getServerName());
  }

  @Override
  protected boolean holdLock(MasterProcedureEnv env) {
    // Keep the server exclusive lock for the whole lifetime of the procedure so
    // other procedures targeting the same server stay queued behind us.
    return true;
  }

  @Override
  protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException {
  }

  @Override
  protected boolean abort(MasterProcedureEnv env) {
    return false;
  }

  // Test-only procedure: nothing to persist.
  @Override
  protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {

  }

  @Override
  protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
  }
}
|
||||
|
||||
@BeforeClass
public static void setUp() throws Exception {
  UTIL.startMiniCluster(3);
  UTIL.createTable(TABLE_NAME, CF);
  UTIL.getAdmin().balancerSwitch(false, true);
  HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
  // The test later kills srcRs; move hbase:meta off it first so killing the
  // server does not also take down the meta region.
  if (!srcRs.getRegions(TableName.META_TABLE_NAME).isEmpty()) {
    RegionInfo metaRegion = srcRs.getRegions(TableName.META_TABLE_NAME).get(0).getRegionInfo();
    HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
    UTIL.getAdmin().move(metaRegion.getEncodedNameAsBytes(),
      Bytes.toBytes(dstRs.getServerName().getServerName()));
    // Wait for the meta region to actually land on the destination server.
    UTIL.waitFor(30000, () -> !dstRs.getRegions(TableName.META_TABLE_NAME).isEmpty());
  }
}
|
||||
|
||||
@AfterClass
public static void tearDown() throws Exception {
  // Tear down the mini cluster started in setUp().
  UTIL.shutdownMiniCluster();
}
|
||||
|
||||
@Test
|
||||
public void testRetryBackoff() throws IOException, InterruptedException {
|
||||
HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
|
||||
RegionInfo region = srcRs.getRegions(TABLE_NAME).get(0).getRegionInfo();
|
||||
HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
|
||||
ProcedureExecutor<MasterProcedureEnv> procExec =
|
||||
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
|
||||
procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName()));
|
||||
ARRIVE.await();
|
||||
UTIL.getMiniHBaseCluster().killRegionServer(srcRs.getServerName());
|
||||
UTIL.waitFor(30000,
|
||||
() -> procExec.getProcedures().stream().anyMatch(p -> p instanceof ServerCrashProcedure));
|
||||
Thread t = new Thread(() -> {
|
||||
try {
|
||||
UTIL.getAdmin().move(region.getEncodedNameAsBytes(),
|
||||
Bytes.toBytes(dstRs.getServerName().getServerName()));
|
||||
} catch (IOException e) {
|
||||
}
|
||||
});
|
||||
t.start();
|
||||
JsonParser parser = new JsonParser();
|
||||
long oldTimeout = 0;
|
||||
int timeoutIncrements = 0;
|
||||
// wait until we enter the WAITING_TIMEOUT state
|
||||
UTIL.waitFor(30000, () -> getTimeout(parser, UTIL.getAdmin().getProcedures()) > 0);
|
||||
while (true) {
|
||||
long timeout = getTimeout(parser, UTIL.getAdmin().getProcedures());
|
||||
if (timeout > oldTimeout) {
|
||||
LOG.info("Timeout incremented, was {}, now is {}, increments={}", timeout, oldTimeout,
|
||||
timeoutIncrements);
|
||||
oldTimeout = timeout;
|
||||
timeoutIncrements++;
|
||||
if (timeoutIncrements > 3) {
|
||||
// If we incremented at least twice, break; the backoff is working.
|
||||
break;
|
||||
}
|
||||
}
|
||||
Thread.sleep(1000);
|
||||
}
|
||||
RESUME.countDown();
|
||||
t.join();
|
||||
// Make sure that the region is online, it may not on the original target server, as we will set
|
||||
// forceNewPlan to true if there is a server crash
|
||||
try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
|
||||
table.put(new Put(Bytes.toBytes(1)).addColumn(CF, Bytes.toBytes("cq"), Bytes.toBytes(1)));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param proceduresAsJSON This is String returned by admin.getProcedures call... an array of
|
||||
* Procedures as JSON.
|
||||
* @return The Procedure timeout value parsed from the TRSP.
|
||||
*/
|
||||
private long getTimeout(JsonParser parser, String proceduresAsJSON) {
|
||||
JsonArray array = parser.parse(proceduresAsJSON).getAsJsonArray();
|
||||
Iterator<JsonElement> iterator = array.iterator();
|
||||
while (iterator.hasNext()) {
|
||||
JsonElement element = iterator.next();
|
||||
JsonObject obj = element.getAsJsonObject();
|
||||
String className = obj.get("className").getAsString();
|
||||
String actualClassName = TransitRegionStateProcedure.class.getName();
|
||||
if (className.equals(actualClassName) && obj.has("timeout")) {
|
||||
return obj.get("timeout").getAsLong();
|
||||
}
|
||||
}
|
||||
return -1L;
|
||||
}
|
||||
}
|
|
@ -24,13 +24,14 @@ import java.util.List;
|
|||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.HColumnDescriptor;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||
import org.apache.hadoop.hbase.MetaTableAccessor;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.Admin;
|
||||
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.TableDescriptor;
|
||||
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility;
|
||||
|
@ -61,15 +62,16 @@ public class TestMergeTableRegionsProcedure {
|
|||
HBaseClassTestRule.forClass(TestMergeTableRegionsProcedure.class);
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestMergeTableRegionsProcedure.class);
|
||||
@Rule public final TestName name = new TestName();
|
||||
@Rule
|
||||
public final TestName name = new TestName();
|
||||
|
||||
protected static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
|
||||
private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
|
||||
private static long nonceGroup = HConstants.NO_NONCE;
|
||||
private static long nonce = HConstants.NO_NONCE;
|
||||
|
||||
private static final int initialRegionCount = 4;
|
||||
private final static byte[] FAMILY = Bytes.toBytes("FAMILY");
|
||||
final static Configuration conf = UTIL.getConfiguration();
|
||||
private final static Configuration conf = UTIL.getConfiguration();
|
||||
private static Admin admin;
|
||||
|
||||
private AssignmentManager am;
|
||||
|
@ -95,16 +97,12 @@ public class TestMergeTableRegionsProcedure {
|
|||
public static void setupCluster() throws Exception {
|
||||
setupConf(conf);
|
||||
UTIL.startMiniCluster(1);
|
||||
admin = UTIL.getHBaseAdmin();
|
||||
admin = UTIL.getAdmin();
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void cleanupTest() throws Exception {
|
||||
try {
|
||||
UTIL.shutdownMiniCluster();
|
||||
} catch (Exception e) {
|
||||
LOG.warn("failure shutting down cluster", e);
|
||||
}
|
||||
UTIL.shutdownMiniCluster();
|
||||
}
|
||||
|
||||
@Before
|
||||
|
@ -114,7 +112,7 @@ public class TestMergeTableRegionsProcedure {
|
|||
MasterProcedureTestingUtility.generateNonceGroup(UTIL.getHBaseCluster().getMaster());
|
||||
nonce = MasterProcedureTestingUtility.generateNonce(UTIL.getHBaseCluster().getMaster());
|
||||
// Turn off balancer so it doesn't cut in and mess up our placements.
|
||||
UTIL.getHBaseAdmin().setBalancerRunning(false, true);
|
||||
admin.balancerSwitch(false, true);
|
||||
// Turn off the meta scanner so it don't remove parent on us.
|
||||
UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
|
||||
resetProcExecutorTestingKillFlag();
|
||||
|
@ -127,7 +125,7 @@ public class TestMergeTableRegionsProcedure {
|
|||
@After
|
||||
public void tearDown() throws Exception {
|
||||
resetProcExecutorTestingKillFlag();
|
||||
for (HTableDescriptor htd: UTIL.getHBaseAdmin().listTables()) {
|
||||
for (TableDescriptor htd: admin.listTableDescriptors()) {
|
||||
LOG.info("Tear down, remove table=" + htd.getTableName());
|
||||
UTIL.deleteTable(htd.getTableName());
|
||||
}
|
||||
|
@ -233,6 +231,7 @@ public class TestMergeTableRegionsProcedure {
|
|||
List<RegionInfo> tableRegions = createTable(tableName);
|
||||
|
||||
ProcedureTestingUtility.waitNoProcedureRunning(procExec);
|
||||
ProcedureTestingUtility.setKillIfHasParent(procExec, false);
|
||||
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
|
||||
|
||||
RegionInfo[] regionsToMerge = new RegionInfo[2];
|
||||
|
@ -303,10 +302,9 @@ public class TestMergeTableRegionsProcedure {
|
|||
assertRegionCount(tableName, initialRegionCount - 1);
|
||||
}
|
||||
|
||||
private List<RegionInfo> createTable(final TableName tableName)
|
||||
throws Exception {
|
||||
HTableDescriptor desc = new HTableDescriptor(tableName);
|
||||
desc.addFamily(new HColumnDescriptor(FAMILY));
|
||||
private List<RegionInfo> createTable(final TableName tableName) throws Exception {
|
||||
TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName)
|
||||
.setColumnFamily(ColumnFamilyDescriptorBuilder.of(FAMILY)).build();
|
||||
byte[][] splitRows = new byte[initialRegionCount - 1][];
|
||||
for (int i = 0; i < splitRows.length; ++i) {
|
||||
splitRows[i] = Bytes.toBytes(String.format("%d", i));
|
||||
|
|
|
@ -1,174 +0,0 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseIOException;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.Admin;
|
||||
import org.apache.hadoop.hbase.client.Put;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.Table;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.master.RegionPlan;
|
||||
import org.apache.hadoop.hbase.master.RegionState;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
|
||||
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.junit.rules.TestName;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Like TestRegionMove in regionserver package but in here in this package so I can get access to
|
||||
* Procedure internals to mess with the assignment to manufacture states seen out on clusters.
|
||||
*/
|
||||
@Category({LargeTests.class})
|
||||
public class TestRegionMove2 {
|
||||
private final static Logger LOG = LoggerFactory.getLogger(TestRegionMove2.class);
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestRegionMove2.class);
|
||||
|
||||
@Rule
|
||||
public TestName name = new TestName();
|
||||
|
||||
private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
|
||||
|
||||
@BeforeClass
public static void startCluster() throws Exception {
  // Two region servers so there is always a distinct destination to move a region to.
  TEST_UTIL.startMiniCluster(2);
}
|
||||
|
||||
@AfterClass
public static void stopCluster() throws Exception {
  // Tear down the mini cluster started in startCluster().
  TEST_UTIL.shutdownMiniCluster();
}
|
||||
|
||||
/**
 * Test that we make it through to the end if parent Region is offlined between start of this
 * Move and when we go to run the move UnassignProcedure.
 */
@Test
public void testMoveOfRegionOfflinedPostStart() throws Exception {
  TableName tableName = TableName.valueOf(this.name.getMethodName());
  // Create a table with more than one region
  byte [] cf = Bytes.toBytes("cf");
  Table t = TEST_UTIL.createMultiRegionTable(tableName, cf, 10);
  TEST_UTIL.waitUntilAllRegionsAssigned(tableName);
  HRegionServer rs1 = null;
  HRegionServer rs2 = null;
  List<RegionInfo> regionsOnRS1ForTable = null;
  try (Admin admin = TEST_UTIL.getAdmin()) {
    // Write an update to each region
    for (RegionInfo regionInfo : admin.getRegions(tableName)) {
      byte[] startKey = regionInfo.getStartKey();
      // StartKey of first region is "empty", which would throw an error if we try to Put that.
      byte[] rowKey =
          org.apache.hbase.thirdparty.com.google.common.primitives.Bytes.concat(startKey,
            Bytes.toBytes("1"));
      Put p = new Put(rowKey);
      p.addColumn(cf, Bytes.toBytes("q1"), Bytes.toBytes("value"));
      t.put(p);
    }

    // Get a Region which is on the first RS
    rs1 = TEST_UTIL.getRSForFirstRegionInTable(tableName);
    rs2 = TEST_UTIL.getOtherRegionServer(rs1);
    regionsOnRS1ForTable = admin.getRegions(rs1.getServerName()).stream().
        filter((regionInfo) -> regionInfo.getTable().equals(tableName)).
        collect(Collectors.toList());
  }
  assertTrue("Expected to find at least one region for " + tableName + " on " +
    rs1.getServerName() + ", but found none", !regionsOnRS1ForTable.isEmpty());
  final RegionInfo regionToMove = regionsOnRS1ForTable.get(0);
  HMaster master = TEST_UTIL.getHBaseCluster().getMaster();

  // Try to move the region. HackedMoveRegionProcedure should intercede and mess up the region
  // state setting it to SPLIT when we run the UnassignProcedure part of move region.
  // Then when we go to do the unassignprocedure, we should notice the region-to-move is not
  // online.... spew some log, and then fast-track to the end of the unassign. The assign under
  // move will also notice that the parent is not-online but SPLIT and will skip it... so the
  // move will "succeed" but we won't have moved the region!
  RegionPlan rp = new RegionPlan(regionToMove, rs1.getServerName(), rs2.getServerName());
  MasterProcedureEnv env = master.getMasterProcedureExecutor().getEnvironment();
  HackedMoveRegionProcedure p = new HackedMoveRegionProcedure(env, rp);
  master.getMasterProcedureExecutor().submitProcedure(p);
  ProcedureTestingUtility.waitProcedure(master.getMasterProcedureExecutor(), p);
  // Split should have been called.
  assertTrue(p.split.get());
  // The region should not have been moved! It is still online on rs1 because the hacked
  // SPLIT state made both halves of the move no-ops.
  assertTrue(rs1.getOnlineRegion(regionToMove.getRegionName()) != null);
}
|
||||
|
||||
/**
 * Class just so we can mess around with RegionStateNode state at a particular point in the
 * Procedure to try and mess it up.
 */
public static class HackedMoveRegionProcedure extends MoveRegionProcedure {
  /**
   * Set to true after we hack this regions RSN to SPLIT
   */
  public static AtomicBoolean split = new AtomicBoolean(false);

  // Required by the Procedure framework to create the procedure on replay
  public HackedMoveRegionProcedure() {
    super();
  }

  public HackedMoveRegionProcedure(MasterProcedureEnv env, RegionPlan plan)
      throws HBaseIOException {
    super(env, plan, false);
  }

  @Override
  protected Flow executeFromState(MasterProcedureEnv env,
      MasterProcedureProtos.MoveRegionState state) throws InterruptedException {
    Flow flow = null;
    switch (state) {
      case MOVE_REGION_UNASSIGN:
        // Just before the unassign, flip the state to SPLIT. The unassign should exit!
        RegionStates.RegionStateNode rsn =
            env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
        rsn.setState(RegionState.State.SPLIT);
        LOG.info("HACKED RSN, setting it to SPLIT: {}", rsn);
        split.set(true);
        // fall through -- after hacking the state we still delegate to the normal handling
      default:
        flow = super.executeFromState(env, state);
    }
    return flow;
  }
}
|
||||
}
|
|
@ -30,7 +30,6 @@ import org.apache.hadoop.hbase.DoNotRetryIOException;
|
|||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.Waiter;
|
||||
import org.apache.hadoop.hbase.client.CompactionState;
|
||||
|
@ -40,6 +39,7 @@ import org.apache.hadoop.hbase.client.Put;
|
|||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.Result;
|
||||
import org.apache.hadoop.hbase.client.Table;
|
||||
import org.apache.hadoop.hbase.client.TableDescriptor;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility;
|
||||
|
@ -120,7 +120,7 @@ public class TestSplitTableRegionProcedure {
|
|||
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(getMasterProcedureExecutor(), false);
|
||||
|
||||
// Turn off balancer so it doesn't cut in and mess up our placements.
|
||||
UTIL.getAdmin().setBalancerRunning(false, true);
|
||||
UTIL.getAdmin().balancerSwitch(false, true);
|
||||
// Turn off the meta scanner so it don't remove parent on us.
|
||||
UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
|
||||
am = UTIL.getHBaseCluster().getMaster().getAssignmentManager();
|
||||
|
@ -132,7 +132,7 @@ public class TestSplitTableRegionProcedure {
|
|||
@After
|
||||
public void tearDown() throws Exception {
|
||||
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(getMasterProcedureExecutor(), false);
|
||||
for (HTableDescriptor htd: UTIL.getAdmin().listTables()) {
|
||||
for (TableDescriptor htd : UTIL.getAdmin().listTableDescriptors()) {
|
||||
UTIL.deleteTable(htd.getTableName());
|
||||
}
|
||||
}
|
||||
|
@ -374,7 +374,7 @@ public class TestSplitTableRegionProcedure {
|
|||
MasterProcedureTestingUtility.testRollbackAndDoubleExecution(procExec, procId, numberOfSteps,
|
||||
true);
|
||||
// check that we have only 1 region
|
||||
assertEquals(1, UTIL.getHBaseAdmin().getTableRegions(tableName).size());
|
||||
assertEquals(1, UTIL.getAdmin().getRegions(tableName).size());
|
||||
List<HRegion> daughters = UTIL.getMiniHBaseCluster().getRegions(tableName);
|
||||
assertEquals(1, daughters.size());
|
||||
verifyData(daughters.get(0), startRowNum, rowCount,
|
||||
|
@ -398,6 +398,7 @@ public class TestSplitTableRegionProcedure {
|
|||
assertTrue("not able to find a splittable region", regions != null);
|
||||
assertTrue("not able to find a splittable region", regions.length == 1);
|
||||
ProcedureTestingUtility.waitNoProcedureRunning(procExec);
|
||||
ProcedureTestingUtility.setKillIfHasParent(procExec, false);
|
||||
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
|
||||
|
||||
// collect AM metrics before test
|
||||
|
|
|
@ -0,0 +1,164 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegion;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.junit.After;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.junit.rules.TestName;
|
||||
|
||||
@Category({ MasterTests.class, MediumTests.class })
|
||||
public class TestTransitRegionStateProcedure {
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestTransitRegionStateProcedure.class);
|
||||
|
||||
private static HBaseTestingUtility UTIL = new HBaseTestingUtility();
|
||||
|
||||
private static byte[] CF = Bytes.toBytes("cf");
|
||||
|
||||
@Rule
|
||||
public TestName name = new TestName();
|
||||
|
||||
private TableName tableName;
|
||||
|
||||
@BeforeClass
public static void setUpBeforeClass() throws Exception {
  // Use a single procedure worker thread -- presumably to keep the
  // recovery/double-execution tests deterministic. TODO confirm.
  UTIL.getConfiguration().setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1);
  UTIL.startMiniCluster(3);
  // Disable the balancer so it cannot interfere with manual region transitions.
  UTIL.getAdmin().balancerSwitch(false, true);
}
|
||||
|
||||
@AfterClass
public static void tearDownAfterClass() throws Exception {
  // Tear down the mini cluster started in setUpBeforeClass().
  UTIL.shutdownMiniCluster();
}
|
||||
|
||||
@Before
public void setUp() throws IOException, InterruptedException {
  // Fresh table per test, named after the test method for easy debugging.
  tableName = TableName.valueOf(name.getMethodName());
  UTIL.createTable(tableName, CF);
  UTIL.waitTableAvailable(tableName);
}
|
||||
|
||||
private void resetProcExecutorTestingKillFlag() {
|
||||
ProcedureExecutor<MasterProcedureEnv> procExec =
|
||||
UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor();
|
||||
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, false);
|
||||
assertTrue("expected executor to be running", procExec.isRunning());
|
||||
}
|
||||
|
||||
@After
public void tearDown() throws IOException {
  // Undo any kill-flag toggling a test left behind, then drop the per-test table.
  resetProcExecutorTestingKillFlag();
  UTIL.deleteTable(tableName);
}
|
||||
|
||||
/**
 * Runs the given TRSP under kill-and-restart testing: the executor is killed before every
 * state-persist step and restarted, verifying the procedure recovers and completes, and
 * that the region node is in transition exactly while the procedure is attached.
 */
// NOTE(review): method name has a typo ("Excution"); kept as-is to avoid churning callers.
private void testRecoveryAndDoubleExcution(TransitRegionStateProcedure proc) throws Exception {
  HMaster master = UTIL.getHBaseCluster().getMaster();
  AssignmentManager am = master.getAssignmentManager();
  RegionStateNode regionNode = am.getRegionStates().getRegionStateNode(proc.getRegion());
  // The procedure must be attached to the region node before submission...
  assertFalse(regionNode.isInTransition());
  regionNode.setProcedure(proc);
  assertTrue(regionNode.isInTransition());
  ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor();
  ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
  long procId = procExec.submitProcedure(proc);
  MasterProcedureTestingUtility.testRecoveryAndDoubleExecution(procExec, procId);
  // ...and detached again once the procedure has finished.
  regionNode = am.getRegionStates().getRegionStateNode(proc.getRegion());
  assertFalse(regionNode.isInTransition());
}
|
||||
|
||||
@Test
public void testRecoveryAndDoubleExecutionMove() throws Exception {
  MasterProcedureEnv env =
    UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor().getEnvironment();
  HRegion region = UTIL.getMiniHBaseCluster().getRegions(tableName).get(0);
  long openSeqNum = region.getOpenSeqNum();
  // null target server -- presumably the AM picks the destination; verify upstream.
  TransitRegionStateProcedure proc =
    TransitRegionStateProcedure.move(env, region.getRegionInfo(), null);
  testRecoveryAndDoubleExcution(proc);
  HRegion region2 = UTIL.getMiniHBaseCluster().getRegions(tableName).get(0);
  long openSeqNum2 = region2.getOpenSeqNum();
  // confirm that the region is successfully opened (a reopen bumps the open sequence id)
  assertTrue(openSeqNum2 > openSeqNum);
}
|
||||
|
||||
@Test
|
||||
public void testRecoveryAndDoubleExecutionReopen() throws Exception {
|
||||
MasterProcedureEnv env =
|
||||
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor().getEnvironment();
|
||||
HRegionServer rs = UTIL.getRSForFirstRegionInTable(tableName);
|
||||
HRegion region = rs.getRegions(tableName).get(0);
|
||||
long openSeqNum = region.getOpenSeqNum();
|
||||
TransitRegionStateProcedure proc =
|
||||
TransitRegionStateProcedure.reopen(env, region.getRegionInfo());
|
||||
testRecoveryAndDoubleExcution(proc);
|
||||
// should still be on the same RS
|
||||
HRegion region2 = rs.getRegions(tableName).get(0);
|
||||
long openSeqNum2 = region2.getOpenSeqNum();
|
||||
// confirm that the region is successfully opened
|
||||
assertTrue(openSeqNum2 > openSeqNum);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRecoveryAndDoubleExecutionUnassignAndAssign() throws Exception {
|
||||
HMaster master = UTIL.getMiniHBaseCluster().getMaster();
|
||||
MasterProcedureEnv env = master.getMasterProcedureExecutor().getEnvironment();
|
||||
HRegion region = UTIL.getMiniHBaseCluster().getRegions(tableName).get(0);
|
||||
RegionInfo regionInfo = region.getRegionInfo();
|
||||
long openSeqNum = region.getOpenSeqNum();
|
||||
TransitRegionStateProcedure unassign = TransitRegionStateProcedure.unassign(env, regionInfo);
|
||||
testRecoveryAndDoubleExcution(unassign);
|
||||
AssignmentManager am = master.getAssignmentManager();
|
||||
assertTrue(am.getRegionStates().getRegionState(regionInfo).isClosed());
|
||||
|
||||
TransitRegionStateProcedure assign = TransitRegionStateProcedure.assign(env, regionInfo, null);
|
||||
testRecoveryAndDoubleExcution(assign);
|
||||
|
||||
HRegion region2 = UTIL.getMiniHBaseCluster().getRegions(tableName).get(0);
|
||||
long openSeqNum2 = region2.getOpenSeqNum();
|
||||
// confirm that the region is successfully opened
|
||||
assertTrue(openSeqNum2 > openSeqNum);
|
||||
}
|
||||
}
|
|
@ -1,177 +0,0 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.assignment;
|
||||
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.Admin;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.junit.rules.TestName;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.gson.JsonArray;
|
||||
import org.apache.hbase.thirdparty.com.google.gson.JsonElement;
|
||||
import org.apache.hbase.thirdparty.com.google.gson.JsonObject;
|
||||
import org.apache.hbase.thirdparty.com.google.gson.JsonParser;
|
||||
|
||||
/**
|
||||
* Tests for HBASE-18408 "AM consumes CPU and fills up the logs really fast when there is no RS to
|
||||
* assign". If an {@link org.apache.hadoop.hbase.exceptions.UnexpectedStateException}, we'd spin on
|
||||
* the ProcedureExecutor consuming CPU and filling logs. Test new back-off facility.
|
||||
*/
|
||||
@Category({MasterTests.class, MediumTests.class})
|
||||
public class TestUnexpectedStateException {
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestUnexpectedStateException.class);
|
||||
@Rule public final TestName name = new TestName();
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestUnexpectedStateException.class);
|
||||
private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
|
||||
private static final byte [] FAMILY = Bytes.toBytes("family");
|
||||
private TableName tableName;
|
||||
private static final int REGIONS = 10;
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
TEST_UTIL.getConfiguration().setBoolean("hbase.localcluster.assign.random.ports", false);
|
||||
TEST_UTIL.getConfiguration().setInt(HConstants.MASTER_INFO_PORT, 50655);
|
||||
TEST_UTIL.startMiniCluster();
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void afterClass() throws Exception {
|
||||
TEST_UTIL.shutdownMiniCluster();
|
||||
}
|
||||
|
||||
@Before
|
||||
public void before() throws IOException {
|
||||
this.tableName = TableName.valueOf(this.name.getMethodName());
|
||||
TEST_UTIL.createMultiRegionTable(this.tableName, FAMILY, REGIONS);
|
||||
}
|
||||
|
||||
private RegionInfo pickArbitraryRegion(Admin admin) throws IOException {
|
||||
List<RegionInfo> regions = admin.getRegions(this.tableName);
|
||||
return regions.get(3);
|
||||
}
|
||||
|
||||
/**
|
||||
* Manufacture a state that will throw UnexpectedStateException.
|
||||
* Change an assigned region's 'state' to be OPENING. That'll mess up a subsequent unassign
|
||||
* causing it to throw UnexpectedStateException. We can easily manufacture this infinite retry
|
||||
* state in UnassignProcedure because it has no startTransition. AssignProcedure does where it
|
||||
* squashes whatever the current region state is making it OFFLINE. That makes it harder to mess
|
||||
* it up. Make do with UnassignProcedure for now.
|
||||
*/
|
||||
@Test
|
||||
public void testUnableToAssign() throws Exception {
|
||||
try (Admin admin = TEST_UTIL.getAdmin()) {
|
||||
// Pick a random region from this tests' table to play with. Get its RegionStateNode.
|
||||
// Clone it because the original will be changed by the system. We need clone to fake out
|
||||
// a state.
|
||||
final RegionInfo region = pickArbitraryRegion(admin);
|
||||
AssignmentManager am = TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
|
||||
RegionStates.RegionStateNode rsn = am.getRegionStates().getRegionStateNode(region);
|
||||
// Now force region to be in OPENING state.
|
||||
am.markRegionAsOpening(rsn);
|
||||
// Now the 'region' is in an artificially bad state, try an unassign again.
|
||||
// Run unassign in a thread because it is blocking.
|
||||
Runnable unassign = () -> {
|
||||
try {
|
||||
admin.unassign(region.getRegionName(), true);
|
||||
} catch (IOException ioe) {
|
||||
fail("Failed assign");
|
||||
}
|
||||
};
|
||||
Thread t = new Thread(unassign, "unassign");
|
||||
t.start();
|
||||
while(!t.isAlive()) {
|
||||
Threads.sleep(100);
|
||||
}
|
||||
Threads.sleep(1000);
|
||||
// Unassign should be running and failing. Look for incrementing timeout as evidence that
|
||||
// Unassign is stuck and doing backoff.
|
||||
// Now fix the condition we were waiting on so the unassign can complete.
|
||||
JsonParser parser = new JsonParser();
|
||||
long oldTimeout = 0;
|
||||
int timeoutIncrements = 0;
|
||||
while (true) {
|
||||
long timeout = getUnassignTimeout(parser, admin.getProcedures());
|
||||
if (timeout > oldTimeout) {
|
||||
LOG.info("Timeout incremented, was {}, now is {}, increments={}",
|
||||
timeout, oldTimeout, timeoutIncrements);
|
||||
oldTimeout = timeout;
|
||||
timeoutIncrements++;
|
||||
if (timeoutIncrements > 3) {
|
||||
// If we incremented at least twice, break; the backoff is working.
|
||||
break;
|
||||
}
|
||||
}
|
||||
Thread.sleep(1000);
|
||||
}
|
||||
TEST_UTIL.getMiniHBaseCluster().stopMaster(0).join();
|
||||
HMaster master = TEST_UTIL.getMiniHBaseCluster().startMaster().getMaster();
|
||||
TEST_UTIL.waitFor(30000, () -> master.isInitialized());
|
||||
am = master.getAssignmentManager();
|
||||
rsn = am.getRegionStates().getRegionStateNode(region);
|
||||
am.markRegionAsOpened(rsn);
|
||||
t.join();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param proceduresAsJSON This is String returned by admin.getProcedures call... an array of
|
||||
* Procedures as JSON.
|
||||
* @return The Procedure timeout value parsed from the Unassign Procedure.
|
||||
* @Exception Thrown if we do not find UnassignProcedure or fail to parse timeout.
|
||||
*/
|
||||
private long getUnassignTimeout(JsonParser parser, String proceduresAsJSON) throws Exception {
|
||||
JsonArray array = parser.parse(proceduresAsJSON).getAsJsonArray();
|
||||
Iterator<JsonElement> iterator = array.iterator();
|
||||
while (iterator.hasNext()) {
|
||||
JsonElement element = iterator.next();
|
||||
JsonObject obj = element.getAsJsonObject();
|
||||
String className = obj.get("className").getAsString();
|
||||
String actualClassName = UnassignProcedure.class.getName();
|
||||
if (className.equals(actualClassName)) {
|
||||
return obj.get("timeout").getAsLong();
|
||||
}
|
||||
}
|
||||
throw new Exception("Failed to find UnassignProcedure or timeout in " + proceduresAsJSON);
|
||||
}
|
||||
}
|
|
@ -50,8 +50,8 @@ import org.apache.hadoop.hbase.favored.FavoredNodesPlan;
|
|||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.master.LoadBalancer;
|
||||
import org.apache.hadoop.hbase.master.ServerManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegion;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
|
|
|
@ -27,6 +27,7 @@ import java.util.List;
|
|||
import java.util.TreeSet;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
|
@ -51,6 +52,7 @@ import org.apache.hadoop.hbase.master.HMaster;
|
|||
import org.apache.hadoop.hbase.master.RegionState;
|
||||
import org.apache.hadoop.hbase.master.TableStateManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
|
||||
|
@ -72,13 +74,13 @@ public class MasterProcedureTestingUtility {
|
|||
public static void restartMasterProcedureExecutor(ProcedureExecutor<MasterProcedureEnv> procExec)
|
||||
throws Exception {
|
||||
final MasterProcedureEnv env = procExec.getEnvironment();
|
||||
final HMaster master = (HMaster)env.getMasterServices();
|
||||
final HMaster master = (HMaster) env.getMasterServices();
|
||||
ProcedureTestingUtility.restart(procExec, true, true,
|
||||
// stop services
|
||||
new Callable<Void>() {
|
||||
@Override
|
||||
public Void call() throws Exception {
|
||||
final AssignmentManager am = env.getAssignmentManager();
|
||||
AssignmentManager am = env.getAssignmentManager();
|
||||
// try to simulate a master restart by removing the ServerManager states about seqIDs
|
||||
for (RegionState regionState: am.getRegionStates().getRegionStates()) {
|
||||
env.getMasterServices().getServerManager().removeRegion(regionState.getRegion());
|
||||
|
@ -88,12 +90,26 @@ public class MasterProcedureTestingUtility {
|
|||
return null;
|
||||
}
|
||||
},
|
||||
// setup RIT before starting workers
|
||||
new Callable<Void>() {
|
||||
|
||||
@Override
|
||||
public Void call() throws Exception {
|
||||
AssignmentManager am = env.getAssignmentManager();
|
||||
am.start();
|
||||
// just follow the same way with HMaster.finishActiveMasterInitialization. See the
|
||||
// comments there
|
||||
am.setupRIT(procExec.getActiveProceduresNoCopy().stream().filter(p -> !p.isSuccess())
|
||||
.filter(p -> p instanceof TransitRegionStateProcedure)
|
||||
.map(p -> (TransitRegionStateProcedure) p).collect(Collectors.toList()));
|
||||
return null;
|
||||
}
|
||||
},
|
||||
// restart services
|
||||
new Callable<Void>() {
|
||||
@Override
|
||||
public Void call() throws Exception {
|
||||
final AssignmentManager am = env.getAssignmentManager();
|
||||
am.start();
|
||||
AssignmentManager am = env.getAssignmentManager();
|
||||
am.joinCluster();
|
||||
master.setInitialized(true);
|
||||
return null;
|
||||
|
|
|
@ -20,13 +20,15 @@ package org.apache.hadoop.hbase.master.procedure;
|
|||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HColumnDescriptor;
|
||||
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||
import org.apache.hadoop.hbase.TableExistsException;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.Admin;
|
||||
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
|
||||
import org.apache.hadoop.hbase.client.SnapshotDescription;
|
||||
import org.apache.hadoop.hbase.client.TableDescriptor;
|
||||
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
|
||||
|
@ -91,20 +93,18 @@ public class TestCloneSnapshotProcedure extends TestTableDDLProcedureBase {
|
|||
return 1;
|
||||
}
|
||||
|
||||
public static HTableDescriptor createHTableDescriptor(
|
||||
final TableName tableName, final byte[] ... family) {
|
||||
HTableDescriptor htd = new HTableDescriptor(tableName);
|
||||
for (int i = 0; i < family.length; ++i) {
|
||||
htd.addFamily(new HColumnDescriptor(family[i]));
|
||||
}
|
||||
return htd;
|
||||
public static TableDescriptor createTableDescriptor(TableName tableName, byte[]... family) {
|
||||
TableDescriptorBuilder builder = TableDescriptorBuilder.newBuilder(tableName);
|
||||
Stream.of(family).map(ColumnFamilyDescriptorBuilder::of)
|
||||
.forEachOrdered(builder::setColumnFamily);
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCloneSnapshot() throws Exception {
|
||||
final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
|
||||
final TableName clonedTableName = TableName.valueOf("testCloneSnapshot2");
|
||||
final HTableDescriptor htd = createHTableDescriptor(clonedTableName, CF);
|
||||
final TableDescriptor htd = createTableDescriptor(clonedTableName, CF);
|
||||
|
||||
// take the snapshot
|
||||
SnapshotProtos.SnapshotDescription snapshotDesc = getSnapshot();
|
||||
|
@ -124,7 +124,7 @@ public class TestCloneSnapshotProcedure extends TestTableDDLProcedureBase {
|
|||
|
||||
final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
|
||||
final TableName clonedTableName = TableName.valueOf(snapshotDesc.getTable());
|
||||
final HTableDescriptor htd = createHTableDescriptor(clonedTableName, CF);
|
||||
final TableDescriptor htd = createTableDescriptor(clonedTableName, CF);
|
||||
|
||||
long procId = ProcedureTestingUtility.submitAndWait(
|
||||
procExec, new CloneSnapshotProcedure(procExec.getEnvironment(), htd, snapshotDesc));
|
||||
|
@ -139,11 +139,14 @@ public class TestCloneSnapshotProcedure extends TestTableDDLProcedureBase {
|
|||
public void testRecoveryAndDoubleExecution() throws Exception {
|
||||
final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
|
||||
final TableName clonedTableName = TableName.valueOf("testRecoveryAndDoubleExecution");
|
||||
final HTableDescriptor htd = createHTableDescriptor(clonedTableName, CF);
|
||||
final TableDescriptor htd = createTableDescriptor(clonedTableName, CF);
|
||||
|
||||
// take the snapshot
|
||||
SnapshotProtos.SnapshotDescription snapshotDesc = getSnapshot();
|
||||
|
||||
// Here if you enable this then we will enter an infinite loop, as we will fail either after
|
||||
// TRSP.openRegion or after OpenRegionProcedure.execute, so we can never finish the TRSP...
|
||||
ProcedureTestingUtility.setKillIfHasParent(procExec, false);
|
||||
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
|
||||
|
||||
// Start the Clone snapshot procedure && kill the executor
|
||||
|
@ -162,7 +165,7 @@ public class TestCloneSnapshotProcedure extends TestTableDDLProcedureBase {
|
|||
public void testRollbackAndDoubleExecution() throws Exception {
|
||||
final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
|
||||
final TableName clonedTableName = TableName.valueOf("testRollbackAndDoubleExecution");
|
||||
final HTableDescriptor htd = createHTableDescriptor(clonedTableName, CF);
|
||||
final TableDescriptor htd = createTableDescriptor(clonedTableName, CF);
|
||||
|
||||
// take the snapshot
|
||||
SnapshotProtos.SnapshotDescription snapshotDesc = getSnapshot();
|
||||
|
|
|
@ -91,7 +91,7 @@ public class TestEnableTableProcedure extends TestTableDDLProcedureBase {
|
|||
|
||||
// Enable the table - expect failure from ProcedurePrepareLatch
|
||||
final ProcedurePrepareLatch prepareLatch = new ProcedurePrepareLatch.CompatibilityLatch();
|
||||
long procId3 = procExec.submitProcedure(
|
||||
procExec.submitProcedure(
|
||||
new EnableTableProcedure(procExec.getEnvironment(), tableName, false, prepareLatch));
|
||||
prepareLatch.await();
|
||||
Assert.fail("Enable should throw exception through latch.");
|
||||
|
@ -108,6 +108,7 @@ public class TestEnableTableProcedure extends TestTableDDLProcedureBase {
|
|||
MasterProcedureTestingUtility.createTable(procExec, tableName, splitKeys, "f1", "f2");
|
||||
UTIL.getAdmin().disableTable(tableName);
|
||||
ProcedureTestingUtility.waitNoProcedureRunning(procExec);
|
||||
ProcedureTestingUtility.setKillIfHasParent(procExec, false);
|
||||
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
|
||||
|
||||
// Start the Enable procedure && kill the executor
|
||||
|
|
|
@ -1,109 +0,0 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.procedure;
|
||||
|
||||
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.master.MasterServices;
|
||||
import org.apache.hadoop.hbase.master.assignment.MockMasterServices;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
|
||||
import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
import org.apache.hadoop.hbase.testclassification.SmallTests;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.mockito.Mockito;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
@Category({MasterTests.class, SmallTests.class})
|
||||
public class TestRecoverMetaProcedure {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestRecoverMetaProcedure.class);
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestRecoverMetaProcedure.class);
|
||||
private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
|
||||
|
||||
/**
|
||||
* Test the new prepare step.
|
||||
* Here we test that our Mock is faking out the precedure well-enough for it to progress past the
|
||||
* first prepare stage.
|
||||
*/
|
||||
@Test
|
||||
public void testPrepare() throws ProcedureSuspendedException, ProcedureYieldException,
|
||||
InterruptedException, IOException {
|
||||
RecoverMetaProcedure rmp = new RecoverMetaProcedure();
|
||||
MasterProcedureEnv env = Mockito.mock(MasterProcedureEnv.class);
|
||||
MasterServices masterServices =
|
||||
new MockMasterServices(UTIL.getConfiguration(), null);
|
||||
Mockito.when(env.getMasterServices()).thenReturn(masterServices);
|
||||
assertEquals(StateMachineProcedure.Flow.HAS_MORE_STATE,
|
||||
rmp.executeFromState(env, rmp.getInitialState()));
|
||||
int stateId = rmp.getCurrentStateId();
|
||||
assertEquals(MasterProcedureProtos.RecoverMetaState.RECOVER_META_SPLIT_LOGS_VALUE,
|
||||
rmp.getCurrentStateId());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the new prepare step.
|
||||
* If Master is stopping, procedure should skip the assign by returning NO_MORE_STATE
|
||||
*/
|
||||
@Test
|
||||
public void testPrepareWithMasterStopping() throws ProcedureSuspendedException,
|
||||
ProcedureYieldException, InterruptedException, IOException {
|
||||
RecoverMetaProcedure rmp = new RecoverMetaProcedure();
|
||||
MasterProcedureEnv env = Mockito.mock(MasterProcedureEnv.class);
|
||||
MasterServices masterServices = new MockMasterServices(UTIL.getConfiguration(), null) {
|
||||
@Override
|
||||
public boolean isStopping() {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
Mockito.when(env.getMasterServices()).thenReturn(masterServices);
|
||||
assertEquals(StateMachineProcedure.Flow.NO_MORE_STATE,
|
||||
rmp.executeFromState(env, rmp.getInitialState()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the new prepare step.
|
||||
* If cluster is down, procedure should skip the assign by returning NO_MORE_STATE
|
||||
*/
|
||||
@Test
|
||||
public void testPrepareWithNoCluster() throws ProcedureSuspendedException,
|
||||
ProcedureYieldException, InterruptedException, IOException {
|
||||
RecoverMetaProcedure rmp = new RecoverMetaProcedure();
|
||||
MasterProcedureEnv env = Mockito.mock(MasterProcedureEnv.class);
|
||||
MasterServices masterServices = new MockMasterServices(UTIL.getConfiguration(), null) {
|
||||
@Override
|
||||
public boolean isClusterUp() {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
Mockito.when(env.getMasterServices()).thenReturn(masterServices);
|
||||
assertEquals(StateMachineProcedure.Flow.NO_MORE_STATE,
|
||||
rmp.executeFromState(env, rmp.getInitialState()));
|
||||
}
|
||||
}
|
|
@ -139,6 +139,10 @@ public class TestServerCrashProcedure {
|
|||
// Enable test flags and then queue the crash procedure.
|
||||
ProcedureTestingUtility.waitNoProcedureRunning(procExec);
|
||||
if (doubleExecution) {
|
||||
// For SCP, if you enable this then we will enter an infinite loop, as we will crash between
|
||||
// queue and open for TRSP, and then going back to queue, as we will use the crash rs as the
|
||||
// target server since it is recored in hbase:meta.
|
||||
ProcedureTestingUtility.setKillIfHasParent(procExec, false);
|
||||
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
|
||||
// kill the RS
|
||||
AssignmentTestingUtil.killRs(util, rsToKill);
|
||||
|
|
|
@ -154,7 +154,7 @@ public class TestTruncateTableProcedure extends TestTableDDLProcedureBase {
|
|||
UTIL.waitUntilAllRegionsAssigned(tableName);
|
||||
|
||||
// validate the table regions and layout
|
||||
regions = UTIL.getAdmin().getTableRegions(tableName).toArray(new RegionInfo[0]);
|
||||
regions = UTIL.getAdmin().getRegions(tableName).toArray(new RegionInfo[0]);
|
||||
if (preserveSplits) {
|
||||
assertEquals(1 + splitKeys.length, regions.length);
|
||||
} else {
|
||||
|
@ -203,6 +203,7 @@ public class TestTruncateTableProcedure extends TestTableDDLProcedureBase {
|
|||
|
||||
final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
|
||||
ProcedureTestingUtility.waitNoProcedureRunning(procExec);
|
||||
ProcedureTestingUtility.setKillIfHasParent(procExec, false);
|
||||
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
|
||||
|
||||
// Start the Truncate procedure && kill the executor
|
||||
|
@ -216,7 +217,7 @@ public class TestTruncateTableProcedure extends TestTableDDLProcedureBase {
|
|||
UTIL.waitUntilAllRegionsAssigned(tableName);
|
||||
|
||||
// validate the table regions and layout
|
||||
regions = UTIL.getAdmin().getTableRegions(tableName).toArray(new RegionInfo[0]);
|
||||
regions = UTIL.getAdmin().getRegions(tableName).toArray(new RegionInfo[0]);
|
||||
if (preserveSplits) {
|
||||
assertEquals(1 + splitKeys.length, regions.length);
|
||||
} else {
|
||||
|
|
|
@ -1,216 +0,0 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.snapshot;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||
import org.apache.hadoop.hbase.master.MasterServices;
|
||||
import org.apache.hadoop.hbase.master.ServerManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
|
||||
import org.apache.hadoop.hbase.testclassification.SmallTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.junit.rules.TestName;
|
||||
import org.mockito.Mockito;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@Category({RegionServerTests.class, SmallTests.class})
|
||||
public class TestAssignProcedure {
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestAssignProcedure.class);
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestAssignProcedure.class);
|
||||
@Rule public TestName name = new TestName();
|
||||
|
||||
/**
|
||||
* An override that opens up the updateTransition method inside in AssignProcedure so can call it
|
||||
* below directly in test and mess with targetServer. Used by test
|
||||
* {@link #testTargetServerBeingNulledOnUs()}.
|
||||
*/
|
||||
public static class TargetServerBeingNulledOnUsAssignProcedure extends AssignProcedure {
|
||||
public final AtomicBoolean addToRemoteDispatcherWasCalled = new AtomicBoolean(false);
|
||||
public final AtomicBoolean remoteCallFailedWasCalled = new AtomicBoolean(false);
|
||||
private final RegionStates.RegionStateNode rsn;
|
||||
|
||||
public TargetServerBeingNulledOnUsAssignProcedure(RegionInfo regionInfo,
|
||||
RegionStates.RegionStateNode rsn) {
|
||||
super(regionInfo);
|
||||
this.rsn = rsn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Override so can change access from protected to public.
|
||||
*/
|
||||
@Override
|
||||
public boolean updateTransition(MasterProcedureEnv env, RegionStates.RegionStateNode regionNode)
|
||||
throws IOException, ProcedureSuspendedException {
|
||||
return super.updateTransition(env, regionNode);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean addToRemoteDispatcher(MasterProcedureEnv env, ServerName targetServer) {
|
||||
// So, mock the ServerCrashProcedure nulling out the targetServer AFTER updateTransition
|
||||
// has been called and BEFORE updateTransition gets to here.
|
||||
// We used to throw a NullPointerException. Now we just say the assign failed so it will
|
||||
// be rescheduled.
|
||||
boolean b = super.addToRemoteDispatcher(env, null);
|
||||
assertFalse(b);
|
||||
// Assert we were actually called.
|
||||
this.addToRemoteDispatcherWasCalled.set(true);
|
||||
return b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RegionStates.RegionStateNode getRegionState(MasterProcedureEnv env) {
|
||||
// Do this so we don't have to mock a bunch of stuff.
|
||||
return this.rsn;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remoteCallFailed(final MasterProcedureEnv env,
|
||||
final ServerName serverName, final IOException exception) {
|
||||
// Just skip this remoteCallFailed. Its too hard to mock. Assert it is called though.
|
||||
// Happens after the code we are testing has been called.
|
||||
this.remoteCallFailedWasCalled.set(true);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Test that we deal with ServerCrashProcedure zero'ing out the targetServer in the
|
||||
* RegionStateNode in the midst of our doing an assign. The trickery is done above in
|
||||
* TargetServerBeingNulledOnUsAssignProcedure. We skip a bunch of logic to get at the guts
|
||||
* where the problem happens (We also skip-out the failure handling because it'd take a bunch
|
||||
* of mocking to get it to run). Fix is inside in RemoteProcedureDispatch#addOperationToNode.
|
||||
* It now notices empty targetServer and just returns false so we fall into failure processing
|
||||
* and we'll reassign elsewhere instead of NPE'ing. The fake of ServerCrashProcedure nulling out
|
||||
* the targetServer happens inside in updateTransition just after it was called but before it
|
||||
* gets to the near the end when addToRemoteDispatcher is called. See the
|
||||
* TargetServerBeingNulledOnUsAssignProcedure class above. See HBASE-19218.
|
||||
* Before fix, this test would fail w/ a NullPointerException.
|
||||
*/
|
||||
@Test
|
||||
public void testTargetServerBeingNulledOnUs() throws ProcedureSuspendedException, IOException {
|
||||
TableName tn = TableName.valueOf(this.name.getMethodName());
|
||||
RegionInfo ri = RegionInfoBuilder.newBuilder(tn).build();
|
||||
// Create an RSN with location/target server. Will be cleared above in addToRemoteDispatcher to
|
||||
// simulate issue in HBASE-19218
|
||||
RegionStates.RegionStateNode rsn = new RegionStates.RegionStateNode(ri);
|
||||
rsn.setRegionLocation(ServerName.valueOf("server.example.org", 0, 0));
|
||||
MasterProcedureEnv env = Mockito.mock(MasterProcedureEnv.class);
|
||||
AssignmentManager am = Mockito.mock(AssignmentManager.class);
|
||||
ServerManager sm = Mockito.mock(ServerManager.class);
|
||||
Mockito.when(sm.isServerOnline(Mockito.any())).thenReturn(true);
|
||||
MasterServices ms = Mockito.mock(MasterServices.class);
|
||||
Mockito.when(ms.getServerManager()).thenReturn(sm);
|
||||
Configuration configuration = HBaseConfiguration.create();
|
||||
Mockito.when(ms.getConfiguration()).thenReturn(configuration);
|
||||
Mockito.when(env.getAssignmentManager()).thenReturn(am);
|
||||
Mockito.when(env.getMasterServices()).thenReturn(ms);
|
||||
RSProcedureDispatcher rsd = new RSProcedureDispatcher(ms);
|
||||
Mockito.when(env.getRemoteDispatcher()).thenReturn(rsd);
|
||||
|
||||
TargetServerBeingNulledOnUsAssignProcedure assignProcedure =
|
||||
new TargetServerBeingNulledOnUsAssignProcedure(ri, rsn);
|
||||
assignProcedure.updateTransition(env, rsn);
|
||||
assertTrue(assignProcedure.remoteCallFailedWasCalled.get());
|
||||
assertTrue(assignProcedure.addToRemoteDispatcherWasCalled.get());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleComparator() {
|
||||
List<AssignProcedure> procedures = new ArrayList<AssignProcedure>();
|
||||
RegionInfo user1 = RegionInfoBuilder.newBuilder(TableName.valueOf("user_space1")).build();
|
||||
procedures.add(new AssignProcedure(user1));
|
||||
RegionInfo user2 = RegionInfoBuilder.newBuilder(TableName.valueOf("user_space2")).build();
|
||||
procedures.add(new AssignProcedure(RegionInfoBuilder.FIRST_META_REGIONINFO));
|
||||
procedures.add(new AssignProcedure(user2));
|
||||
RegionInfo system = RegionInfoBuilder.newBuilder(TableName.NAMESPACE_TABLE_NAME).build();
|
||||
procedures.add(new AssignProcedure(system));
|
||||
procedures.sort(AssignProcedure.COMPARATOR);
|
||||
assertTrue(procedures.get(0).isMeta());
|
||||
assertTrue(procedures.get(1).getRegionInfo().getTable().equals(TableName.NAMESPACE_TABLE_NAME));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testComparatorWithMetas() {
|
||||
List<AssignProcedure> procedures = new ArrayList<AssignProcedure>();
|
||||
RegionInfo user3 = RegionInfoBuilder.newBuilder(TableName.valueOf("user3")).build();
|
||||
procedures.add(new AssignProcedure(user3));
|
||||
RegionInfo system = RegionInfoBuilder.newBuilder(TableName.NAMESPACE_TABLE_NAME).build();
|
||||
procedures.add(new AssignProcedure(system));
|
||||
RegionInfo user1 = RegionInfoBuilder.newBuilder(TableName.valueOf("user_space1")).build();
|
||||
RegionInfo user2 = RegionInfoBuilder.newBuilder(TableName.valueOf("user_space2")).build();
|
||||
procedures.add(new AssignProcedure(user1));
|
||||
RegionInfo meta2 = RegionInfoBuilder.newBuilder(TableName.META_TABLE_NAME).
|
||||
setStartKey(Bytes.toBytes("002")).build();
|
||||
procedures.add(new AssignProcedure(meta2));
|
||||
procedures.add(new AssignProcedure(user2));
|
||||
RegionInfo meta1 = RegionInfoBuilder.newBuilder(TableName.META_TABLE_NAME).
|
||||
setStartKey(Bytes.toBytes("001")).build();
|
||||
procedures.add(new AssignProcedure(meta1));
|
||||
procedures.add(new AssignProcedure(RegionInfoBuilder.FIRST_META_REGIONINFO));
|
||||
RegionInfo meta0 = RegionInfoBuilder.newBuilder(TableName.META_TABLE_NAME).
|
||||
setStartKey(Bytes.toBytes("000")).build();
|
||||
procedures.add(new AssignProcedure(meta0));
|
||||
for (int i = 0; i < 10; i++) {
|
||||
Collections.shuffle(procedures);
|
||||
procedures.sort(AssignProcedure.COMPARATOR);
|
||||
try {
|
||||
assertTrue(procedures.get(0).getRegionInfo().equals(RegionInfoBuilder.FIRST_META_REGIONINFO));
|
||||
assertTrue(procedures.get(1).getRegionInfo().equals(meta0));
|
||||
assertTrue(procedures.get(2).getRegionInfo().equals(meta1));
|
||||
assertTrue(procedures.get(3).getRegionInfo().equals(meta2));
|
||||
assertTrue(procedures.get(4).getRegionInfo().getTable().equals(TableName.NAMESPACE_TABLE_NAME));
|
||||
assertTrue(procedures.get(5).getRegionInfo().equals(user1));
|
||||
assertTrue(procedures.get(6).getRegionInfo().equals(user2));
|
||||
assertTrue(procedures.get(7).getRegionInfo().equals(user3));
|
||||
} catch (Throwable t) {
|
||||
for (AssignProcedure proc : procedures) {
|
||||
LOG.debug(Objects.toString(proc));
|
||||
}
|
||||
throw t;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -23,12 +23,11 @@ import static org.junit.Assert.assertTrue;
|
|||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.DoNotRetryIOException;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.TableNotEnabledException;
|
||||
import org.apache.hadoop.hbase.client.Admin;
|
||||
import org.apache.hadoop.hbase.client.DoNotRetryRegionException;
|
||||
import org.apache.hadoop.hbase.client.Put;
|
||||
|
@ -132,11 +131,13 @@ public class TestRegionMove {
|
|||
// Disable the table
|
||||
admin.disableTable(tableName);
|
||||
|
||||
// We except a DNRIOE when we try to move a region which isn't open.
|
||||
thrown.expect(TableNotEnabledException.class);
|
||||
thrown.expectMessage(t.getName().toString());
|
||||
|
||||
// Move the region to the other RS -- should fail
|
||||
admin.move(regionToMove.getEncodedNameAsBytes(), Bytes.toBytes(rs2.getServerName().toString()));
|
||||
try {
|
||||
// Move the region to the other RS -- should fail
|
||||
admin.move(regionToMove.getEncodedNameAsBytes(),
|
||||
Bytes.toBytes(rs2.getServerName().toString()));
|
||||
fail();
|
||||
} catch (DoNotRetryIOException e) {
|
||||
// We got expected exception
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,15 +33,14 @@ import java.util.Optional;
|
|||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.Coprocessor;
|
||||
import org.apache.hadoop.hbase.CoprocessorEnvironment;
|
||||
import org.apache.hadoop.hbase.DoNotRetryIOException;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.HColumnDescriptor;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||
import org.apache.hadoop.hbase.MasterNotRunningException;
|
||||
|
@ -52,6 +51,7 @@ import org.apache.hadoop.hbase.TableName;
|
|||
import org.apache.hadoop.hbase.UnknownRegionException;
|
||||
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
|
||||
import org.apache.hadoop.hbase.client.Admin;
|
||||
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
|
||||
import org.apache.hadoop.hbase.client.CompactionState;
|
||||
import org.apache.hadoop.hbase.client.Consistency;
|
||||
import org.apache.hadoop.hbase.client.Delete;
|
||||
|
@ -64,12 +64,13 @@ import org.apache.hadoop.hbase.client.Result;
|
|||
import org.apache.hadoop.hbase.client.ResultScanner;
|
||||
import org.apache.hadoop.hbase.client.Scan;
|
||||
import org.apache.hadoop.hbase.client.Table;
|
||||
import org.apache.hadoop.hbase.client.TableDescriptor;
|
||||
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
|
||||
import org.apache.hadoop.hbase.client.TestReplicasClient.SlowMeCopro;
|
||||
import org.apache.hadoop.hbase.coprocessor.MasterCoprocessor;
|
||||
import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
|
||||
import org.apache.hadoop.hbase.coprocessor.MasterObserver;
|
||||
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
|
||||
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.master.LoadBalancer;
|
||||
import org.apache.hadoop.hbase.master.MasterRpcServices;
|
||||
|
@ -77,14 +78,11 @@ import org.apache.hadoop.hbase.master.NoSuchProcedureException;
|
|||
import org.apache.hadoop.hbase.master.RegionState;
|
||||
import org.apache.hadoop.hbase.master.RegionState.State;
|
||||
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStates;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
|
||||
import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
|
||||
import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
|
||||
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
|
@ -94,8 +92,6 @@ import org.apache.hadoop.hbase.util.HBaseFsck;
|
|||
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
|
||||
import org.apache.hadoop.hbase.util.RetryCounter;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.RpcController;
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.apache.zookeeper.KeeperException.NodeExistsException;
|
||||
import org.junit.After;
|
||||
|
@ -111,11 +107,18 @@ import org.junit.rules.TestName;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.RpcController;
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
|
||||
|
||||
/**
|
||||
* The below tests are testing split region against a running cluster
|
||||
*/
|
||||
@Category({RegionServerTests.class, LargeTests.class})
|
||||
@SuppressWarnings("deprecation")
|
||||
public class TestSplitTransactionOnCluster {
|
||||
|
||||
@ClassRule
|
||||
|
@ -151,7 +154,7 @@ public class TestSplitTransactionOnCluster {
|
|||
@After
|
||||
public void tearDown() throws Exception {
|
||||
this.admin.close();
|
||||
for (HTableDescriptor htd: this.admin.listTables()) {
|
||||
for (TableDescriptor htd: this.admin.listTableDescriptors()) {
|
||||
LOG.info("Tear down, remove table=" + htd.getTableName());
|
||||
TESTING_UTIL.deleteTable(htd.getTableName());
|
||||
}
|
||||
|
@ -192,7 +195,7 @@ public class TestSplitTransactionOnCluster {
|
|||
t.close();
|
||||
|
||||
// Turn off balancer so it doesn't cut in and mess up our placements.
|
||||
this.admin.setBalancerRunning(false, true);
|
||||
this.admin.balancerSwitch(false, true);
|
||||
// Turn off the meta scanner so it don't remove parent on us.
|
||||
master.setCatalogJanitorEnabled(false);
|
||||
|
||||
|
@ -207,7 +210,7 @@ public class TestSplitTransactionOnCluster {
|
|||
master.getConfiguration());
|
||||
|
||||
// split async
|
||||
this.admin.splitRegion(region.getRegionInfo().getRegionName(), new byte[] {42});
|
||||
this.admin.splitRegionAsync(region.getRegionInfo().getRegionName(), new byte[] { 42 });
|
||||
|
||||
// we have to wait until the SPLITTING state is seen by the master
|
||||
FailingSplitMasterObserver observer =
|
||||
|
@ -221,7 +224,7 @@ public class TestSplitTransactionOnCluster {
|
|||
}
|
||||
assertTrue(cluster.getMaster().getAssignmentManager().getRegionStates().isRegionOnline(hri));
|
||||
} finally {
|
||||
admin.setBalancerRunning(true, false);
|
||||
admin.balancerSwitch(true, false);
|
||||
master.setCatalogJanitorEnabled(true);
|
||||
abortAndWaitForMaster();
|
||||
TESTING_UTIL.deleteTable(tableName);
|
||||
|
@ -232,9 +235,9 @@ public class TestSplitTransactionOnCluster {
|
|||
public void testSplitFailedCompactionAndSplit() throws Exception {
|
||||
final TableName tableName = TableName.valueOf(name.getMethodName());
|
||||
// Create table then get the single region for our new table.
|
||||
HTableDescriptor htd = new HTableDescriptor(tableName);
|
||||
byte[] cf = Bytes.toBytes("cf");
|
||||
htd.addFamily(new HColumnDescriptor(cf));
|
||||
TableDescriptor htd = TableDescriptorBuilder.newBuilder(tableName)
|
||||
.setColumnFamily(ColumnFamilyDescriptorBuilder.of(cf)).build();
|
||||
admin.createTable(htd);
|
||||
|
||||
for (int i = 0; cluster.getRegions(tableName).isEmpty() && i < 100; i++) {
|
||||
|
@ -309,7 +312,7 @@ public class TestSplitTransactionOnCluster {
|
|||
RegionStates regionStates = cluster.getMaster().getAssignmentManager().getRegionStates();
|
||||
|
||||
// Turn off balancer so it doesn't cut in and mess up our placements.
|
||||
this.admin.setBalancerRunning(false, true);
|
||||
this.admin.balancerSwitch(false, true);
|
||||
// Turn off the meta scanner so it don't remove parent on us.
|
||||
cluster.getMaster().setCatalogJanitorEnabled(false);
|
||||
try {
|
||||
|
@ -326,7 +329,7 @@ public class TestSplitTransactionOnCluster {
|
|||
// We don't roll back here anymore. Instead we fail-fast on construction of the
|
||||
// split transaction. Catch the exception instead.
|
||||
try {
|
||||
this.admin.splitRegion(hri.getRegionName());
|
||||
this.admin.splitRegionAsync(hri.getRegionName(), null);
|
||||
fail();
|
||||
} catch (DoNotRetryRegionException e) {
|
||||
// Expected
|
||||
|
@ -343,7 +346,7 @@ public class TestSplitTransactionOnCluster {
|
|||
checkAndGetDaughters(tableName);
|
||||
// OK, so split happened after we cleared the blocking node.
|
||||
} finally {
|
||||
admin.setBalancerRunning(true, false);
|
||||
admin.balancerSwitch(true, false);
|
||||
cluster.getMaster().setCatalogJanitorEnabled(true);
|
||||
t.close();
|
||||
}
|
||||
|
@ -367,7 +370,7 @@ public class TestSplitTransactionOnCluster {
|
|||
int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
|
||||
|
||||
// Turn off balancer so it doesn't cut in and mess up our placements.
|
||||
this.admin.setBalancerRunning(false, true);
|
||||
this.admin.balancerSwitch(false, true);
|
||||
// Turn off the meta scanner so it don't remove parent on us.
|
||||
cluster.getMaster().setCatalogJanitorEnabled(false);
|
||||
try {
|
||||
|
@ -405,7 +408,7 @@ public class TestSplitTransactionOnCluster {
|
|||
}
|
||||
}
|
||||
assertTrue(daughterRegion != null);
|
||||
for (int i=0; i<100; i++) {
|
||||
for (int i = 0; i < 100; i++) {
|
||||
if (!daughterRegion.hasReferences()) break;
|
||||
Threads.sleep(100);
|
||||
}
|
||||
|
@ -437,7 +440,7 @@ public class TestSplitTransactionOnCluster {
|
|||
}
|
||||
} finally {
|
||||
LOG.info("EXITING");
|
||||
admin.setBalancerRunning(true, false);
|
||||
admin.balancerSwitch(true, false);
|
||||
cluster.getMaster().setCatalogJanitorEnabled(true);
|
||||
t.close();
|
||||
}
|
||||
|
@ -446,9 +449,8 @@ public class TestSplitTransactionOnCluster {
|
|||
@Test
|
||||
public void testSplitShouldNotThrowNPEEvenARegionHasEmptySplitFiles() throws Exception {
|
||||
TableName userTableName = TableName.valueOf(name.getMethodName());
|
||||
HTableDescriptor htd = new HTableDescriptor(userTableName);
|
||||
HColumnDescriptor hcd = new HColumnDescriptor("col");
|
||||
htd.addFamily(hcd);
|
||||
TableDescriptor htd = TableDescriptorBuilder.newBuilder(userTableName)
|
||||
.setColumnFamily(ColumnFamilyDescriptorBuilder.of("col")).build();
|
||||
admin.createTable(htd);
|
||||
Table table = TESTING_UTIL.getConnection().getTable(userTableName);
|
||||
try {
|
||||
|
@ -480,7 +482,7 @@ public class TestSplitTransactionOnCluster {
|
|||
p.addColumn("col".getBytes(), "ql".getBytes(), "val".getBytes());
|
||||
table.put(p);
|
||||
admin.flush(userTableName);
|
||||
admin.splitRegion(hRegionInfo.getRegionName(), "row7".getBytes());
|
||||
admin.splitRegionAsync(hRegionInfo.getRegionName(), "row7".getBytes());
|
||||
regionsOfTable = cluster.getMaster()
|
||||
.getAssignmentManager().getRegionStates()
|
||||
.getRegionsOfTable(userTableName);
|
||||
|
@ -530,7 +532,7 @@ public class TestSplitTransactionOnCluster {
|
|||
int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
|
||||
|
||||
// Turn off balancer so it doesn't cut in and mess up our placements.
|
||||
this.admin.setBalancerRunning(false, true);
|
||||
this.admin.balancerSwitch(false, true);
|
||||
// Turn off the meta scanner so it don't remove parent on us.
|
||||
cluster.getMaster().setCatalogJanitorEnabled(false);
|
||||
try {
|
||||
|
@ -540,7 +542,7 @@ public class TestSplitTransactionOnCluster {
|
|||
HRegionServer server = cluster.getRegionServer(tableRegionIndex);
|
||||
printOutRegions(server, "Initial regions: ");
|
||||
// Call split.
|
||||
this.admin.splitRegion(hri.getRegionName());
|
||||
this.admin.splitRegionAsync(hri.getRegionName(), null);
|
||||
List<HRegion> daughters = checkAndGetDaughters(tableName);
|
||||
|
||||
// Before cleanup, get a new master.
|
||||
|
@ -575,7 +577,7 @@ public class TestSplitTransactionOnCluster {
|
|||
ServerName regionServerOfRegion = regionStates.getRegionServerOfRegion(hri);
|
||||
assertEquals(null, regionServerOfRegion);
|
||||
} finally {
|
||||
TESTING_UTIL.getAdmin().setBalancerRunning(true, false);
|
||||
TESTING_UTIL.getAdmin().balancerSwitch(true, false);
|
||||
cluster.getMaster().setCatalogJanitorEnabled(true);
|
||||
t.close();
|
||||
}
|
||||
|
@ -601,7 +603,7 @@ public class TestSplitTransactionOnCluster {
|
|||
HRegionServer regionServer = cluster.getRegionServer(regionServerIndex);
|
||||
insertData(tableName, admin, t);
|
||||
// Turn off balancer so it doesn't cut in and mess up our placements.
|
||||
admin.setBalancerRunning(false, true);
|
||||
admin.balancerSwitch(false, true);
|
||||
// Turn off the meta scanner so it don't remove parent on us.
|
||||
cluster.getMaster().setCatalogJanitorEnabled(false);
|
||||
boolean tableExists = MetaTableAccessor.tableExists(regionServer.getConnection(),
|
||||
|
@ -648,7 +650,7 @@ public class TestSplitTransactionOnCluster {
|
|||
SlowMeCopro.getPrimaryCdl().get().countDown();
|
||||
} finally {
|
||||
SlowMeCopro.getPrimaryCdl().get().countDown();
|
||||
admin.setBalancerRunning(true, false);
|
||||
admin.balancerSwitch(true, false);
|
||||
cluster.getMaster().setCatalogJanitorEnabled(true);
|
||||
t.close();
|
||||
}
|
||||
|
@ -676,8 +678,7 @@ public class TestSplitTransactionOnCluster {
|
|||
* into two regions with no store files.
|
||||
*/
|
||||
@Test
|
||||
public void testSplitRegionWithNoStoreFiles()
|
||||
throws Exception {
|
||||
public void testSplitRegionWithNoStoreFiles() throws Exception {
|
||||
final TableName tableName = TableName.valueOf(name.getMethodName());
|
||||
// Create table then get the single region for our new table.
|
||||
createTableAndWait(tableName, HConstants.CATALOG_FAMILY);
|
||||
|
@ -688,7 +689,7 @@ public class TestSplitTransactionOnCluster {
|
|||
.getRegionName());
|
||||
HRegionServer regionServer = cluster.getRegionServer(regionServerIndex);
|
||||
// Turn off balancer so it doesn't cut in and mess up our placements.
|
||||
this.admin.setBalancerRunning(false, true);
|
||||
this.admin.balancerSwitch(false, true);
|
||||
// Turn off the meta scanner so it don't remove parent on us.
|
||||
cluster.getMaster().setCatalogJanitorEnabled(false);
|
||||
try {
|
||||
|
@ -740,23 +741,25 @@ public class TestSplitTransactionOnCluster {
|
|||
assertTrue(regionStates.isRegionInState(daughters.get(1).getRegionInfo(), State.OPEN));
|
||||
|
||||
// We should not be able to assign it again
|
||||
am.assign(hri);
|
||||
assertFalse("Split region can't be assigned",
|
||||
regionStates.isRegionInTransition(hri));
|
||||
try {
|
||||
am.assign(hri);
|
||||
} catch (DoNotRetryIOException e) {
|
||||
// Expected
|
||||
}
|
||||
assertFalse("Split region can't be assigned", regionStates.isRegionInTransition(hri));
|
||||
assertTrue(regionStates.isRegionInState(hri, State.SPLIT));
|
||||
|
||||
// We should not be able to unassign it either
|
||||
try {
|
||||
am.unassign(hri);
|
||||
fail("Should have thrown exception");
|
||||
} catch (UnexpectedStateException e) {
|
||||
} catch (DoNotRetryIOException e) {
|
||||
// Expected
|
||||
}
|
||||
assertFalse("Split region can't be unassigned",
|
||||
regionStates.isRegionInTransition(hri));
|
||||
assertFalse("Split region can't be unassigned", regionStates.isRegionInTransition(hri));
|
||||
assertTrue(regionStates.isRegionInState(hri, State.SPLIT));
|
||||
} finally {
|
||||
admin.setBalancerRunning(true, false);
|
||||
admin.balancerSwitch(true, false);
|
||||
cluster.getMaster().setCatalogJanitorEnabled(true);
|
||||
}
|
||||
}
|
||||
|
@ -766,21 +769,23 @@ public class TestSplitTransactionOnCluster {
|
|||
throws Exception {
|
||||
final TableName tableName = TableName.valueOf(name.getMethodName());
|
||||
try {
|
||||
HTableDescriptor htd = new HTableDescriptor(tableName);
|
||||
htd.addFamily(new HColumnDescriptor("f"));
|
||||
htd.addFamily(new HColumnDescriptor("i_f"));
|
||||
htd.setRegionSplitPolicyClassName(CustomSplitPolicy.class.getName());
|
||||
byte[] cf = Bytes.toBytes("f");
|
||||
byte[] cf1 = Bytes.toBytes("i_f");
|
||||
TableDescriptor htd = TableDescriptorBuilder.newBuilder(tableName)
|
||||
.setColumnFamily(ColumnFamilyDescriptorBuilder.of(cf))
|
||||
.setColumnFamily(ColumnFamilyDescriptorBuilder.of(cf1))
|
||||
.setRegionSplitPolicyClassName(CustomSplitPolicy.class.getName()).build();
|
||||
admin.createTable(htd);
|
||||
List<HRegion> regions = awaitTableRegions(tableName);
|
||||
HRegion region = regions.get(0);
|
||||
for(int i = 3;i<9;i++) {
|
||||
Put p = new Put(Bytes.toBytes("row"+i));
|
||||
p.addColumn(Bytes.toBytes("f"), Bytes.toBytes("q"), Bytes.toBytes("value" + i));
|
||||
p.addColumn(Bytes.toBytes("i_f"), Bytes.toBytes("q"), Bytes.toBytes("value" + i));
|
||||
p.addColumn(cf, Bytes.toBytes("q"), Bytes.toBytes("value" + i));
|
||||
p.addColumn(cf1, Bytes.toBytes("q"), Bytes.toBytes("value" + i));
|
||||
region.put(p);
|
||||
}
|
||||
region.flush(true);
|
||||
HStore store = region.getStore(Bytes.toBytes("f"));
|
||||
HStore store = region.getStore(cf);
|
||||
Collection<HStoreFile> storefiles = store.getStorefiles();
|
||||
assertEquals(1, storefiles.size());
|
||||
assertFalse(region.hasReferences());
|
||||
|
@ -833,7 +838,7 @@ public class TestSplitTransactionOnCluster {
|
|||
|
||||
private void split(final RegionInfo hri, final HRegionServer server, final int regionCount)
|
||||
throws IOException, InterruptedException {
|
||||
admin.splitRegion(hri.getRegionName());
|
||||
admin.splitRegionAsync(hri.getRegionName(), null);
|
||||
for (int i = 0; cluster.getRegions(hri.getTable()).size() <= regionCount && i < 60; i++) {
|
||||
LOG.debug("Waiting on region " + hri.getRegionNameAsString() + " to split");
|
||||
Thread.sleep(2000);
|
||||
|
@ -999,7 +1004,7 @@ public class TestSplitTransactionOnCluster {
|
|||
if (enabled.get() && req.getTransition(0).getTransitionCode().equals(
|
||||
TransitionCode.READY_TO_SPLIT) && !resp.hasErrorMessage()) {
|
||||
RegionStates regionStates = myMaster.getAssignmentManager().getRegionStates();
|
||||
for (RegionStates.RegionStateNode regionState:
|
||||
for (RegionStateNode regionState:
|
||||
regionStates.getRegionsInTransition()) {
|
||||
/* TODO!!!!
|
||||
// Find the merging_new region and remove it
|
||||
|
|
Loading…
Reference in New Issue