Revert "Revert "Revert "HBASE-21213 [hbck2] bypass leaves behind state in RegionStates when assign/unassign"""

This reverts commit 2174461cf7.

Revert because not ready to port to other branches.
This commit is contained in:
Michael Stack 2018-09-29 04:06:46 -07:00
parent 2174461cf7
commit 259d12f739
29 changed files with 122 additions and 478 deletions

View File

@ -102,12 +102,11 @@ public class HBaseHbck implements Hbck {
} }
@Override @Override
public List<Long> assigns(List<String> encodedRegionNames, boolean override) public List<Long> assigns(List<String> encodedRegionNames) throws IOException {
throws IOException {
try { try {
MasterProtos.AssignsResponse response = MasterProtos.AssignsResponse response =
this.hbck.assigns(rpcControllerFactory.newController(), this.hbck.assigns(rpcControllerFactory.newController(),
RequestConverter.toAssignRegionsRequest(encodedRegionNames, override)); RequestConverter.toAssignRegionsRequest(encodedRegionNames));
return response.getPidList(); return response.getPidList();
} catch (ServiceException se) { } catch (ServiceException se) {
LOG.debug(toCommaDelimitedString(encodedRegionNames), se); LOG.debug(toCommaDelimitedString(encodedRegionNames), se);
@ -116,12 +115,11 @@ public class HBaseHbck implements Hbck {
} }
@Override @Override
public List<Long> unassigns(List<String> encodedRegionNames, boolean override) public List<Long> unassigns(List<String> encodedRegionNames) throws IOException {
throws IOException {
try { try {
MasterProtos.UnassignsResponse response = MasterProtos.UnassignsResponse response =
this.hbck.unassigns(rpcControllerFactory.newController(), this.hbck.unassigns(rpcControllerFactory.newController(),
RequestConverter.toUnassignRegionsRequest(encodedRegionNames, override)); RequestConverter.toUnassignRegionsRequest(encodedRegionNames));
return response.getPidList(); return response.getPidList();
} catch (ServiceException se) { } catch (ServiceException se) {
LOG.debug(toCommaDelimitedString(encodedRegionNames), se); LOG.debug(toCommaDelimitedString(encodedRegionNames), se);
@ -134,8 +132,7 @@ public class HBaseHbck implements Hbck {
} }
@Override @Override
public List<Boolean> bypassProcedure(List<Long> pids, long waitTime, boolean override, public List<Boolean> bypassProcedure(List<Long> pids, long waitTime, boolean force)
boolean recursive)
throws IOException { throws IOException {
MasterProtos.BypassProcedureResponse response = ProtobufUtil.call( MasterProtos.BypassProcedureResponse response = ProtobufUtil.call(
new Callable<MasterProtos.BypassProcedureResponse>() { new Callable<MasterProtos.BypassProcedureResponse>() {
@ -144,7 +141,7 @@ public class HBaseHbck implements Hbck {
try { try {
return hbck.bypassProcedure(rpcControllerFactory.newController(), return hbck.bypassProcedure(rpcControllerFactory.newController(),
MasterProtos.BypassProcedureRequest.newBuilder().addAllProcId(pids). MasterProtos.BypassProcedureRequest.newBuilder().addAllProcId(pids).
setWaitTime(waitTime).setOverride(override).setRecursive(recursive).build()); setWaitTime(waitTime).setForce(force).build());
} catch (Throwable t) { } catch (Throwable t) {
LOG.error(pids.stream().map(i -> i.toString()). LOG.error(pids.stream().map(i -> i.toString()).
collect(Collectors.joining(", ")), t); collect(Collectors.joining(", ")), t);

View File

@ -28,14 +28,11 @@ import org.apache.yetus.audience.InterfaceAudience;
/** /**
* Hbck fixup tool APIs. Obtain an instance from {@link ClusterConnection#getHbck()} and call * Hbck fixup tool APIs. Obtain an instance from {@link ClusterConnection#getHbck()} and call
* {@link #close()} when done. * {@link #close()} when done.
* <p>WARNING: the below methods can damage the cluster. It may leave the cluster in an * <p>WARNING: the below methods can damage the cluster. For experienced users only.
* indeterminate state, e.g. region not assigned, or some hdfs files left behind. After running
* any of the below, operators may have to do some clean up on hdfs or schedule some assign
* procedures to get regions back online. DO AT YOUR OWN RISK. For experienced users only.
* *
* @see ConnectionFactory * @see ConnectionFactory
* @see ClusterConnection * @see ClusterConnection
* @since 2.0.2, 2.1.1 * @since 2.2.0
*/ */
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.HBCK) @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.HBCK)
public interface Hbck extends Abortable, Closeable { public interface Hbck extends Abortable, Closeable {
@ -52,38 +49,22 @@ public interface Hbck extends Abortable, Closeable {
* -- good if many Regions to online -- and it will schedule the assigns even in the case where * -- good if many Regions to online -- and it will schedule the assigns even in the case where
* Master is initializing (as long as the ProcedureExecutor is up). Does NOT call Coprocessor * Master is initializing (as long as the ProcedureExecutor is up). Does NOT call Coprocessor
* hooks. * hooks.
* @param override You need to add the override for case where a region has previously been
* bypassed. When a Procedure has been bypassed, a Procedure will have completed
* but no other Procedure will be able to make progress on the target entity
* (intentionally). This override flag will override this fencing mechanism.
* @param encodedRegionNames Region encoded names; e.g. 1588230740 is the hard-coded encoding * @param encodedRegionNames Region encoded names; e.g. 1588230740 is the hard-coded encoding
* for hbase:meta region and de00010733901a05f5a2a3a382e27dd4 is an * for hbase:meta region and de00010733901a05f5a2a3a382e27dd4 is an
* example of what a random user-space encoded Region name looks like. * example of what a random user-space encoded Region name looks like.
*/ */
List<Long> assigns(List<String> encodedRegionNames, boolean override) throws IOException; List<Long> assigns(List<String> encodedRegionNames) throws IOException;
default List<Long> assigns(List<String> encodedRegionNames) throws IOException {
return assigns(encodedRegionNames, false);
}
/** /**
* Like {@link Admin#unassign(byte[], boolean)} but 'raw' in that it can do more than one Region * Like {@link Admin#unassign(byte[], boolean)} but 'raw' in that it can do more than one Region
* at a time -- good if many Regions to offline -- and it will schedule the assigns even in the * at a time -- good if many Regions to offline -- and it will schedule the assigns even in the
* case where Master is initializing (as long as the ProcedureExecutor is up). Does NOT call * case where Master is initializing (as long as the ProcedureExecutor is up). Does NOT call
* Coprocessor hooks. * Coprocessor hooks.
* @param override You need to add the override for case where a region has previously been
* bypassed. When a Procedure has been bypassed, a Procedure will have completed
* but no other Procedure will be able to make progress on the target entity
* (intentionally). This override flag will override this fencing mechanism.
* @param encodedRegionNames Region encoded names; e.g. 1588230740 is the hard-coded encoding * @param encodedRegionNames Region encoded names; e.g. 1588230740 is the hard-coded encoding
* for hbase:meta region and de00010733901a05f5a2a3a382e27dd4 is an * for hbase:meta region and de00010733901a05f5a2a3a382e27dd4 is an
* example of what a random user-space encoded Region name looks like. * example of what a random user-space encoded Region name looks like.
*/ */
List<Long> unassigns(List<String> encodedRegionNames, boolean override) throws IOException; List<Long> unassigns(List<String> encodedRegionNames) throws IOException;
default List<Long> unassigns(List<String> encodedRegionNames) throws IOException {
return unassigns(encodedRegionNames, false);
}
/** /**
* Bypass specified procedure and move it to completion. Procedure is marked completed but * Bypass specified procedure and move it to completion. Procedure is marked completed but
@ -92,12 +73,9 @@ public interface Hbck extends Abortable, Closeable {
* *
* @param pids of procedures to complete. * @param pids of procedures to complete.
* @param waitTime wait time in ms for acquiring lock for a procedure * @param waitTime wait time in ms for acquiring lock for a procedure
* @param override if override set to true, we will bypass the procedure even if it is executing. * @param force if force set to true, we will bypass the procedure even if it is executing.
* This is for procedures which can't break out during execution (bugs?). * This is for procedures which can't break out during execution (bugs?).
* @param recursive If set, if a parent procedure, we will find and bypass children and then
* the parent procedure (Dangerous but useful in case where child procedure has been 'lost').
* @return true if procedure is marked for bypass successfully, false otherwise * @return true if procedure is marked for bypass successfully, false otherwise
*/ */
List<Boolean> bypassProcedure(List<Long> pids, long waitTime, boolean override, boolean recursive) List<Boolean> bypassProcedure(List<Long> pids, long waitTime, boolean force) throws IOException;
throws IOException;
} }

View File

@ -1883,18 +1883,16 @@ public final class RequestConverter {
// HBCK2 // HBCK2
public static MasterProtos.AssignsRequest toAssignRegionsRequest( public static MasterProtos.AssignsRequest toAssignRegionsRequest(
List<String> encodedRegionNames, boolean override) { List<String> encodedRegionNames) {
MasterProtos.AssignsRequest.Builder b = MasterProtos.AssignsRequest.newBuilder(); MasterProtos.AssignsRequest.Builder b = MasterProtos.AssignsRequest.newBuilder();
return b.addAllRegion(toEncodedRegionNameRegionSpecifiers(encodedRegionNames)). return b.addAllRegion(toEncodedRegionNameRegionSpecifiers(encodedRegionNames)).build();
setOverride(override).build();
} }
public static MasterProtos.UnassignsRequest toUnassignRegionsRequest( public static MasterProtos.UnassignsRequest toUnassignRegionsRequest(
List<String> encodedRegionNames, boolean override) { List<String> encodedRegionNames) {
MasterProtos.UnassignsRequest.Builder b = MasterProtos.UnassignsRequest.Builder b =
MasterProtos.UnassignsRequest.newBuilder(); MasterProtos.UnassignsRequest.newBuilder();
return b.addAllRegion(toEncodedRegionNameRegionSpecifiers(encodedRegionNames)). return b.addAllRegion(toEncodedRegionNameRegionSpecifiers(encodedRegionNames)).build();
setOverride(override).build();
} }
private static List<RegionSpecifier> toEncodedRegionNameRegionSpecifiers( private static List<RegionSpecifier> toEncodedRegionNameRegionSpecifiers(

View File

@ -145,16 +145,12 @@ public abstract class Procedure<TEnvironment> implements Comparable<Procedure<TE
private boolean lockedWhenLoading = false; private boolean lockedWhenLoading = false;
/** /**
* Used for override complete of the procedure without actually doing any logic in the procedure. * Used for force complete of the procedure without
* actually doing any logic in the procedure.
* If bypass is set to true, when executing it will return null when * If bypass is set to true, when executing it will return null when
* {@link #doExecute(Object)} is called to finish the procedure and release any locks * {@link #doExecute(Object)} to finish the procedure and releasing any locks
* it may currently hold. The bypass does cleanup around the Procedure as far as the * it may currently hold.
* Procedure framework is concerned. It does not clean any internal state that the * Bypassing a procedure is not like aborting. Aborting a procedure will trigger
* Procedure's themselves may have set. That is for the Procedures to do themselves
* when bypass is called. They should override bypass and do their cleanup in the
* overridden bypass method (be sure to call the parent bypass to ensure proper
* processing).
* <p></p>Bypassing a procedure is not like aborting. Aborting a procedure will trigger
* a rollback. And since the {@link #abort(Object)} method is overrideable * a rollback. And since the {@link #abort(Object)} method is overrideable
* Some procedures may have chosen to ignore the aborting. * Some procedures may have chosen to ignore the aborting.
*/ */
@ -179,15 +175,12 @@ public abstract class Procedure<TEnvironment> implements Comparable<Procedure<TE
} }
/** /**
* Set the bypass to true. * set the bypass to true
* Only called in {@link ProcedureExecutor#bypassProcedure(long, long, boolean, boolean)}. * Only called in {@link ProcedureExecutor#bypassProcedure(long, long, boolean)} for now,
* DO NOT use this method alone, since we can't just bypass one single procedure. We need to * DO NOT use this method alone, since we can't just bypass
* bypass its ancestor too. If your Procedure has set state, it needs to undo it in here. * one single procedure. We need to bypass its ancestor too. So making it package private
* @param env Current environment. May be null because of context; e.g. pretty-printing
* procedure WALs where there is no 'environment' (and where Procedures that require
* an 'environment' won't be run.
*/ */
protected void bypass(TEnvironment env) { void bypass() {
this.bypass = true; this.bypass = true;
} }
@ -713,7 +706,7 @@ public abstract class Procedure<TEnvironment> implements Comparable<Procedure<TE
/** /**
* Will only be called when loading procedures from procedure store, where we need to record * Will only be called when loading procedures from procedure store, where we need to record
* whether the procedure has already held a lock. Later we will call * whether the procedure has already held a lock. Later we will call
* {@link #doAcquireLock(Object, ProcedureStore)} to actually acquire the lock. * {@link #doAcquireLock(Object)} to actually acquire the lock.
*/ */
final void lockedWhenLoading() { final void lockedWhenLoading() {
this.lockedWhenLoading = true; this.lockedWhenLoading = true;

View File

@ -306,7 +306,7 @@ public class ProcedureExecutor<TEnvironment> {
private Configuration conf; private Configuration conf;
/** /**
* Created in the {@link #init(int, boolean)} method. Destroyed in {@link #join()} (FIX! Doing * Created in the {@link #start(int, boolean)} method. Destroyed in {@link #join()} (FIX! Doing
* resource handling rather than observing in a #join is unexpected). * resource handling rather than observing in a #join is unexpected).
* Overridden when we do the ProcedureTestingUtility.testRecoveryAndDoubleExecution trickery * Overridden when we do the ProcedureTestingUtility.testRecoveryAndDoubleExecution trickery
* (Should be ok). * (Should be ok).
@ -314,7 +314,7 @@ public class ProcedureExecutor<TEnvironment> {
private ThreadGroup threadGroup; private ThreadGroup threadGroup;
/** /**
* Created in the {@link #init(int, boolean)} method. Terminated in {@link #join()} (FIX! Doing * Created in the {@link #start(int, boolean)} method. Terminated in {@link #join()} (FIX! Doing
* resource handling rather than observing in a #join is unexpected). * resource handling rather than observing in a #join is unexpected).
* Overridden when we do the ProcedureTestingUtility.testRecoveryAndDoubleExecution trickery * Overridden when we do the ProcedureTestingUtility.testRecoveryAndDoubleExecution trickery
* (Should be ok). * (Should be ok).
@ -322,7 +322,7 @@ public class ProcedureExecutor<TEnvironment> {
private CopyOnWriteArrayList<WorkerThread> workerThreads; private CopyOnWriteArrayList<WorkerThread> workerThreads;
/** /**
* Created in the {@link #init(int, boolean)} method. Terminated in {@link #join()} (FIX! Doing * Created in the {@link #start(int, boolean)} method. Terminated in {@link #join()} (FIX! Doing
* resource handling rather than observing in a #join is unexpected). * resource handling rather than observing in a #join is unexpected).
* Overridden when we do the ProcedureTestingUtility.testRecoveryAndDoubleExecution trickery * Overridden when we do the ProcedureTestingUtility.testRecoveryAndDoubleExecution trickery
* (Should be ok). * (Should be ok).
@ -966,7 +966,7 @@ public class ProcedureExecutor<TEnvironment> {
* Bypass a procedure. If the procedure is set to bypass, all the logic in * Bypass a procedure. If the procedure is set to bypass, all the logic in
* execute/rollback will be ignored and it will return success, whatever. * execute/rollback will be ignored and it will return success, whatever.
* It is used to recover buggy stuck procedures, releasing the lock resources * It is used to recover buggy stuck procedures, releasing the lock resources
* and letting other procedures run. Bypassing one procedure (and its ancestors will * and letting other procedures to run. Bypassing one procedure (and its ancestors will
* be bypassed automatically) may leave the cluster in a middle state, e.g. region * be bypassed automatically) may leave the cluster in a middle state, e.g. region
* not assigned, or some hdfs files left behind. After getting rid of those stuck procedures, * not assigned, or some hdfs files left behind. After getting rid of those stuck procedures,
* the operators may have to do some clean up on hdfs or schedule some assign procedures * the operators may have to do some clean up on hdfs or schedule some assign procedures
@ -993,38 +993,34 @@ public class ProcedureExecutor<TEnvironment> {
* there. We need to restart the master after bypassing, and letting the problematic * there. We need to restart the master after bypassing, and letting the problematic
* procedure to execute wth bypass=true, so in that condition, the procedure can be * procedure to execute wth bypass=true, so in that condition, the procedure can be
* successfully bypassed. * successfully bypassed.
* @param recursive We will do an expensive search for children of each pid. EXPENSIVE!
* @return true if bypass success * @return true if bypass success
* @throws IOException IOException * @throws IOException IOException
*/ */
public List<Boolean> bypassProcedure(List<Long> pids, long lockWait, boolean force, public List<Boolean> bypassProcedure(List<Long> pids, long lockWait, boolean force)
boolean recursive)
throws IOException { throws IOException {
List<Boolean> result = new ArrayList<Boolean>(pids.size()); List<Boolean> result = new ArrayList<Boolean>(pids.size());
for(long pid: pids) { for(long pid: pids) {
result.add(bypassProcedure(pid, lockWait, force, recursive)); result.add(bypassProcedure(pid, lockWait, force));
} }
return result; return result;
} }
boolean bypassProcedure(long pid, long lockWait, boolean override, boolean recursive) boolean bypassProcedure(long pid, long lockWait, boolean force) throws IOException {
throws IOException { Procedure<TEnvironment> procedure = getProcedure(pid);
final Procedure<TEnvironment> procedure = getProcedure(pid);
if (procedure == null) { if (procedure == null) {
LOG.debug("Procedure pid={} does not exist, skipping bypass", pid); LOG.debug("Procedure with id={} does not exist, skipping bypass", pid);
return false; return false;
} }
LOG.debug("Begin bypass {} with lockWait={}, override={}, recursive={}", LOG.debug("Begin bypass {} with lockWait={}, force={}", procedure, lockWait, force);
procedure, lockWait, override, recursive);
IdLock.Entry lockEntry = procExecutionLock.tryLockEntry(procedure.getProcId(), lockWait); IdLock.Entry lockEntry = procExecutionLock.tryLockEntry(procedure.getProcId(), lockWait);
if (lockEntry == null && !override) { if (lockEntry == null && !force) {
LOG.debug("Waited {} ms, but {} is still running, skipping bypass with force={}", LOG.debug("Waited {} ms, but {} is still running, skipping bypass with force={}",
lockWait, procedure, override); lockWait, procedure, force);
return false; return false;
} else if (lockEntry == null) { } else if (lockEntry == null) {
LOG.debug("Waited {} ms, but {} is still running, begin bypass with force={}", LOG.debug("Waited {} ms, but {} is still running, begin bypass with force={}",
lockWait, procedure, override); lockWait, procedure, force);
} }
try { try {
// check whether the procedure is already finished // check whether the procedure is already finished
@ -1034,29 +1030,8 @@ public class ProcedureExecutor<TEnvironment> {
} }
if (procedure.hasChildren()) { if (procedure.hasChildren()) {
if (recursive) { LOG.debug("{} has children, skipping bypass", procedure);
// EXPENSIVE. Checks each live procedure of which there could be many!!! return false;
// Is there another way to get children of a procedure?
LOG.info("Recursive bypass on children of pid={}", procedure.getProcId());
this.procedures.forEachValue(1 /*Single-threaded*/,
// Transformer
v -> {
return v.getParentProcId() == procedure.getProcId()? v: null;
},
// Consumer
v -> {
boolean result = false;
IOException ioe = null;
try {
result = bypassProcedure(v.getProcId(), lockWait, override, recursive);
} catch (IOException e) {
LOG.warn("Recursive bypass of pid={}", v.getProcId(), e);
}
});
} else {
LOG.debug("{} has children, skipping bypass", procedure);
return false;
}
} }
// If the procedure has no parent or no child, we are safe to bypass it in whatever state // If the procedure has no parent or no child, we are safe to bypass it in whatever state
@ -1066,7 +1041,6 @@ public class ProcedureExecutor<TEnvironment> {
LOG.debug("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states " LOG.debug("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states "
+ "(with no parent), {}", + "(with no parent), {}",
procedure); procedure);
// Question: how is the bypass done here?
return false; return false;
} }
@ -1076,7 +1050,7 @@ public class ProcedureExecutor<TEnvironment> {
Procedure current = procedure; Procedure current = procedure;
while (current != null) { while (current != null) {
LOG.debug("Bypassing {}", current); LOG.debug("Bypassing {}", current);
current.bypass(getEnvironment()); current.bypass();
store.update(procedure); store.update(procedure);
long parentID = current.getParentProcId(); long parentID = current.getParentProcId();
current = getProcedure(parentID); current = getProcedure(parentID);

View File

@ -272,7 +272,7 @@ public final class ProcedureUtil {
} }
if (proto.getBypass()) { if (proto.getBypass()) {
proc.bypass(null); proc.bypass();
} }
ProcedureStateSerializer serializer = null; ProcedureStateSerializer serializer = null;

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.ipc.RemoteException;
import org.apache.yetus.audience.InterfaceAudience; import org.apache.yetus.audience.InterfaceAudience;
import org.apache.yetus.audience.InterfaceStability;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ErrorHandlingProtos.ForeignExceptionMessage; import org.apache.hadoop.hbase.shaded.protobuf.generated.ErrorHandlingProtos.ForeignExceptionMessage;
import org.apache.hadoop.hbase.util.ForeignExceptionUtil; import org.apache.hadoop.hbase.util.ForeignExceptionUtil;
@ -36,6 +37,7 @@ import org.apache.hadoop.hbase.util.ForeignExceptionUtil;
* their stacks traces and messages overridden to reflect the original 'remote' exception. * their stacks traces and messages overridden to reflect the original 'remote' exception.
*/ */
@InterfaceAudience.Private @InterfaceAudience.Private
@InterfaceStability.Evolving
@SuppressWarnings("serial") @SuppressWarnings("serial")
public class RemoteProcedureException extends ProcedureException { public class RemoteProcedureException extends ProcedureException {
@ -72,10 +74,6 @@ public class RemoteProcedureException extends ProcedureException {
return new Exception(cause); return new Exception(cause);
} }
// NOTE: Does not throw DoNotRetryIOE because it does not
// have access (DNRIOE is in the client module). Use
// MasterProcedureUtil.unwrapRemoteIOException if need to
// throw DNRIOE.
public IOException unwrapRemoteIOException() { public IOException unwrapRemoteIOException() {
final Exception cause = unwrapRemoteException(); final Exception cause = unwrapRemoteException();
if (cause instanceof IOException) { if (cause instanceof IOException) {

View File

@ -87,7 +87,7 @@ public class TestProcedureBypass {
long id = procExecutor.submitProcedure(proc); long id = procExecutor.submitProcedure(proc);
Thread.sleep(500); Thread.sleep(500);
//bypass the procedure //bypass the procedure
assertTrue(procExecutor.bypassProcedure(id, 30000, false, false)); assertTrue(procExecutor.bypassProcedure(id, 30000, false));
htu.waitFor(5000, () -> proc.isSuccess() && proc.isBypass()); htu.waitFor(5000, () -> proc.isSuccess() && proc.isBypass());
LOG.info("{} finished", proc); LOG.info("{} finished", proc);
} }
@ -98,7 +98,7 @@ public class TestProcedureBypass {
long id = procExecutor.submitProcedure(proc); long id = procExecutor.submitProcedure(proc);
Thread.sleep(500); Thread.sleep(500);
//bypass the procedure //bypass the procedure
assertTrue(procExecutor.bypassProcedure(id, 1000, true, false)); assertTrue(procExecutor.bypassProcedure(id, 1000, true));
//Since the procedure is stuck there, we need to restart the executor to recovery. //Since the procedure is stuck there, we need to restart the executor to recovery.
ProcedureTestingUtility.restart(procExecutor); ProcedureTestingUtility.restart(procExecutor);
htu.waitFor(5000, () -> proc.isSuccess() && proc.isBypass()); htu.waitFor(5000, () -> proc.isSuccess() && proc.isBypass());
@ -114,24 +114,12 @@ public class TestProcedureBypass {
.size() > 0); .size() > 0);
SuspendProcedure suspendProcedure = (SuspendProcedure)procExecutor.getProcedures().stream() SuspendProcedure suspendProcedure = (SuspendProcedure)procExecutor.getProcedures().stream()
.filter(p -> p.getParentProcId() == rootId).collect(Collectors.toList()).get(0); .filter(p -> p.getParentProcId() == rootId).collect(Collectors.toList()).get(0);
assertTrue(procExecutor.bypassProcedure(suspendProcedure.getProcId(), 1000, false, false)); assertTrue(procExecutor.bypassProcedure(suspendProcedure.getProcId(), 1000, false));
htu.waitFor(5000, () -> proc.isSuccess() && proc.isBypass()); htu.waitFor(5000, () -> proc.isSuccess() && proc.isBypass());
LOG.info("{} finished", proc); LOG.info("{} finished", proc);
} }
@Test
public void testBypassingProcedureWithParentRecursive() throws Exception {
final RootProcedure proc = new RootProcedure();
long rootId = procExecutor.submitProcedure(proc);
htu.waitFor(5000, () -> procExecutor.getProcedures().stream()
.filter(p -> p.getParentProcId() == rootId).collect(Collectors.toList())
.size() > 0);
SuspendProcedure suspendProcedure = (SuspendProcedure)procExecutor.getProcedures().stream()
.filter(p -> p.getParentProcId() == rootId).collect(Collectors.toList()).get(0);
assertTrue(procExecutor.bypassProcedure(rootId, 1000, false, true));
htu.waitFor(5000, () -> proc.isSuccess() && proc.isBypass());
LOG.info("{} finished", proc);
}
@AfterClass @AfterClass
public static void tearDown() throws Exception { public static void tearDown() throws Exception {
@ -191,4 +179,7 @@ public class TestProcedureBypass {
} }
} }
} }
} }

View File

@ -99,7 +99,6 @@ message MergeTableRegionsResponse {
message AssignRegionRequest { message AssignRegionRequest {
required RegionSpecifier region = 1; required RegionSpecifier region = 1;
optional bool override = 2 [default = false];
} }
message AssignRegionResponse { message AssignRegionResponse {
@ -1006,7 +1005,6 @@ message SetTableStateInMetaRequest {
// Region at a time. // Region at a time.
message AssignsRequest { message AssignsRequest {
repeated RegionSpecifier region = 1; repeated RegionSpecifier region = 1;
optional bool override = 2 [default = false];
} }
/** Like Admin's AssignRegionResponse except it can /** Like Admin's AssignRegionResponse except it can
@ -1021,7 +1019,6 @@ message AssignsResponse {
*/ */
message UnassignsRequest { message UnassignsRequest {
repeated RegionSpecifier region = 1; repeated RegionSpecifier region = 1;
optional bool override = 2 [default = false];
} }
/** Like Admin's UnassignRegionResponse except it can /** Like Admin's UnassignRegionResponse except it can
@ -1034,8 +1031,7 @@ message UnassignsResponse {
message BypassProcedureRequest { message BypassProcedureRequest {
repeated uint64 proc_id = 1; repeated uint64 proc_id = 1;
optional uint64 waitTime = 2; // wait time in ms to acquire lock on a procedure optional uint64 waitTime = 2; // wait time in ms to acquire lock on a procedure
optional bool override = 3 [default = false]; // if true, procedure is marked for bypass even if its executing optional bool force = 3; // if true, procedure is marked for bypass even if its executing
optional bool recursive = 4;
} }
message BypassProcedureResponse { message BypassProcedureResponse {

View File

@ -330,7 +330,6 @@ message AssignRegionStateData {
optional ServerName target_server = 4; optional ServerName target_server = 4;
// Current attempt index used for expotential backoff when stuck // Current attempt index used for expotential backoff when stuck
optional int32 attempt = 5; optional int32 attempt = 5;
optional bool override = 6 [default = false];
} }
message UnassignRegionStateData { message UnassignRegionStateData {
@ -342,7 +341,6 @@ message UnassignRegionStateData {
// This is the server currently hosting the Region, the // This is the server currently hosting the Region, the
// server we will send the unassign rpc too. // server we will send the unassign rpc too.
optional ServerName hosting_server = 5; optional ServerName hosting_server = 5;
// We hijacked an old param named 'force' and use it as 'override'.
optional bool force = 4 [default = false]; optional bool force = 4 [default = false];
optional bool remove_after_unassigning = 6 [default = false]; optional bool remove_after_unassigning = 6 [default = false];
// Current attempt index used for expotential backoff when stuck // Current attempt index used for expotential backoff when stuck

View File

@ -65,7 +65,6 @@ import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.ServerListener; import org.apache.hadoop.hbase.master.ServerListener;
import org.apache.hadoop.hbase.master.TableStateManager; import org.apache.hadoop.hbase.master.TableStateManager;
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode; import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil;
import org.apache.hadoop.hbase.net.Address; import org.apache.hadoop.hbase.net.Address;
import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.protobuf.ProtobufMagic; import org.apache.hadoop.hbase.protobuf.ProtobufMagic;
@ -874,7 +873,7 @@ final class RSGroupInfoManagerImpl implements RSGroupInfoManager {
Procedure<?> result = masterServices.getMasterProcedureExecutor().getResult(procId); Procedure<?> result = masterServices.getMasterProcedureExecutor().getResult(procId);
if (result != null && result.isFailed()) { if (result != null && result.isFailed()) {
throw new IOException("Failed to create group table. " + throw new IOException("Failed to create group table. " +
MasterProcedureUtil.unwrapRemoteIOException(result)); result.getException().unwrapRemoteIOException());
} }
} }
} }

View File

@ -49,7 +49,6 @@ import org.apache.hadoop.hbase.master.LoadBalancer;
import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.RegionPlan; import org.apache.hadoop.hbase.master.RegionPlan;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.RegionStates;
import org.apache.hadoop.hbase.net.Address; import org.apache.hadoop.hbase.net.Address;
import org.apache.hadoop.hbase.rsgroup.RSGroupBasedLoadBalancer; import org.apache.hadoop.hbase.rsgroup.RSGroupBasedLoadBalancer;
import org.apache.hadoop.hbase.rsgroup.RSGroupInfo; import org.apache.hadoop.hbase.rsgroup.RSGroupInfo;
@ -570,8 +569,6 @@ public class TestRSGroupBasedLoadBalancer {
Mockito.when(services.getTableDescriptors()).thenReturn(tds); Mockito.when(services.getTableDescriptors()).thenReturn(tds);
AssignmentManager am = Mockito.mock(AssignmentManager.class); AssignmentManager am = Mockito.mock(AssignmentManager.class);
Mockito.when(services.getAssignmentManager()).thenReturn(am); Mockito.when(services.getAssignmentManager()).thenReturn(am);
RegionStates rss = Mockito.mock(RegionStates.class);
Mockito.when(am.getRegionStates()).thenReturn(rss);
return services; return services;
} }

View File

@ -978,7 +978,7 @@ public class HMaster extends HRegionServer implements MasterServices {
// as procedures run -- in particular SCPs for crashed servers... One should put up hbase:meta // as procedures run -- in particular SCPs for crashed servers... One should put up hbase:meta
// if it is down. It may take a while to come online. So, wait here until meta if for sure // if it is down. It may take a while to come online. So, wait here until meta if for sure
// available. Thats what waitUntilMetaOnline does. // available. Thats what waitUntilMetaOnline does.
if (!waitForMetaOnline()) { if (!waitUntilMetaOnline()) {
return; return;
} }
this.assignmentManager.joinCluster(); this.assignmentManager.joinCluster();
@ -1010,7 +1010,7 @@ public class HMaster extends HRegionServer implements MasterServices {
// Here we expect hbase:namespace to be online. See inside initClusterSchemaService. // Here we expect hbase:namespace to be online. See inside initClusterSchemaService.
// TODO: Fix this. Namespace is a pain being a sort-of system table. Fold it in to hbase:meta. // TODO: Fix this. Namespace is a pain being a sort-of system table. Fold it in to hbase:meta.
// isNamespace does like isMeta and waits until namespace is onlined before allowing progress. // isNamespace does like isMeta and waits until namespace is onlined before allowing progress.
if (!waitForNamespaceOnline()) { if (!waitUntilNamespaceOnline()) {
return; return;
} }
status.setStatus("Starting cluster schema service"); status.setStatus("Starting cluster schema service");
@ -1094,7 +1094,7 @@ public class HMaster extends HRegionServer implements MasterServices {
* and we will hold here until operator intervention. * and we will hold here until operator intervention.
*/ */
@VisibleForTesting @VisibleForTesting
public boolean waitForMetaOnline() throws InterruptedException { public boolean waitUntilMetaOnline() throws InterruptedException {
return isRegionOnline(RegionInfoBuilder.FIRST_META_REGIONINFO); return isRegionOnline(RegionInfoBuilder.FIRST_META_REGIONINFO);
} }
@ -1135,7 +1135,7 @@ public class HMaster extends HRegionServer implements MasterServices {
* @return True if namespace table is up/online. * @return True if namespace table is up/online.
*/ */
@VisibleForTesting @VisibleForTesting
public boolean waitForNamespaceOnline() throws InterruptedException { public boolean waitUntilNamespaceOnline() throws InterruptedException {
List<RegionInfo> ris = this.assignmentManager.getRegionStates(). List<RegionInfo> ris = this.assignmentManager.getRegionStates().
getRegionsOfTable(TableName.NAMESPACE_TABLE_NAME); getRegionsOfTable(TableName.NAMESPACE_TABLE_NAME);
if (ris.isEmpty()) { if (ris.isEmpty()) {

View File

@ -29,7 +29,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.Set; import java.util.Set;
import java.util.function.BiFunction; import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterMetricsBuilder; import org.apache.hadoop.hbase.ClusterMetricsBuilder;
@ -44,6 +44,7 @@ import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.UnknownRegionException; import org.apache.hadoop.hbase.UnknownRegionException;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.MasterSwitchType; import org.apache.hadoop.hbase.client.MasterSwitchType;
import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.client.RegionInfoBuilder;
@ -594,7 +595,7 @@ public class MasterRpcServices extends RSRpcServices
BalanceRequest request) throws ServiceException { BalanceRequest request) throws ServiceException {
try { try {
return BalanceResponse.newBuilder().setBalancerRan(master.balance( return BalanceResponse.newBuilder().setBalancerRan(master.balance(
request.hasForce()? request.getForce(): false)).build(); request.hasForce() ? request.getForce() : false)).build();
} catch (IOException ex) { } catch (IOException ex) {
throw new ServiceException(ex); throw new ServiceException(ex);
} }
@ -1180,8 +1181,7 @@ public class MasterRpcServices extends RSRpcServices
if (executor.isFinished(procId)) { if (executor.isFinished(procId)) {
builder.setState(GetProcedureResultResponse.State.FINISHED); builder.setState(GetProcedureResultResponse.State.FINISHED);
if (result.isFailed()) { if (result.isFailed()) {
IOException exception = IOException exception = result.getException().unwrapRemoteIOException();
MasterProcedureUtil.unwrapRemoteIOException(result);
builder.setException(ForeignExceptionUtil.toProtoForeignException(exception)); builder.setException(ForeignExceptionUtil.toProtoForeignException(exception));
} }
byte[] resultData = result.getResult(); byte[] resultData = result.getResult();
@ -2335,15 +2335,14 @@ public class MasterRpcServices extends RSRpcServices
* Submit the Procedure that gets created by <code>f</code> * Submit the Procedure that gets created by <code>f</code>
* @return pid of the submitted Procedure. * @return pid of the submitted Procedure.
*/ */
private long submitProcedure(HBaseProtos.RegionSpecifier rs, boolean override, private long submitProcedure(HBaseProtos.RegionSpecifier rs, Function<RegionInfo, Procedure> f)
BiFunction<RegionInfo, Boolean, Procedure> f)
throws UnknownRegionException { throws UnknownRegionException {
RegionInfo ri = getRegionInfo(rs); RegionInfo ri = getRegionInfo(rs);
long pid = Procedure.NO_PROC_ID; long pid = Procedure.NO_PROC_ID;
if (ri == null) { if (ri == null) {
LOG.warn("No RegionInfo found to match {}", rs); LOG.warn("No RegionInfo found to match {}", rs);
} else { } else {
pid = this.master.getMasterProcedureExecutor().submitProcedure(f.apply(ri, override)); pid = this.master.getMasterProcedureExecutor().submitProcedure(f.apply(ri));
} }
return pid; return pid;
} }
@ -2363,10 +2362,9 @@ public class MasterRpcServices extends RSRpcServices
MasterProtos.AssignsResponse.Builder responseBuilder = MasterProtos.AssignsResponse.Builder responseBuilder =
MasterProtos.AssignsResponse.newBuilder(); MasterProtos.AssignsResponse.newBuilder();
try { try {
boolean override = request.getOverride();
for (HBaseProtos.RegionSpecifier rs: request.getRegionList()) { for (HBaseProtos.RegionSpecifier rs: request.getRegionList()) {
long pid = submitProcedure(rs, override, long pid = submitProcedure(rs,
(r, b) -> this.master.getAssignmentManager().createAssignProcedure(r, b)); r -> this.master.getAssignmentManager().createAssignProcedure(r));
responseBuilder.addPid(pid); responseBuilder.addPid(pid);
} }
return responseBuilder.build(); return responseBuilder.build();
@ -2390,10 +2388,9 @@ public class MasterRpcServices extends RSRpcServices
MasterProtos.UnassignsResponse.Builder responseBuilder = MasterProtos.UnassignsResponse.Builder responseBuilder =
MasterProtos.UnassignsResponse.newBuilder(); MasterProtos.UnassignsResponse.newBuilder();
try { try {
boolean override = request.getOverride();
for (HBaseProtos.RegionSpecifier rs: request.getRegionList()) { for (HBaseProtos.RegionSpecifier rs: request.getRegionList()) {
long pid = submitProcedure(rs, override, long pid = submitProcedure(rs,
(r, b) -> this.master.getAssignmentManager().createUnassignProcedure(r, b)); ri -> this.master.getAssignmentManager().createUnassignProcedure(ri));
responseBuilder.addPid(pid); responseBuilder.addPid(pid);
} }
return responseBuilder.build(); return responseBuilder.build();
@ -2419,7 +2416,7 @@ public class MasterRpcServices extends RSRpcServices
try { try {
List<Boolean> ret = List<Boolean> ret =
master.getMasterProcedureExecutor().bypassProcedure(request.getProcIdList(), master.getMasterProcedureExecutor().bypassProcedure(request.getProcIdList(),
request.getWaitTime(), request.getOverride(), request.getRecursive()); request.getWaitTime(), request.getForce());
return MasterProtos.BypassProcedureResponse.newBuilder().addAllBypassed(ret).build(); return MasterProtos.BypassProcedureResponse.newBuilder().addAllBypassed(ret).build();
} catch (IOException e) { } catch (IOException e) {
throw new ServiceException(e); throw new ServiceException(e);

View File

@ -99,16 +99,12 @@ public class AssignProcedure extends RegionTransitionProcedure {
} }
public AssignProcedure(final RegionInfo regionInfo) { public AssignProcedure(final RegionInfo regionInfo) {
this(regionInfo, null); super(regionInfo);
this.targetServer = null;
} }
public AssignProcedure(final RegionInfo regionInfo, final ServerName destinationServer) { public AssignProcedure(final RegionInfo regionInfo, final ServerName destinationServer) {
this(regionInfo, destinationServer, false); super(regionInfo);
}
public AssignProcedure(final RegionInfo regionInfo, final ServerName destinationServer,
boolean override) {
super(regionInfo, override);
this.targetServer = destinationServer; this.targetServer = destinationServer;
} }
@ -142,9 +138,6 @@ public class AssignProcedure extends RegionTransitionProcedure {
if (getAttempt() > 0) { if (getAttempt() > 0) {
state.setAttempt(getAttempt()); state.setAttempt(getAttempt());
} }
if (isOverride()) {
state.setOverride(isOverride());
}
serializer.serialize(state.build()); serializer.serialize(state.build());
} }
@ -155,7 +148,6 @@ public class AssignProcedure extends RegionTransitionProcedure {
setTransitionState(state.getTransitionState()); setTransitionState(state.getTransitionState());
setRegionInfo(ProtobufUtil.toRegionInfo(state.getRegionInfo())); setRegionInfo(ProtobufUtil.toRegionInfo(state.getRegionInfo()));
forceNewPlan = state.getForceNewPlan(); forceNewPlan = state.getForceNewPlan();
setOverride(state.getOverride());
if (state.hasTargetServer()) { if (state.hasTargetServer()) {
this.targetServer = ProtobufUtil.toServerName(state.getTargetServer()); this.targetServer = ProtobufUtil.toServerName(state.getTargetServer());
} }

View File

@ -677,37 +677,26 @@ public class AssignmentManager implements ServerListener {
* Called by things like DisableTableProcedure to get a list of UnassignProcedure * Called by things like DisableTableProcedure to get a list of UnassignProcedure
* to unassign the regions of the table. * to unassign the regions of the table.
*/ */
public UnassignProcedure[] createUnassignProcedures(final TableName tableName) {
return createUnassignProcedures(regionStates.getTableRegionStateNodes(tableName));
}
public AssignProcedure createAssignProcedure(final RegionInfo regionInfo) { public AssignProcedure createAssignProcedure(final RegionInfo regionInfo) {
return createAssignProcedure(regionInfo, null, false); AssignProcedure proc = new AssignProcedure(regionInfo);
} proc.setOwner(getProcedureEnvironment().getRequestUser().getShortName());
return proc;
public AssignProcedure createAssignProcedure(final RegionInfo regionInfo, boolean override) {
return createAssignProcedure(regionInfo, null, override);
} }
public AssignProcedure createAssignProcedure(final RegionInfo regionInfo, public AssignProcedure createAssignProcedure(final RegionInfo regionInfo,
ServerName targetServer) { final ServerName targetServer) {
return createAssignProcedure(regionInfo, targetServer, false); AssignProcedure proc = new AssignProcedure(regionInfo, targetServer);
}
public AssignProcedure createAssignProcedure(final RegionInfo regionInfo,
final ServerName targetServer, boolean override) {
AssignProcedure proc = new AssignProcedure(regionInfo, targetServer, override);
proc.setOwner(getProcedureEnvironment().getRequestUser().getShortName()); proc.setOwner(getProcedureEnvironment().getRequestUser().getShortName());
return proc; return proc;
} }
public UnassignProcedure createUnassignProcedure(final RegionInfo regionInfo) { public UnassignProcedure createUnassignProcedure(final RegionInfo regionInfo) {
return createUnassignProcedure(regionInfo, null, false); return createUnassignProcedure(regionInfo, null, false);
}
public UnassignProcedure createUnassignProcedure(final RegionInfo regionInfo,
boolean override) {
return createUnassignProcedure(regionInfo, null, override);
}
public UnassignProcedure[] createUnassignProcedures(final TableName tableName) {
return createUnassignProcedures(regionStates.getTableRegionStateNodes(tableName));
} }
UnassignProcedure createUnassignProcedure(final RegionInfo regionInfo, UnassignProcedure createUnassignProcedure(final RegionInfo regionInfo,

View File

@ -111,12 +111,6 @@ public abstract class RegionTransitionProcedure
*/ */
private RegionInfo regionInfo; private RegionInfo regionInfo;
/**
* this data member must also be persisted.
* @see #regionInfo
*/
private boolean override;
/** /**
* Like {@link #regionInfo}, the expectation is that subclasses persist the value of this * Like {@link #regionInfo}, the expectation is that subclasses persist the value of this
* data member. It is used doing backoff when Procedure gets stuck. * data member. It is used doing backoff when Procedure gets stuck.
@ -126,9 +120,8 @@ public abstract class RegionTransitionProcedure
// Required by the Procedure framework to create the procedure on replay // Required by the Procedure framework to create the procedure on replay
public RegionTransitionProcedure() {} public RegionTransitionProcedure() {}
public RegionTransitionProcedure(final RegionInfo regionInfo, boolean override) { public RegionTransitionProcedure(final RegionInfo regionInfo) {
this.regionInfo = regionInfo; this.regionInfo = regionInfo;
this.override = override;
} }
@VisibleForTesting @VisibleForTesting
@ -141,7 +134,7 @@ public abstract class RegionTransitionProcedure
* {@link #deserializeStateData(ProcedureStateSerializer)} method. Expectation is that * {@link #deserializeStateData(ProcedureStateSerializer)} method. Expectation is that
* subclasses will persist `regioninfo` in their * subclasses will persist `regioninfo` in their
* {@link #serializeStateData(ProcedureStateSerializer)} method and then restore `regionInfo` on * {@link #serializeStateData(ProcedureStateSerializer)} method and then restore `regionInfo` on
* deserialization by calling this. * deserialization by calling.
*/ */
protected void setRegionInfo(final RegionInfo regionInfo) { protected void setRegionInfo(final RegionInfo regionInfo) {
this.regionInfo = regionInfo; this.regionInfo = regionInfo;
@ -149,21 +142,9 @@ public abstract class RegionTransitionProcedure
/** /**
* This setter is for subclasses to call in their * This setter is for subclasses to call in their
* {@link #deserializeStateData(ProcedureStateSerializer)} method. Expectation is that * {@link #deserializeStateData(ProcedureStateSerializer)} method.
* subclasses will persist `override` in their * @see #setRegionInfo(RegionInfo)
* {@link #serializeStateData(ProcedureStateSerializer)} method and then restore `override` on
* deserialization by calling this.
*/ */
protected void setOverride(boolean override) {
this.override = override;
}
/**
* This setter is for subclasses to call in their
* {@link #deserializeStateData(ProcedureStateSerializer)} method.
* @see #setRegionInfo(RegionInfo)
*/
protected void setAttempt(int attempt) { protected void setAttempt(int attempt) {
this.attempt = attempt; this.attempt = attempt;
} }
@ -189,11 +170,6 @@ public abstract class RegionTransitionProcedure
sb.append(getTableName()); sb.append(getTableName());
sb.append(", region="); sb.append(", region=");
sb.append(getRegionInfo() == null? null: getRegionInfo().getEncodedName()); sb.append(getRegionInfo() == null? null: getRegionInfo().getEncodedName());
if (isOverride()) {
// Only log if set.
sb.append(", override=");
sb.append(isOverride());
}
} }
public RegionStateNode getRegionState(final MasterProcedureEnv env) { public RegionStateNode getRegionState(final MasterProcedureEnv env) {
@ -332,19 +308,12 @@ public abstract class RegionTransitionProcedure
final AssignmentManager am = env.getAssignmentManager(); final AssignmentManager am = env.getAssignmentManager();
final RegionStateNode regionNode = getRegionState(env); final RegionStateNode regionNode = getRegionState(env);
if (!am.addRegionInTransition(regionNode, this)) { if (!am.addRegionInTransition(regionNode, this)) {
if (this.isOverride()) { String msg = String.format(
LOG.info("{} owned by pid={}, OVERRIDDEN by 'this' (pid={}, override=true).", "There is already another procedure running on this region this=%s owner=%s",
regionNode.getRegionInfo().getEncodedName(), this, regionNode.getProcedure());
regionNode.getProcedure().getProcId(), getProcId()); LOG.warn(msg + " " + this + "; " + regionNode.toShortString());
regionNode.unsetProcedure(regionNode.getProcedure()); setAbortFailure(getClass().getSimpleName(), msg);
} else { return null;
String msg = String.format("%s owned by pid=%d, CANNOT run 'this' (pid=%d).",
regionNode.getRegionInfo().getEncodedName(),
regionNode.getProcedure().getProcId(), getProcId());
LOG.warn(msg);
setAbortFailure(getClass().getSimpleName(), msg);
return null;
}
} }
try { try {
boolean retry; boolean retry;
@ -456,12 +425,8 @@ public abstract class RegionTransitionProcedure
// TODO: Revisit this and move it to the executor // TODO: Revisit this and move it to the executor
if (env.getProcedureScheduler().waitRegion(this, getRegionInfo())) { if (env.getProcedureScheduler().waitRegion(this, getRegionInfo())) {
try { try {
// Enable TRACE on this class to see lock dump. Can be really large when cluster is big LOG.debug(LockState.LOCK_EVENT_WAIT + " pid=" + getProcId() + " " +
// or big tables being enabled/disabled. env.getProcedureScheduler().dumpLocks());
if (LOG.isTraceEnabled()) {
LOG.trace("{} pid={} {}", LockState.LOCK_EVENT_WAIT, getProcId(),
env.getProcedureScheduler().dumpLocks());
}
} catch (IOException e) { } catch (IOException e) {
// ignore, just for logging // ignore, just for logging
} }
@ -504,23 +469,4 @@ public abstract class RegionTransitionProcedure
// should not be called for region operation until we modified the open/close region procedure // should not be called for region operation until we modified the open/close region procedure
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override
protected void bypass(MasterProcedureEnv env) {
// This override is just so I can write a note on how bypass is done in
// RTP. For RTP procedures -- i.e. assign/unassign -- if bypass is called,
// we intentionally do NOT cleanup our state. We leave a reference to the
// bypassed Procedure in the RegionStateNode. Doing this makes it so the
// RSN is in an odd state. The bypassed Procedure is finished but no one
// else can make progress on this RSN entity (see the #execute above where
// we check the RSN to see if an already registered procedure and if so,
// we exit without proceeding). This is done to intentionally block
// subsequent Procedures from running. Only a Procedure with the 'override' flag
// set can overwrite the RSN and make progress.
super.bypass(env);
}
boolean isOverride() {
return this.override;
}
} }

View File

@ -84,6 +84,10 @@ public class UnassignProcedure extends RegionTransitionProcedure {
*/ */
protected volatile ServerName destinationServer; protected volatile ServerName destinationServer;
// TODO: should this be in a reassign procedure?
// ...and keep unassign for 'disable' case?
private boolean force;
/** /**
* Whether deleting the region from in-memory states after unassigning the region. * Whether deleting the region from in-memory states after unassigning the region.
*/ */
@ -105,11 +109,12 @@ public class UnassignProcedure extends RegionTransitionProcedure {
} }
public UnassignProcedure(final RegionInfo regionInfo, final ServerName hostingServer, public UnassignProcedure(final RegionInfo regionInfo, final ServerName hostingServer,
final ServerName destinationServer, final boolean override, final ServerName destinationServer, final boolean force,
final boolean removeAfterUnassigning) { final boolean removeAfterUnassigning) {
super(regionInfo, override); super(regionInfo);
this.hostingServer = hostingServer; this.hostingServer = hostingServer;
this.destinationServer = destinationServer; this.destinationServer = destinationServer;
this.force = force;
this.removeAfterUnassigning = removeAfterUnassigning; this.removeAfterUnassigning = removeAfterUnassigning;
// we don't need REGION_TRANSITION_QUEUE, we jump directly to sending the request // we don't need REGION_TRANSITION_QUEUE, we jump directly to sending the request
@ -142,7 +147,7 @@ public class UnassignProcedure extends RegionTransitionProcedure {
if (this.destinationServer != null) { if (this.destinationServer != null) {
state.setDestinationServer(ProtobufUtil.toServerName(destinationServer)); state.setDestinationServer(ProtobufUtil.toServerName(destinationServer));
} }
if (isOverride()) { if (force) {
state.setForce(true); state.setForce(true);
} }
if (removeAfterUnassigning) { if (removeAfterUnassigning) {
@ -162,8 +167,7 @@ public class UnassignProcedure extends RegionTransitionProcedure {
setTransitionState(state.getTransitionState()); setTransitionState(state.getTransitionState());
setRegionInfo(ProtobufUtil.toRegionInfo(state.getRegionInfo())); setRegionInfo(ProtobufUtil.toRegionInfo(state.getRegionInfo()));
this.hostingServer = ProtobufUtil.toServerName(state.getHostingServer()); this.hostingServer = ProtobufUtil.toServerName(state.getHostingServer());
// The 'force' flag is the override flag in unassign. force = state.getForce();
setOverride(state.getForce());
if (state.hasDestinationServer()) { if (state.hasDestinationServer()) {
this.destinationServer = ProtobufUtil.toServerName(state.getDestinationServer()); this.destinationServer = ProtobufUtil.toServerName(state.getDestinationServer());
} }

View File

@ -51,8 +51,6 @@ import org.apache.hadoop.hbase.master.LoadBalancer;
import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.RackManager; import org.apache.hadoop.hbase.master.RackManager;
import org.apache.hadoop.hbase.master.RegionPlan; import org.apache.hadoop.hbase.master.RegionPlan;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.RegionStates;
import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.Action.Type; import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.Action.Type;
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting; import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hbase.thirdparty.com.google.common.base.Joiner; import org.apache.hbase.thirdparty.com.google.common.base.Joiner;
@ -1458,13 +1456,10 @@ public abstract class BaseLoadBalancer implements LoadBalancer {
ServerName oldServerName = entry.getValue(); ServerName oldServerName = entry.getValue();
// In the current set of regions even if one has region replica let us go with // In the current set of regions even if one has region replica let us go with
// getting the entire snapshot // getting the entire snapshot
if (this.services != null) { // for tests if (this.services != null && this.services.getAssignmentManager() != null) { // for tests
AssignmentManager am = this.services.getAssignmentManager(); if (!hasRegionReplica && this.services.getAssignmentManager().getRegionStates()
if (am != null) { .isReplicaAvailableForRegion(region)) {
RegionStates rss = am.getRegionStates(); hasRegionReplica = true;
if (!hasRegionReplica && rss.isReplicaAvailableForRegion(region)) {
hasRegionReplica = true;
}
} }
} }
List<ServerName> localServers = new ArrayList<>(); List<ServerName> localServers = new ArrayList<>();

View File

@ -19,12 +19,9 @@ package org.apache.hadoop.hbase.master.procedure;
import java.io.IOException; import java.io.IOException;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureException;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.NonceKey; import org.apache.hadoop.hbase.util.NonceKey;
@ -179,17 +176,4 @@ public final class MasterProcedureUtil {
public static int getServerPriority(ServerProcedureInterface proc) { public static int getServerPriority(ServerProcedureInterface proc) {
return proc.hasMetaTableRegion() ? 100 : 1; return proc.hasMetaTableRegion() ? 100 : 1;
} }
/**
* This is a version of unwrapRemoteIOException that can do DoNotRetryIOE.
* We need to throw DNRIOE to clients if a failed Procedure else they will
* keep trying. The default proc.getException().unwrapRemoteException
* doesn't have access to DNRIOE from the procedure2 module.
*/
public static IOException unwrapRemoteIOException(Procedure proc) {
Exception e = proc.getException().unwrapRemoteException();
// Do not retry ProcedureExceptions!
return (e instanceof ProcedureException)? new DoNotRetryIOException(e):
proc.getException().unwrapRemoteIOException();
}
} }

View File

@ -69,8 +69,7 @@ public class ProcedureDescriber {
description.put("LAST_UPDATE", new Date(proc.getLastUpdate())); description.put("LAST_UPDATE", new Date(proc.getLastUpdate()));
if (proc.isFailed()) { if (proc.isFailed()) {
description.put("ERRORS", description.put("ERRORS", proc.getException().unwrapRemoteIOException().getMessage());
MasterProcedureUtil.unwrapRemoteIOException(proc).getMessage());
} }
description.put("PARAMETERS", parametersToObject(proc)); description.put("PARAMETERS", parametersToObject(proc));

View File

@ -99,7 +99,7 @@ public abstract class ProcedurePrepareLatch {
@Override @Override
protected void countDown(final Procedure proc) { protected void countDown(final Procedure proc) {
if (proc.hasException()) { if (proc.hasException()) {
exception = MasterProcedureUtil.unwrapRemoteIOException(proc); exception = proc.getException().unwrapRemoteIOException();
} }
latch.countDown(); latch.countDown();
} }

View File

@ -160,10 +160,9 @@ public final class ProcedureSyncWait {
throw new IOException("The Master is Aborting"); throw new IOException("The Master is Aborting");
} }
// If the procedure fails, we should always have an exception captured. Throw it.
// Needs to be an IOE to get out of here.
if (proc.hasException()) { if (proc.hasException()) {
throw MasterProcedureUtil.unwrapRemoteIOException(proc); // If the procedure fails, we should always have an exception captured. Throw it.
throw proc.getException().unwrapRemoteIOException();
} else { } else {
return proc.getResult(); return proc.getResult();
} }

View File

@ -108,7 +108,7 @@ public class TestMetaTableAccessor {
@Test @Test
public void testIsMetaWhenAllHealthy() throws InterruptedException { public void testIsMetaWhenAllHealthy() throws InterruptedException {
HMaster m = UTIL.getMiniHBaseCluster().getMaster(); HMaster m = UTIL.getMiniHBaseCluster().getMaster();
assertTrue(m.waitForMetaOnline()); assertTrue(m.waitUntilMetaOnline());
} }
@Test @Test
@ -117,7 +117,7 @@ public class TestMetaTableAccessor {
int index = UTIL.getMiniHBaseCluster().getServerWithMeta(); int index = UTIL.getMiniHBaseCluster().getServerWithMeta();
HRegionServer rsWithMeta = UTIL.getMiniHBaseCluster().getRegionServer(index); HRegionServer rsWithMeta = UTIL.getMiniHBaseCluster().getRegionServer(index);
rsWithMeta.abort("TESTING"); rsWithMeta.abort("TESTING");
assertTrue(m.waitForMetaOnline()); assertTrue(m.waitUntilMetaOnline());
} }
/** /**

View File

@ -116,8 +116,7 @@ public class TestHbck {
//bypass the procedure //bypass the procedure
List<Long> pids = Arrays.<Long>asList(procId); List<Long> pids = Arrays.<Long>asList(procId);
List<Boolean> results = List<Boolean> results = TEST_UTIL.getHbck().bypassProcedure(pids, 30000, false);
TEST_UTIL.getHbck().bypassProcedure(pids, 30000, false, false);
assertTrue("Failed to by pass procedure!", results.get(0)); assertTrue("Failed to by pass procedure!", results.get(0));
TEST_UTIL.waitFor(5000, () -> proc.isSuccess() && proc.isBypass()); TEST_UTIL.waitFor(5000, () -> proc.isSuccess() && proc.isBypass());
LOG.info("{} finished", proc); LOG.info("{} finished", proc);

View File

@ -1,181 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master.assignment;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Tests bypass on a region assign/unassign
*/
@Category({LargeTests.class})
public class TestRegionBypass {
private final static Logger LOG = LoggerFactory.getLogger(TestRegionBypass.class);
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestRegionBypass.class);
@Rule
public TestName name = new TestName();
private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
private TableName tableName;
@BeforeClass
public static void startCluster() throws Exception {
TEST_UTIL.startMiniCluster(2);
}
@AfterClass
public static void stopCluster() throws Exception {
TEST_UTIL.shutdownMiniCluster();
}
@Before
public void before() throws IOException {
this.tableName = TableName.valueOf(this.name.getMethodName());
// Create a table. Has one region at least.
TEST_UTIL.createTable(this.tableName, Bytes.toBytes("cf"));
}
@Test
public void testBypass() throws IOException {
Admin admin = TEST_UTIL.getAdmin();
List<RegionInfo> regions = admin.getRegions(this.tableName);
for (RegionInfo ri: regions) {
admin.unassign(ri.getRegionName(), false);
}
List<Long> pids = new ArrayList<>(regions.size());
for (RegionInfo ri: regions) {
Procedure<MasterProcedureEnv> p = new StallingAssignProcedure(ri);
pids.add(TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor().
submitProcedure(p));
}
for (Long pid: pids) {
while (!TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor().isStarted(pid)) {
Thread.currentThread().yield();
}
}
// Call bypass on all. We should be stuck in the dispatch at this stage.
List<Procedure<MasterProcedureEnv>> ps =
TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor().getProcedures();
for (Procedure<MasterProcedureEnv> p: ps) {
if (p instanceof StallingAssignProcedure) {
List<Boolean> bs = TEST_UTIL.getHbck().
bypassProcedure(Arrays.<Long>asList(p.getProcId()), 0, false, false);
for (Boolean b: bs) {
LOG.info("BYPASSED {} {}", p.getProcId(), b);
}
}
}
// Countdown the latch so its not hanging out.
for (Procedure<MasterProcedureEnv> p: ps) {
if (p instanceof StallingAssignProcedure) {
((StallingAssignProcedure)p).latch.countDown();
}
}
// Try and assign WITHOUT override flag. Should fail!.
for (RegionInfo ri: regions) {
try {
admin.assign(ri.getRegionName());
} catch (Throwable dnrioe) {
// Expected
LOG.info("Expected {}", dnrioe);
}
}
while (!TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor().
getActiveProcIds().isEmpty()) {
Thread.currentThread().yield();
}
// Now assign with the override flag.
for (RegionInfo ri: regions) {
TEST_UTIL.getHbck().assigns(Arrays.<String>asList(ri.getEncodedName()), true);
}
while (!TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor().
getActiveProcIds().isEmpty()) {
Thread.currentThread().yield();
}
for (RegionInfo ri: regions) {
assertTrue(ri.toString(), TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().
getRegionStates().isRegionOnline(ri));
}
}
/**
* An AssignProcedure that Stalls just before the finish.
*/
public static class StallingAssignProcedure extends AssignProcedure {
public final CountDownLatch latch = new CountDownLatch(2);
public StallingAssignProcedure() {
super();
}
public StallingAssignProcedure(RegionInfo regionInfo) {
super(regionInfo);
}
@Override
void setTransitionState(MasterProcedureProtos.RegionTransitionState state) {
if (state == MasterProcedureProtos.RegionTransitionState.REGION_TRANSITION_DISPATCH) {
try {
LOG.info("LATCH2 {}", this.latch.getCount());
this.latch.await();
LOG.info("LATCH3 {}", this.latch.getCount());
} catch (InterruptedException e) {
e.printStackTrace();
}
} else if (state == MasterProcedureProtos.RegionTransitionState.REGION_TRANSITION_QUEUE) {
// Set latch.
LOG.info("LATCH1 {}", this.latch.getCount());
this.latch.countDown();
}
super.setTransitionState(state);
}
}
}

View File

@ -31,11 +31,12 @@ EOF
end end
def command def command
formatter.header(%w[PID Name State Submitted Last_Update Parameters]) formatter.header(%w[Id Name State Submitted_Time Last_Update Parameters])
list = JSON.parse(admin.list_procedures) list = JSON.parse(admin.list_procedures)
list.each do |proc| list.each do |proc|
submitted_time = Time.at(Integer(proc['submittedTime'])/1000).to_s submitted_time = Time.at(Integer(proc['submittedTime']) / 1000).to_s
last_update = Time.at(Integer(proc['lastUpdate'])/1000).to_s last_update = Time.at(Integer(proc['lastUpdate']) / 1000).to_s
formatter.row([proc['procId'], proc['className'], proc['state'], formatter.row([proc['procId'], proc['className'], proc['state'],
submitted_time, last_update, proc['stateMessage']]) submitted_time, last_update, proc['stateMessage']])
end end

View File

@ -39,4 +39,5 @@ public class TestShell extends AbstractTestShell {
// Start all ruby tests // Start all ruby tests
jruby.runScriptlet(PathType.ABSOLUTE, "src/test/ruby/tests_runner.rb"); jruby.runScriptlet(PathType.ABSOLUTE, "src/test/ruby/tests_runner.rb");
} }
} }

View File

@ -85,9 +85,9 @@ class ShellTest < Test::Unit::TestCase
define_test "Shell::Shell interactive mode should not throw" do define_test "Shell::Shell interactive mode should not throw" do
# incorrect number of arguments # incorrect number of arguments
@shell.command('create', 'nothrow_table') @shell.command('create', 'foo')
@shell.command('create', 'nothrow_table', 'family_1') @shell.command('create', 'foo', 'family_1')
# create a table that exists # create a table that exists
@shell.command('create', 'nothrow_table', 'family_1') @shell.command('create', 'foo', 'family_1')
end end
end end