HBASE-16008 A robust way to deal with early termination of HBCK (Stephen Yuan Jiang)
This commit is contained in:
parent
276acdb0b0
commit
a8dd359d7e
|
@ -839,6 +839,13 @@ public interface Admin extends Abortable, Closeable {
|
|||
*/
|
||||
void stopMaster() throws IOException;
|
||||
|
||||
/**
|
||||
* Check whether Master is in maintenance mode
|
||||
* @return true if the Master is in maintenance mode, false otherwise
|
||||
* @throws IOException if a remote or network exception occurs
|
||||
*/
|
||||
boolean isMasterInMaintenanceMode() throws IOException;
|
||||
|
||||
/**
|
||||
* Stop the designated regionserver
|
||||
*
|
||||
|
|
|
@ -1862,6 +1862,13 @@ class ConnectionManager {
|
|||
return stub.stopMaster(controller, request);
|
||||
}
|
||||
|
||||
@Override
|
||||
public MasterProtos.IsInMaintenanceModeResponse isMasterInMaintenanceMode(
|
||||
final RpcController controller,
|
||||
final MasterProtos.IsInMaintenanceModeRequest request) throws ServiceException {
|
||||
return stub.isMasterInMaintenanceMode(controller, request);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BalanceResponse balance(RpcController controller,
|
||||
BalanceRequest request) throws ServiceException {
|
||||
|
|
|
@ -125,6 +125,8 @@ import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterSta
|
|||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsRequest;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsResponse;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableNamesRequest;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsInMaintenanceModeRequest;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsInMaintenanceModeResponse;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneRequest;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneResponse;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneRequest;
|
||||
|
@ -2933,6 +2935,19 @@ public class HBaseAdmin implements Admin {
|
|||
* @return cluster status
|
||||
* @throws IOException if a remote or network exception occurs
|
||||
*/
|
||||
@Override
|
||||
public boolean isMasterInMaintenanceMode() throws IOException {
|
||||
return executeCallable(new MasterCallable<IsInMaintenanceModeResponse>(getConnection()) {
|
||||
@Override
|
||||
public IsInMaintenanceModeResponse call(int callTimeout) throws ServiceException {
|
||||
PayloadCarryingRpcController controller = rpcControllerFactory.newController();
|
||||
controller.setCallTimeout(callTimeout);
|
||||
return master.isMasterInMaintenanceMode(
|
||||
controller, IsInMaintenanceModeRequest.newBuilder().build());
|
||||
}
|
||||
}).getInMaintenanceMode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ClusterStatus getClusterStatus() throws IOException {
|
||||
return executeCallable(new MasterCallable<ClusterStatus>(getConnection()) {
|
||||
|
|
|
@ -126,6 +126,8 @@ public class ZooKeeperWatcher implements Watcher, Abortable, Closeable {
|
|||
public String recoveringRegionsZNode;
|
||||
// znode containing namespace descriptors
|
||||
public static String namespaceZNode = "namespace";
|
||||
// znode indicating master maintenance mode
|
||||
public static String masterMaintZNode = "masterMaintenance";
|
||||
|
||||
// Certain ZooKeeper nodes need to be world-readable
|
||||
public static final ArrayList<ACL> CREATOR_ALL_AND_WORLD_READABLE =
|
||||
|
@ -207,6 +209,7 @@ public class ZooKeeperWatcher implements Watcher, Abortable, Closeable {
|
|||
ZKUtil.createAndFailSilent(this, backupMasterAddressesZNode);
|
||||
ZKUtil.createAndFailSilent(this, tableLockZNode);
|
||||
ZKUtil.createAndFailSilent(this, recoveringRegionsZNode);
|
||||
ZKUtil.createAndFailSilent(this, masterMaintZNode);
|
||||
} catch (KeeperException e) {
|
||||
throw new ZooKeeperConnectionException(
|
||||
prefix("Unexpected KeeperException creating base node"), e);
|
||||
|
@ -457,6 +460,8 @@ public class ZooKeeperWatcher implements Watcher, Abortable, Closeable {
|
|||
conf.get("zookeeper.znode.recovering.regions", "recovering-regions"));
|
||||
namespaceZNode = ZKUtil.joinZNode(baseZNode,
|
||||
conf.get("zookeeper.znode.namespace", "namespace"));
|
||||
masterMaintZNode = ZKUtil.joinZNode(baseZNode,
|
||||
conf.get("zookeeper.znode.masterMaintenance", "master-maintenance"));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -247,6 +247,13 @@ message StopMasterRequest {
|
|||
message StopMasterResponse {
|
||||
}
|
||||
|
||||
message IsInMaintenanceModeRequest {
|
||||
}
|
||||
|
||||
message IsInMaintenanceModeResponse {
|
||||
required bool inMaintenanceMode = 1;
|
||||
}
|
||||
|
||||
message BalanceRequest {
|
||||
optional bool force = 1;
|
||||
}
|
||||
|
@ -618,6 +625,12 @@ service MasterService {
|
|||
rpc StopMaster(StopMasterRequest)
|
||||
returns(StopMasterResponse);
|
||||
|
||||
/**
|
||||
* Query whether the Master is in maintenance mode.
|
||||
*/
|
||||
rpc IsMasterInMaintenanceMode(IsInMaintenanceModeRequest)
|
||||
returns(IsInMaintenanceModeResponse);
|
||||
|
||||
/**
|
||||
* Run the balancer. Will run the balancer and if regions to move, it will
|
||||
* go ahead and do the reassignments. Can NOT run for various reasons.
|
||||
|
|
|
@ -111,6 +111,7 @@ public class CatalogJanitor extends ScheduledChore {
|
|||
try {
|
||||
AssignmentManager am = this.services.getAssignmentManager();
|
||||
if (this.enabled.get()
|
||||
&& !this.services.isInMaintenanceMode()
|
||||
&& am != null
|
||||
&& am.isFailoverCleanupDone()
|
||||
&& am.getRegionStates().getRegionsInTransition().size() == 0) {
|
||||
|
@ -242,6 +243,11 @@ public class CatalogJanitor extends ScheduledChore {
|
|||
int mergeCleaned = 0;
|
||||
Map<HRegionInfo, Result> mergedRegions = scanTriple.getSecond();
|
||||
for (Map.Entry<HRegionInfo, Result> e : mergedRegions.entrySet()) {
|
||||
if (this.services.isInMaintenanceMode()) {
|
||||
// Stop cleaning if the master is in maintenance mode
|
||||
break;
|
||||
}
|
||||
|
||||
HRegionInfo regionA = HRegionInfo.getHRegionInfo(e.getValue(),
|
||||
HConstants.MERGEA_QUALIFIER);
|
||||
HRegionInfo regionB = HRegionInfo.getHRegionInfo(e.getValue(),
|
||||
|
@ -268,6 +274,11 @@ public class CatalogJanitor extends ScheduledChore {
|
|||
// regions whose parents are still around
|
||||
HashSet<String> parentNotCleaned = new HashSet<String>();
|
||||
for (Map.Entry<HRegionInfo, Result> e : splitParents.entrySet()) {
|
||||
if (this.services.isInMaintenanceMode()) {
|
||||
// Stop cleaning if the master is in maintenance mode
|
||||
break;
|
||||
}
|
||||
|
||||
if (!parentNotCleaned.contains(e.getKey().getEncodedName()) &&
|
||||
cleanParent(e.getKey(), e.getValue())) {
|
||||
splitCleaned++;
|
||||
|
|
|
@ -158,6 +158,7 @@ import org.apache.hadoop.hbase.util.VersionInfo;
|
|||
import org.apache.hadoop.hbase.zookeeper.DrainingServerTracker;
|
||||
import org.apache.hadoop.hbase.zookeeper.LoadBalancerTracker;
|
||||
import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
|
||||
import org.apache.hadoop.hbase.zookeeper.MasterMaintenanceModeTracker;
|
||||
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
|
||||
import org.apache.hadoop.hbase.zookeeper.RegionNormalizerTracker;
|
||||
import org.apache.hadoop.hbase.zookeeper.RegionServerTracker;
|
||||
|
@ -269,6 +270,9 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
|
|||
/** Namespace stuff */
|
||||
private TableNamespaceManager tableNamespaceManager;
|
||||
|
||||
//Tracker for master maintenance mode setting
|
||||
private MasterMaintenanceModeTracker maintenanceModeTracker;
|
||||
|
||||
// Metrics for the HMaster
|
||||
final MetricsMaster metricsMaster;
|
||||
// file system manager for the master FS operations
|
||||
|
@ -616,6 +620,9 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
|
|||
this.serverManager);
|
||||
this.drainingServerTracker.start();
|
||||
|
||||
this.maintenanceModeTracker = new MasterMaintenanceModeTracker(zooKeeper);
|
||||
this.maintenanceModeTracker.start();
|
||||
|
||||
// Set the cluster as up. If new RSs, they'll be waiting on this before
|
||||
// going ahead with their startup.
|
||||
boolean wasUp = this.clusterStatusTracker.isClusterUp();
|
||||
|
@ -1292,6 +1299,12 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
|
|||
LOG.debug("Master has not been initialized, don't run balancer.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (isInMaintenanceMode()) {
|
||||
LOG.info("Master is in maintenanceMode mode, don't run balancer.");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Do this call outside of synchronized block.
|
||||
int maximumBalanceTime = getBalancerCutoffTime();
|
||||
synchronized (this.balancer) {
|
||||
|
@ -1390,6 +1403,11 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
|
|||
return false;
|
||||
}
|
||||
|
||||
if (isInMaintenanceMode()) {
|
||||
LOG.info("Master is in maintenance mode, don't run region normalizer.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!this.regionNormalizerTracker.isNormalizerOn()) {
|
||||
LOG.debug("Region normalization is disabled, don't run region normalizer.");
|
||||
return false;
|
||||
|
@ -1404,6 +1422,11 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
|
|||
Collections.shuffle(allEnabledTables);
|
||||
|
||||
for (TableName table : allEnabledTables) {
|
||||
if (isInMaintenanceMode()) {
|
||||
LOG.debug("Master is in maintenance mode, stop running region normalizer.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (quotaManager.getNamespaceQuotaManager() != null &&
|
||||
quotaManager.getNamespaceQuotaManager().getState(table.getNamespaceAsString()) != null){
|
||||
LOG.debug("Skipping normalizing " + table + " since its namespace has quota");
|
||||
|
@ -2384,6 +2407,16 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
|
|||
return initialized.isReady();
|
||||
}
|
||||
|
||||
/**
|
||||
* Report whether this master is in maintenance mode.
|
||||
*
|
||||
* @return true if master is in maintenanceMode
|
||||
*/
|
||||
@Override
|
||||
public boolean isInMaintenanceMode() {
|
||||
return maintenanceModeTracker.isInMaintenanceMode();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public void setInitialized(boolean isInitialized) {
|
||||
procedureExecutor.getEnvironment().setEventReady(initialized, isInitialized);
|
||||
|
@ -2847,7 +2880,9 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
|
|||
* @return The state of the load balancer, or false if the load balancer isn't defined.
|
||||
*/
|
||||
public boolean isBalancerOn() {
|
||||
if (null == loadBalancerTracker) return false;
|
||||
if (null == loadBalancerTracker || isInMaintenanceMode()) {
|
||||
return false;
|
||||
}
|
||||
return loadBalancerTracker.isBalancerOn();
|
||||
}
|
||||
|
||||
|
@ -2855,14 +2890,11 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
|
|||
* Queries the state of the {@link RegionNormalizerTracker}. If it's not initialized,
|
||||
* false is returned.
|
||||
*/
|
||||
public boolean isNormalizerOn() {
|
||||
if (null == regionNormalizerTracker) {
|
||||
return false;
|
||||
}
|
||||
return regionNormalizerTracker.isNormalizerOn();
|
||||
public boolean isNormalizerOn() {
|
||||
return (null == regionNormalizerTracker || isInMaintenanceMode()) ?
|
||||
false: regionNormalizerTracker.isNormalizerOn();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Queries the state of the {@link SplitOrMergeTracker}. If it is not initialized,
|
||||
* false is returned. If switchType is illegal, false will return.
|
||||
|
@ -2870,7 +2902,7 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
|
|||
* @return The state of the switch
|
||||
*/
|
||||
public boolean isSplitOrMergeEnabled(Admin.MasterSwitchType switchType) {
|
||||
if (null == splitOrMergeTracker) {
|
||||
if (null == splitOrMergeTracker || isInMaintenanceMode()) {
|
||||
return false;
|
||||
}
|
||||
return splitOrMergeTracker.isSplitOrMergeEnabled(switchType);
|
||||
|
|
|
@ -109,6 +109,8 @@ import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsBalancerEnabled
|
|||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsBalancerEnabledResponse;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledRequest;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsCatalogJanitorEnabledResponse;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsInMaintenanceModeRequest;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsInMaintenanceModeResponse;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningRequest;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsMasterRunningResponse;
|
||||
import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsNormalizerEnabledRequest;
|
||||
|
@ -1358,6 +1360,15 @@ public class MasterRpcServices extends RSRpcServices
|
|||
return StopMasterResponse.newBuilder().build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public IsInMaintenanceModeResponse isMasterInMaintenanceMode(
|
||||
final RpcController controller,
|
||||
final IsInMaintenanceModeRequest request) throws ServiceException {
|
||||
IsInMaintenanceModeResponse.Builder response = IsInMaintenanceModeResponse.newBuilder();
|
||||
response.setInMaintenanceMode(master.isInMaintenanceMode());
|
||||
return response.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public UnassignRegionResponse unassignRegion(RpcController controller,
|
||||
UnassignRegionRequest req) throws ServiceException {
|
||||
|
|
|
@ -330,6 +330,11 @@ public interface MasterServices extends Server {
|
|||
final long nonceGroup,
|
||||
final long nonce) throws IOException;
|
||||
|
||||
/**
|
||||
* @return true if master is in maintenance mode
|
||||
*/
|
||||
boolean isInMaintenanceMode();
|
||||
|
||||
/**
|
||||
* Abort a procedure.
|
||||
* @param procId ID of the procedure
|
||||
|
|
|
@ -211,6 +211,9 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
// AlreadyBeingCreatedException, which implies a timeout on this operation of up to
|
||||
// HdfsConstants.LEASE_SOFTLIMIT_PERIOD (60 seconds).
|
||||
private static final int DEFAULT_WAIT_FOR_LOCK_TIMEOUT = 80; // seconds
|
||||
private static final int DEFAULT_MAX_CREATE_ZNODE_ATTEMPTS = 5;
|
||||
private static final int DEFAULT_CREATE_ZNODE_ATTEMPT_SLEEP_INTERVAL = 200; // milliseconds
|
||||
private static final int DEFAULT_CREATE_ZNODE_ATTEMPT_MAX_SLEEP_TIME = 5000; // milliseconds
|
||||
|
||||
/**********************
|
||||
* Internal resources
|
||||
|
@ -238,8 +241,6 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
private static boolean details = false; // do we display the full report
|
||||
private long timelag = DEFAULT_TIME_LAG; // tables whose modtime is older
|
||||
private static boolean forceExclusive = false; // only this hbck can modify HBase
|
||||
private static boolean disableBalancer = false; // disable load balancer to keep regions stable
|
||||
private static boolean disableSplitAndMerge = false; // disable split and merge
|
||||
private boolean fixAssignments = false; // fix assignment errors?
|
||||
private boolean fixMeta = false; // fix meta errors?
|
||||
private boolean checkHdfs = true; // load and check fs consistency?
|
||||
|
@ -315,7 +316,11 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
*/
|
||||
private Set<TableName> orphanedTableZNodes = new HashSet<TableName>();
|
||||
private final RetryCounterFactory lockFileRetryCounterFactory;
|
||||
|
||||
private final RetryCounterFactory createZNodeRetryCounterFactory;
|
||||
|
||||
private ZooKeeperWatcher zkw = null;
|
||||
private String hbckEphemeralNodePath = null;
|
||||
private boolean hbckZodeCreated = false;
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
|
@ -355,6 +360,15 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
"hbase.hbck.lockfile.attempt.sleep.interval", DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL),
|
||||
getConf().getInt(
|
||||
"hbase.hbck.lockfile.attempt.maxsleeptime", DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME));
|
||||
createZNodeRetryCounterFactory = new RetryCounterFactory(
|
||||
getConf().getInt("hbase.hbck.createznode.attempts", DEFAULT_MAX_CREATE_ZNODE_ATTEMPTS),
|
||||
getConf().getInt(
|
||||
"hbase.hbck.createznode.attempt.sleep.interval",
|
||||
DEFAULT_CREATE_ZNODE_ATTEMPT_SLEEP_INTERVAL),
|
||||
getConf().getInt(
|
||||
"hbase.hbck.createznode.attempt.maxsleeptime",
|
||||
DEFAULT_CREATE_ZNODE_ATTEMPT_MAX_SLEEP_TIME));
|
||||
zkw = createZooKeeperWatcher();
|
||||
}
|
||||
|
||||
private class FileLockCallable implements Callable<FSDataOutputStream> {
|
||||
|
@ -503,6 +517,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
@Override
|
||||
public void run() {
|
||||
IOUtils.closeQuietly(HBaseFsck.this);
|
||||
cleanupHbckZnode();
|
||||
unlockHbck();
|
||||
}
|
||||
});
|
||||
|
@ -681,49 +696,78 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
return errors.getErrorList().size();
|
||||
}
|
||||
|
||||
/**
|
||||
* This method maintains an ephemeral znode. If the creation fails we return false or throw
|
||||
* exception
|
||||
*
|
||||
* @return true if creating znode succeeds; false otherwise
|
||||
* @throws IOException if IO failure occurs
|
||||
*/
|
||||
private boolean setMasterInMaintenanceMode() throws IOException {
|
||||
RetryCounter retryCounter = createZNodeRetryCounterFactory.create();
|
||||
hbckEphemeralNodePath = ZKUtil.joinZNode(
|
||||
ZooKeeperWatcher.masterMaintZNode,
|
||||
"hbck-" + Long.toString(EnvironmentEdgeManager.currentTime()));
|
||||
do {
|
||||
try {
|
||||
hbckZodeCreated = ZKUtil.createEphemeralNodeAndWatch(zkw, hbckEphemeralNodePath, null);
|
||||
if (hbckZodeCreated) {
|
||||
break;
|
||||
}
|
||||
} catch (KeeperException e) {
|
||||
if (retryCounter.getAttemptTimes() >= retryCounter.getMaxAttempts()) {
|
||||
throw new IOException("Can't create znode " + hbckEphemeralNodePath, e);
|
||||
}
|
||||
// fall through and retry
|
||||
}
|
||||
|
||||
LOG.warn("Fail to create znode " + hbckEphemeralNodePath + ", try=" +
|
||||
(retryCounter.getAttemptTimes() + 1) + " of " + retryCounter.getMaxAttempts());
|
||||
|
||||
try {
|
||||
retryCounter.sleepUntilNextRetry();
|
||||
} catch (InterruptedException ie) {
|
||||
throw (InterruptedIOException) new InterruptedIOException(
|
||||
"Can't create znode " + hbckEphemeralNodePath).initCause(ie);
|
||||
}
|
||||
} while (retryCounter.shouldRetry());
|
||||
return hbckZodeCreated;
|
||||
}
|
||||
|
||||
private void cleanupHbckZnode() {
|
||||
try {
|
||||
if (zkw != null && hbckZodeCreated) {
|
||||
ZKUtil.deleteNode(zkw, hbckEphemeralNodePath);
|
||||
hbckZodeCreated = false;
|
||||
}
|
||||
} catch (KeeperException e) {
|
||||
// Ignore
|
||||
if (!e.code().equals(KeeperException.Code.NONODE)) {
|
||||
LOG.warn("Delete HBCK znode " + hbckEphemeralNodePath + " failed ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Contacts the master and prints out cluster-wide information
|
||||
* @return 0 on success, non-zero on failure
|
||||
*/
|
||||
public int onlineHbck() throws IOException, KeeperException, InterruptedException, ServiceException {
|
||||
public int onlineHbck()
|
||||
throws IOException, KeeperException, InterruptedException, ServiceException {
|
||||
// print hbase server version
|
||||
errors.print("Version: " + status.getHBaseVersion());
|
||||
offlineHdfsIntegrityRepair();
|
||||
|
||||
boolean oldBalancer = false;
|
||||
if (shouldDisableBalancer()) {
|
||||
oldBalancer = admin.setBalancerRunning(false, true);
|
||||
}
|
||||
boolean[] oldSplitAndMerge = null;
|
||||
if (shouldDisableSplitAndMerge()) {
|
||||
oldSplitAndMerge = admin.setSplitOrMergeEnabled(false, false,
|
||||
Admin.MasterSwitchType.SPLIT, Admin.MasterSwitchType.MERGE);
|
||||
// If Master runs maintenance tasks (such as balancer, catalog janitor, etc) during online
|
||||
// hbck, it is likely that hbck would be misled and report transient errors. Therefore, it
|
||||
// is better to set Master into maintenance mode during online hbck.
|
||||
//
|
||||
if (!setMasterInMaintenanceMode()) {
|
||||
LOG.warn("HBCK is running while master is not in maintenance mode, you might see transient "
|
||||
+ "error. Please run HBCK multiple times to reduce the chance of transient error.");
|
||||
}
|
||||
|
||||
try {
|
||||
onlineConsistencyRepair();
|
||||
}
|
||||
finally {
|
||||
// Only restore the balancer if it was true when we started repairing and
|
||||
// we actually disabled it. Otherwise, we might clobber another run of
|
||||
// hbck that has just restored it.
|
||||
if (shouldDisableBalancer() && oldBalancer) {
|
||||
admin.setBalancerRunning(oldBalancer, false);
|
||||
}
|
||||
|
||||
if (shouldDisableSplitAndMerge()) {
|
||||
if (oldSplitAndMerge != null) {
|
||||
if (oldSplitAndMerge[0] && oldSplitAndMerge[1]) {
|
||||
admin.setSplitOrMergeEnabled(true, false,
|
||||
Admin.MasterSwitchType.SPLIT, Admin.MasterSwitchType.MERGE);
|
||||
} else if (oldSplitAndMerge[0]) {
|
||||
admin.setSplitOrMergeEnabled(true, false, Admin.MasterSwitchType.SPLIT);
|
||||
} else if (oldSplitAndMerge[1]) {
|
||||
admin.setSplitOrMergeEnabled(true, false, Admin.MasterSwitchType.MERGE);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
onlineConsistencyRepair();
|
||||
|
||||
if (checkRegionBoundaries) {
|
||||
checkRegionBoundaries();
|
||||
|
@ -738,6 +782,9 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
|
||||
checkAndFixReplication();
|
||||
|
||||
// Remove the hbck znode
|
||||
cleanupHbckZnode();
|
||||
|
||||
// Remove the hbck lock
|
||||
unlockHbck();
|
||||
|
||||
|
@ -757,9 +804,20 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
IOUtils.closeQuietly(admin);
|
||||
IOUtils.closeQuietly(meta);
|
||||
IOUtils.closeQuietly(connection);
|
||||
try {
|
||||
cleanupHbckZnode();
|
||||
unlockHbck();
|
||||
} catch (Exception io) {
|
||||
LOG.warn(io);
|
||||
} finally {
|
||||
if (zkw != null) {
|
||||
zkw.close();
|
||||
zkw = null;
|
||||
}
|
||||
IOUtils.closeQuietly(admin);
|
||||
IOUtils.closeQuietly(meta);
|
||||
IOUtils.closeQuietly(connection);
|
||||
}
|
||||
}
|
||||
|
||||
private static class RegionBoundariesInformation {
|
||||
|
@ -1644,7 +1702,6 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
HConnectionManager.execute(new HConnectable<Void>(getConf()) {
|
||||
@Override
|
||||
public Void connect(HConnection connection) throws IOException {
|
||||
ZooKeeperWatcher zkw = createZooKeeperWatcher();
|
||||
try {
|
||||
for (TableName tableName :
|
||||
ZKTableStateClientSideReader.getDisabledOrDisablingTables(zkw)) {
|
||||
|
@ -1654,8 +1711,6 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
throw new IOException(ke);
|
||||
} catch (InterruptedException e) {
|
||||
throw new InterruptedIOException();
|
||||
} finally {
|
||||
zkw.close();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
@ -1775,17 +1830,6 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
});
|
||||
}
|
||||
|
||||
private ServerName getMetaRegionServerName(int replicaId)
|
||||
throws IOException, KeeperException {
|
||||
ZooKeeperWatcher zkw = createZooKeeperWatcher();
|
||||
ServerName sn = null;
|
||||
try {
|
||||
sn = new MetaTableLocator().getMetaRegionLocation(zkw, replicaId);
|
||||
} finally {
|
||||
zkw.close();
|
||||
}
|
||||
return sn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Contacts each regionserver and fetches metadata about regions.
|
||||
|
@ -3230,32 +3274,21 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
}
|
||||
|
||||
private void checkAndFixTableLocks() throws IOException {
|
||||
ZooKeeperWatcher zkw = createZooKeeperWatcher();
|
||||
TableLockChecker checker = new TableLockChecker(zkw, errors);
|
||||
checker.checkTableLocks();
|
||||
|
||||
try {
|
||||
TableLockChecker checker = new TableLockChecker(zkw, errors);
|
||||
checker.checkTableLocks();
|
||||
|
||||
if (this.fixTableLocks) {
|
||||
checker.fixExpiredTableLocks();
|
||||
}
|
||||
} finally {
|
||||
zkw.close();
|
||||
if (this.fixTableLocks) {
|
||||
checker.fixExpiredTableLocks();
|
||||
}
|
||||
}
|
||||
|
||||
private void checkAndFixReplication() throws IOException {
|
||||
ZooKeeperWatcher zkw = createZooKeeperWatcher();
|
||||
try {
|
||||
ReplicationChecker checker = new ReplicationChecker(getConf(), zkw, connection, errors);
|
||||
checker.checkUnDeletedQueues();
|
||||
ReplicationChecker checker = new ReplicationChecker(getConf(), zkw, connection, errors);
|
||||
checker.checkUnDeletedQueues();
|
||||
|
||||
if (checker.hasUnDeletedQueues() && this.fixReplication) {
|
||||
checker.fixUnDeletedQueues();
|
||||
setShouldRerun();
|
||||
}
|
||||
} finally {
|
||||
zkw.close();
|
||||
if (checker.hasUnDeletedQueues() && this.fixReplication) {
|
||||
checker.fixUnDeletedQueues();
|
||||
setShouldRerun();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3267,47 +3300,41 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
*/
|
||||
private void checkAndFixOrphanedTableZNodes()
|
||||
throws IOException, KeeperException, InterruptedException {
|
||||
ZooKeeperWatcher zkw = createZooKeeperWatcher();
|
||||
Set<TableName> enablingTables = ZKTableStateClientSideReader.getEnablingTables(zkw);
|
||||
String msg;
|
||||
TableInfo tableInfo;
|
||||
|
||||
try {
|
||||
Set<TableName> enablingTables = ZKTableStateClientSideReader.getEnablingTables(zkw);
|
||||
String msg;
|
||||
TableInfo tableInfo;
|
||||
|
||||
for (TableName tableName : enablingTables) {
|
||||
// Check whether the table exists in hbase
|
||||
tableInfo = tablesInfo.get(tableName);
|
||||
if (tableInfo != null) {
|
||||
// Table exists. This table state is in transit. No problem for this table.
|
||||
continue;
|
||||
}
|
||||
|
||||
msg = "Table " + tableName + " not found in hbase:meta. Orphaned table ZNode found.";
|
||||
LOG.warn(msg);
|
||||
orphanedTableZNodes.add(tableName);
|
||||
errors.reportError(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY, msg);
|
||||
for (TableName tableName : enablingTables) {
|
||||
// Check whether the table exists in hbase
|
||||
tableInfo = tablesInfo.get(tableName);
|
||||
if (tableInfo != null) {
|
||||
// Table exists. This table state is in transit. No problem for this table.
|
||||
continue;
|
||||
}
|
||||
|
||||
if (orphanedTableZNodes.size() > 0 && this.fixTableZNodes) {
|
||||
ZKTableStateManager zkTableStateMgr = new ZKTableStateManager(zkw);
|
||||
msg = "Table " + tableName + " not found in hbase:meta. Orphaned table ZNode found.";
|
||||
LOG.warn(msg);
|
||||
orphanedTableZNodes.add(tableName);
|
||||
errors.reportError(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY, msg);
|
||||
}
|
||||
|
||||
for (TableName tableName : orphanedTableZNodes) {
|
||||
try {
|
||||
// Set the table state to be disabled so that if we made mistake, we can trace
|
||||
// the history and figure it out.
|
||||
// Another choice is to call checkAndRemoveTableState() to delete the orphaned ZNode.
|
||||
// Both approaches works.
|
||||
zkTableStateMgr.setTableState(tableName, ZooKeeperProtos.Table.State.DISABLED);
|
||||
} catch (CoordinatedStateException e) {
|
||||
// This exception should not happen here
|
||||
LOG.error(
|
||||
"Got a CoordinatedStateException while fixing the ENABLING table znode " + tableName,
|
||||
e);
|
||||
}
|
||||
if (orphanedTableZNodes.size() > 0 && this.fixTableZNodes) {
|
||||
ZKTableStateManager zkTableStateMgr = new ZKTableStateManager(zkw);
|
||||
|
||||
for (TableName tableName : orphanedTableZNodes) {
|
||||
try {
|
||||
// Set the table state to be disabled so that if we made mistake, we can trace
|
||||
// the history and figure it out.
|
||||
// Another choice is to call checkAndRemoveTableState() to delete the orphaned ZNode.
|
||||
// Both approaches works.
|
||||
zkTableStateMgr.setTableState(tableName, ZooKeeperProtos.Table.State.DISABLED);
|
||||
} catch (CoordinatedStateException e) {
|
||||
// This exception should not happen here
|
||||
LOG.error(
|
||||
"Got a CoordinatedStateException while fixing the ENABLING table znode " + tableName,
|
||||
e);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
zkw.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3377,12 +3404,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
private void unassignMetaReplica(HbckInfo hi) throws IOException, InterruptedException,
|
||||
KeeperException {
|
||||
undeployRegions(hi);
|
||||
ZooKeeperWatcher zkw = createZooKeeperWatcher();
|
||||
try {
|
||||
ZKUtil.deleteNode(zkw, zkw.getZNodeForReplica(hi.metaEntry.getReplicaId()));
|
||||
} finally {
|
||||
zkw.close();
|
||||
}
|
||||
ZKUtil.deleteNode(zkw, zkw.getZNodeForReplica(hi.metaEntry.getReplicaId()));
|
||||
}
|
||||
|
||||
private void assignMetaReplica(int replicaId)
|
||||
|
@ -4250,38 +4272,6 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
return fixAny || forceExclusive;
|
||||
}
|
||||
|
||||
/**
|
||||
* Disable the load balancer.
|
||||
*/
|
||||
public static void setDisableBalancer() {
|
||||
disableBalancer = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Disable the split and merge
|
||||
*/
|
||||
public static void setDisableSplitAndMerge() {
|
||||
disableSplitAndMerge = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* The balancer should be disabled if we are modifying HBase.
|
||||
* It can be disabled if you want to prevent region movement from causing
|
||||
* false positives.
|
||||
*/
|
||||
public boolean shouldDisableBalancer() {
|
||||
return fixAny || disableBalancer;
|
||||
}
|
||||
|
||||
/**
|
||||
* The split and merge should be disabled if we are modifying HBase.
|
||||
* It can be disabled if you want to prevent region movement from causing
|
||||
* false positives.
|
||||
*/
|
||||
public boolean shouldDisableSplitAndMerge() {
|
||||
return fixAny || disableSplitAndMerge;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set summary mode.
|
||||
* Print only summary of the tables and status (OK or INCONSISTENT)
|
||||
|
@ -4552,7 +4542,6 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
out.println(" -sidelineDir <hdfs://> HDFS path to backup existing meta.");
|
||||
out.println(" -boundaries Verify that regions boundaries are the same between META and store files.");
|
||||
out.println(" -exclusive Abort if another hbck is exclusive or fixing.");
|
||||
out.println(" -disableBalancer Disable the load balancer.");
|
||||
|
||||
out.println("");
|
||||
out.println(" Metadata Repair options: (expert features, use with caution!)");
|
||||
|
@ -4653,10 +4642,6 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
setDisplayFullReport();
|
||||
} else if (cmd.equals("-exclusive")) {
|
||||
setForceExclusive();
|
||||
} else if (cmd.equals("-disableBalancer")) {
|
||||
setDisableBalancer();
|
||||
} else if (cmd.equals("-disableSplitAndMerge")) {
|
||||
setDisableSplitAndMerge();
|
||||
} else if (cmd.equals("-timelag")) {
|
||||
if (i == args.length - 1) {
|
||||
errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -timelag needs a value.");
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
/**
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.zookeeper;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
|
||||
/**
|
||||
* Tracks the master Maintenance Mode via ZK.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class MasterMaintenanceModeTracker extends ZooKeeperListener {
|
||||
private boolean hasChildren;
|
||||
|
||||
public MasterMaintenanceModeTracker(ZooKeeperWatcher watcher) {
|
||||
super(watcher);
|
||||
hasChildren = false;
|
||||
}
|
||||
|
||||
public boolean isInMaintenanceMode() {
|
||||
return hasChildren;
|
||||
}
|
||||
|
||||
private void update(String path) {
|
||||
if (path.startsWith(ZooKeeperWatcher.masterMaintZNode)) {
|
||||
update();
|
||||
}
|
||||
}
|
||||
|
||||
private void update() {
|
||||
try {
|
||||
List<String> children =
|
||||
ZKUtil.listChildrenAndWatchForNewChildren(watcher, ZooKeeperWatcher.masterMaintZNode);
|
||||
hasChildren = (children != null && children.size() > 0);
|
||||
} catch (KeeperException e) {
|
||||
// Ignore the ZK keeper exception
|
||||
hasChildren = false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Starts the tracking of whether master is in Maintenance Mode.
|
||||
*/
|
||||
public void start() {
|
||||
watcher.registerListener(this);
|
||||
update();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void nodeCreated(String path) {
|
||||
update(path);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void nodeDeleted(String path) {
|
||||
update(path);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void nodeChildrenChanged(String path) {
|
||||
update(path);
|
||||
}
|
||||
}
|
|
@ -146,6 +146,4 @@ public class SplitOrMergeTracker {
|
|||
return builder.build();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -534,6 +534,11 @@ public class TestCatalogJanitor {
|
|||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isInMaintenanceMode() {
|
||||
// Test stub: always reports that the master is not in maintenance mode.
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getLastMajorCompactionTimestamp(TableName table) throws IOException {
|
||||
// Auto-generated method stub
|
||||
|
|
|
@ -37,8 +37,6 @@ import java.util.HashSet;
|
|||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.NavigableMap;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Callable;
|
||||
|
@ -69,7 +67,6 @@ import org.apache.hadoop.hbase.HTableDescriptor;
|
|||
import org.apache.hadoop.hbase.TableExistsException;
|
||||
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||
import org.apache.hadoop.hbase.MiniHBaseCluster;
|
||||
import org.apache.hadoop.hbase.RegionLocations;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.MetaTableAccessor;
|
||||
|
@ -84,12 +81,10 @@ import org.apache.hadoop.hbase.client.HBaseAdmin;
|
|||
import org.apache.hadoop.hbase.client.HConnection;
|
||||
import org.apache.hadoop.hbase.client.HTable;
|
||||
import org.apache.hadoop.hbase.client.MetaScanner;
|
||||
import org.apache.hadoop.hbase.client.Mutation;
|
||||
import org.apache.hadoop.hbase.client.Put;
|
||||
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||
import org.apache.hadoop.hbase.client.Result;
|
||||
import org.apache.hadoop.hbase.client.ResultScanner;
|
||||
import org.apache.hadoop.hbase.client.RowMutations;
|
||||
import org.apache.hadoop.hbase.client.Scan;
|
||||
import org.apache.hadoop.hbase.client.Table;
|
||||
import org.apache.hadoop.hbase.client.replication.ReplicationAdmin;
|
||||
|
|
Loading…
Reference in New Issue