HBASE-14201 hbck should not take a lock unless fixing errors
By default, hbck is run in a read-only checker mode. In this case, it is sensible to let other instances run. By default, the balancer is left alone, which may cause spurious errors, but cannot leave the balancer in a bad state. It is dangerous to leave the balancer off by accident, so it is only ever re-enabled after fixing; it will never be forced off because of racing. When hbck is run in fixer mode, it must take an exclusive lock and disable the balancer, or havoc will break loose. If you want to stop hbck from running in parallel, the -exclusive flag will create the lock file. If you want to force the balancer off, the -disableBalancer option is available too. These make more semantic sense than -noLock and -noBalancerSwitch, respectively. Signed-off-by: Elliott Clark <eclark@apache.org>
This commit is contained in:
parent
72a48a1333
commit
0c63d415d9
|
@ -231,9 +231,9 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
* Options
|
* Options
|
||||||
***********/
|
***********/
|
||||||
private static boolean details = false; // do we display the full report
|
private static boolean details = false; // do we display the full report
|
||||||
private static boolean useLock = true; // do we use the hbck exclusivity lock
|
|
||||||
private static boolean switchBalancer = true; // do we turn the balancer off while running
|
|
||||||
private long timelag = DEFAULT_TIME_LAG; // tables whose modtime is older
|
private long timelag = DEFAULT_TIME_LAG; // tables whose modtime is older
|
||||||
|
private static boolean forceExclusive = false; // only this hbck can modify HBase
|
||||||
|
private static boolean disableBalancer = false; // disable load balancer to keep regions stable
|
||||||
private boolean fixAssignments = false; // fix assignment errors?
|
private boolean fixAssignments = false; // fix assignment errors?
|
||||||
private boolean fixMeta = false; // fix meta errors?
|
private boolean fixMeta = false; // fix meta errors?
|
||||||
private boolean checkHdfs = true; // load and check fs consistency?
|
private boolean checkHdfs = true; // load and check fs consistency?
|
||||||
|
@ -445,7 +445,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void unlockHbck() {
|
private void unlockHbck() {
|
||||||
if (hbckLockCleanup.compareAndSet(true, false)) {
|
if (isExclusive() && hbckLockCleanup.compareAndSet(true, false)) {
|
||||||
RetryCounter retryCounter = lockFileRetryCounterFactory.create();
|
RetryCounter retryCounter = lockFileRetryCounterFactory.create();
|
||||||
do {
|
do {
|
||||||
try {
|
try {
|
||||||
|
@ -478,13 +478,13 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
*/
|
*/
|
||||||
public void connect() throws IOException {
|
public void connect() throws IOException {
|
||||||
|
|
||||||
if (useLock) {
|
if (isExclusive()) {
|
||||||
// Check if another instance of balancer is running
|
// Grab the lock
|
||||||
hbckOutFd = checkAndMarkRunningHbck();
|
hbckOutFd = checkAndMarkRunningHbck();
|
||||||
if (hbckOutFd == null) {
|
if (hbckOutFd == null) {
|
||||||
setRetCode(-1);
|
setRetCode(-1);
|
||||||
LOG.error("Another instance of hbck is running, exiting this instance.[If you are sure" +
|
LOG.error("Another instance of hbck is fixing HBase, exiting this instance. " +
|
||||||
" no other instance is running, delete the lock file " +
|
"[If you are sure no other instance is running, delete the lock file " +
|
||||||
HBCK_LOCK_PATH + " and rerun the tool]");
|
HBCK_LOCK_PATH + " and rerun the tool]");
|
||||||
throw new IOException("Duplicate hbck - Abort");
|
throw new IOException("Duplicate hbck - Abort");
|
||||||
}
|
}
|
||||||
|
@ -688,9 +688,8 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
errors.print("Version: " + status.getHBaseVersion());
|
errors.print("Version: " + status.getHBaseVersion());
|
||||||
offlineHdfsIntegrityRepair();
|
offlineHdfsIntegrityRepair();
|
||||||
|
|
||||||
boolean oldBalancer = true;
|
boolean oldBalancer = false;
|
||||||
// turn the balancer off
|
if (shouldDisableBalancer()) {
|
||||||
if (switchBalancer) {
|
|
||||||
oldBalancer = admin.setBalancerRunning(false, true);
|
oldBalancer = admin.setBalancerRunning(false, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -698,7 +697,10 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
onlineConsistencyRepair();
|
onlineConsistencyRepair();
|
||||||
}
|
}
|
||||||
finally {
|
finally {
|
||||||
if (switchBalancer) {
|
// Only restore the balancer if it was true when we started repairing and
|
||||||
|
// we actually disabled it. Otherwise, we might clobber another run of
|
||||||
|
// hbck that has just restored it.
|
||||||
|
if (shouldDisableBalancer() && oldBalancer) {
|
||||||
admin.setBalancerRunning(oldBalancer, false);
|
admin.setBalancerRunning(oldBalancer, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4150,12 +4152,34 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
details = true;
|
details = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void setNoLock() {
|
/**
|
||||||
useLock = false;
|
* Set exclusive mode.
|
||||||
|
*/
|
||||||
|
public static void setForceExclusive() {
|
||||||
|
forceExclusive = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void setNoBalacerSwitch() {
|
/**
|
||||||
switchBalancer = false;
|
* Only one instance of hbck can modify HBase at a time.
|
||||||
|
*/
|
||||||
|
public boolean isExclusive() {
|
||||||
|
return fixAny || forceExclusive;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Disable the load balancer.
|
||||||
|
*/
|
||||||
|
public static void setDisableBalancer() {
|
||||||
|
disableBalancer = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The balancer should be disabled if we are modifying HBase.
|
||||||
|
* It can be disabled if you want to prevent region movement from causing
|
||||||
|
* false positives.
|
||||||
|
*/
|
||||||
|
public boolean shouldDisableBalancer() {
|
||||||
|
return fixAny || disableBalancer;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -4410,8 +4434,8 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
out.println(" -metaonly Only check the state of the hbase:meta table.");
|
out.println(" -metaonly Only check the state of the hbase:meta table.");
|
||||||
out.println(" -sidelineDir <hdfs://> HDFS path to backup existing meta.");
|
out.println(" -sidelineDir <hdfs://> HDFS path to backup existing meta.");
|
||||||
out.println(" -boundaries Verify that regions boundaries are the same between META and store files.");
|
out.println(" -boundaries Verify that regions boundaries are the same between META and store files.");
|
||||||
out.println(" -noLock Turn off using the hdfs lock file.");
|
out.println(" -exclusive Abort if another hbck is exclusive or fixing.");
|
||||||
out.println(" -noBalancerSwitch Don't switch the balancer off.");
|
out.println(" -disableBalancer Disable the load balancer.");
|
||||||
|
|
||||||
out.println("");
|
out.println("");
|
||||||
out.println(" Metadata Repair options: (expert features, use with caution!)");
|
out.println(" Metadata Repair options: (expert features, use with caution!)");
|
||||||
|
@ -4501,10 +4525,10 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
return printUsageAndExit();
|
return printUsageAndExit();
|
||||||
} else if (cmd.equals("-details")) {
|
} else if (cmd.equals("-details")) {
|
||||||
setDisplayFullReport();
|
setDisplayFullReport();
|
||||||
} else if (cmd.equals("-noLock")) {
|
} else if (cmd.equals("-exclusive")) {
|
||||||
setNoLock();
|
setForceExclusive();
|
||||||
} else if (cmd.equals("-noBalancerSwitch")) {
|
} else if (cmd.equals("-disableBalancer")) {
|
||||||
setNoBalacerSwitch();
|
setDisableBalancer();
|
||||||
} else if (cmd.equals("-timelag")) {
|
} else if (cmd.equals("-timelag")) {
|
||||||
if (i == args.length - 1) {
|
if (i == args.length - 1) {
|
||||||
errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -timelag needs a value.");
|
errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -timelag needs a value.");
|
||||||
|
|
|
@ -611,7 +611,7 @@ public class TestHBaseFsck {
|
||||||
// To avoid flakiness of the test, set low max wait time.
|
// To avoid flakiness of the test, set low max wait time.
|
||||||
c.setInt("hbase.hbck.lockfile.maxwaittime", 3);
|
c.setInt("hbase.hbck.lockfile.maxwaittime", 3);
|
||||||
try{
|
try{
|
||||||
return doFsck(c, false);
|
return doFsck(c, true); // Exclusive hbck only when fixing
|
||||||
} catch(Exception e){
|
} catch(Exception e){
|
||||||
if (e.getMessage().contains("Duplicate hbck")) {
|
if (e.getMessage().contains("Duplicate hbck")) {
|
||||||
fail = false;
|
fail = false;
|
||||||
|
|
|
@ -50,7 +50,6 @@ public class HbckTestingUtil {
|
||||||
TableName table) throws Exception {
|
TableName table) throws Exception {
|
||||||
HBaseFsck fsck = new HBaseFsck(conf, exec);
|
HBaseFsck fsck = new HBaseFsck(conf, exec);
|
||||||
try {
|
try {
|
||||||
fsck.connect();
|
|
||||||
HBaseFsck.setDisplayFullReport(); // i.e. -details
|
HBaseFsck.setDisplayFullReport(); // i.e. -details
|
||||||
fsck.setTimeLag(0);
|
fsck.setTimeLag(0);
|
||||||
fsck.setFixAssignments(fixAssignments);
|
fsck.setFixAssignments(fixAssignments);
|
||||||
|
@ -66,6 +65,9 @@ public class HbckTestingUtil {
|
||||||
if (table != null) {
|
if (table != null) {
|
||||||
fsck.includeTable(table);
|
fsck.includeTable(table);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Parse command line flags before connecting, to grab the lock.
|
||||||
|
fsck.connect();
|
||||||
fsck.onlineHbck();
|
fsck.onlineHbck();
|
||||||
} finally {
|
} finally {
|
||||||
fsck.close();
|
fsck.close();
|
||||||
|
|
Loading…
Reference in New Issue