HBASE-21072 Block out HBCK1 in hbase2
Write the hbase-1.x hbck1 lock file to block out hbck1 instances writing state to an hbase-2.x cluster (could do damage). Set hbase.write.hbck1.lock.file to false to disable this writing.
This commit is contained in:
parent
8a9acd4d2a
commit
66add55234
|
@ -185,6 +185,7 @@ import org.apache.hadoop.hbase.util.Addressing;
|
||||||
import org.apache.hadoop.hbase.util.Bytes;
|
import org.apache.hadoop.hbase.util.Bytes;
|
||||||
import org.apache.hadoop.hbase.util.CompressionTest;
|
import org.apache.hadoop.hbase.util.CompressionTest;
|
||||||
import org.apache.hadoop.hbase.util.EncryptionTest;
|
import org.apache.hadoop.hbase.util.EncryptionTest;
|
||||||
|
import org.apache.hadoop.hbase.util.HBaseFsck;
|
||||||
import org.apache.hadoop.hbase.util.HFileArchiveUtil;
|
import org.apache.hadoop.hbase.util.HFileArchiveUtil;
|
||||||
import org.apache.hadoop.hbase.util.HasThread;
|
import org.apache.hadoop.hbase.util.HasThread;
|
||||||
import org.apache.hadoop.hbase.util.IdLock;
|
import org.apache.hadoop.hbase.util.IdLock;
|
||||||
|
@ -864,7 +865,13 @@ public class HMaster extends HRegionServer implements MasterServices {
|
||||||
ZKClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId());
|
ZKClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId());
|
||||||
this.clusterId = clusterId.toString();
|
this.clusterId = clusterId.toString();
|
||||||
|
|
||||||
|
// Precaution. Put in place the old hbck1 lock file to fence out old hbase1s running their
|
||||||
|
// hbck1s against an hbase2 cluster; it could do damage. To skip this behavior, set
|
||||||
|
// hbase.write.hbck1.lock.file to false.
|
||||||
|
if (this.conf.getBoolean("hbase.write.hbck1.lock.file", true)) {
|
||||||
|
HBaseFsck.checkAndMarkRunningHbck(this.conf,
|
||||||
|
HBaseFsck.createLockRetryCounterFactory(this.conf).create());
|
||||||
|
}
|
||||||
|
|
||||||
status.setStatus("Initialze ServerManager and schedule SCP for crash servers");
|
status.setStatus("Initialze ServerManager and schedule SCP for crash servers");
|
||||||
this.serverManager = createServerManager(this);
|
this.serverManager = createServerManager(this);
|
||||||
|
|
|
@ -144,6 +144,7 @@ import org.apache.hadoop.security.UserGroupInformation;
|
||||||
import org.apache.hadoop.util.ReflectionUtils;
|
import org.apache.hadoop.util.ReflectionUtils;
|
||||||
import org.apache.hadoop.util.Tool;
|
import org.apache.hadoop.util.Tool;
|
||||||
import org.apache.hadoop.util.ToolRunner;
|
import org.apache.hadoop.util.ToolRunner;
|
||||||
|
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
|
||||||
import org.apache.hbase.thirdparty.com.google.common.collect.Sets;
|
import org.apache.hbase.thirdparty.com.google.common.collect.Sets;
|
||||||
import org.apache.yetus.audience.InterfaceAudience;
|
import org.apache.yetus.audience.InterfaceAudience;
|
||||||
import org.apache.yetus.audience.InterfaceStability;
|
import org.apache.yetus.audience.InterfaceStability;
|
||||||
|
@ -164,7 +165,10 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminServic
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* HBaseFsck (hbck) is a tool for checking and repairing region consistency and
|
* HBaseFsck (hbck) is a tool for checking and repairing region consistency and
|
||||||
* table integrity problems in a corrupted HBase.
|
* table integrity problems in a corrupted HBase. This tool was written for hbase-1.x. It does not
|
||||||
|
* work with hbase-2.x; it can read state but is not allowed to change state; i.e. effect 'repair'.
|
||||||
|
* See hbck2 (HBASE-19121) for a hbck tool for hbase2.
|
||||||
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* Region consistency checks verify that hbase:meta, region deployment on region
|
* Region consistency checks verify that hbase:meta, region deployment on region
|
||||||
* servers and the state of data in HDFS (.regioninfo files) all are in
|
* servers and the state of data in HDFS (.regioninfo files) all are in
|
||||||
|
@ -217,7 +221,12 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2;
|
private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2;
|
||||||
private static final int DEFAULT_MAX_MERGE = 5;
|
private static final int DEFAULT_MAX_MERGE = 5;
|
||||||
private static final String TO_BE_LOADED = "to_be_loaded";
|
private static final String TO_BE_LOADED = "to_be_loaded";
|
||||||
private static final String HBCK_LOCK_FILE = "hbase-hbck.lock";
|
/**
|
||||||
|
* Here is where hbase-1.x used to default the lock for hbck1.
|
||||||
|
* It puts in place a lock when it goes to write/make changes.
|
||||||
|
*/
|
||||||
|
@VisibleForTesting
|
||||||
|
public static final String HBCK_LOCK_FILE = "hbase-hbck.lock";
|
||||||
private static final int DEFAULT_MAX_LOCK_FILE_ATTEMPTS = 5;
|
private static final int DEFAULT_MAX_LOCK_FILE_ATTEMPTS = 5;
|
||||||
private static final int DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL = 200; // milliseconds
|
private static final int DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL = 200; // milliseconds
|
||||||
private static final int DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME = 5000; // milliseconds
|
private static final int DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME = 5000; // milliseconds
|
||||||
|
@ -369,40 +378,75 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
super(conf);
|
super(conf);
|
||||||
errors = getErrorReporter(getConf());
|
errors = getErrorReporter(getConf());
|
||||||
this.executor = exec;
|
this.executor = exec;
|
||||||
lockFileRetryCounterFactory = new RetryCounterFactory(
|
lockFileRetryCounterFactory = createLockRetryCounterFactory(getConf());
|
||||||
getConf().getInt("hbase.hbck.lockfile.attempts", DEFAULT_MAX_LOCK_FILE_ATTEMPTS),
|
createZNodeRetryCounterFactory = createZnodeRetryCounterFactory(getConf());
|
||||||
getConf().getInt(
|
|
||||||
"hbase.hbck.lockfile.attempt.sleep.interval", DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL),
|
|
||||||
getConf().getInt(
|
|
||||||
"hbase.hbck.lockfile.attempt.maxsleeptime", DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME));
|
|
||||||
createZNodeRetryCounterFactory = new RetryCounterFactory(
|
|
||||||
getConf().getInt("hbase.hbck.createznode.attempts", DEFAULT_MAX_CREATE_ZNODE_ATTEMPTS),
|
|
||||||
getConf().getInt(
|
|
||||||
"hbase.hbck.createznode.attempt.sleep.interval",
|
|
||||||
DEFAULT_CREATE_ZNODE_ATTEMPT_SLEEP_INTERVAL),
|
|
||||||
getConf().getInt(
|
|
||||||
"hbase.hbck.createznode.attempt.maxsleeptime",
|
|
||||||
DEFAULT_CREATE_ZNODE_ATTEMPT_MAX_SLEEP_TIME));
|
|
||||||
zkw = createZooKeeperWatcher();
|
zkw = createZooKeeperWatcher();
|
||||||
}
|
}
|
||||||
|
|
||||||
private class FileLockCallable implements Callable<FSDataOutputStream> {
|
/**
|
||||||
RetryCounter retryCounter;
|
* @return A retry counter factory configured for retrying lock file creation.
|
||||||
|
*/
|
||||||
|
public static RetryCounterFactory createLockRetryCounterFactory(Configuration conf) {
|
||||||
|
return new RetryCounterFactory(
|
||||||
|
conf.getInt("hbase.hbck.lockfile.attempts", DEFAULT_MAX_LOCK_FILE_ATTEMPTS),
|
||||||
|
conf.getInt("hbase.hbck.lockfile.attempt.sleep.interval",
|
||||||
|
DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL),
|
||||||
|
conf.getInt("hbase.hbck.lockfile.attempt.maxsleeptime",
|
||||||
|
DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME));
|
||||||
|
}
|
||||||
|
|
||||||
public FileLockCallable(RetryCounter retryCounter) {
|
/**
|
||||||
|
* @return A retry counter factory configured for retrying znode creation.
|
||||||
|
*/
|
||||||
|
private static RetryCounterFactory createZnodeRetryCounterFactory(Configuration conf) {
|
||||||
|
return new RetryCounterFactory(
|
||||||
|
conf.getInt("hbase.hbck.createznode.attempts", DEFAULT_MAX_CREATE_ZNODE_ATTEMPTS),
|
||||||
|
conf.getInt("hbase.hbck.createznode.attempt.sleep.interval",
|
||||||
|
DEFAULT_CREATE_ZNODE_ATTEMPT_SLEEP_INTERVAL),
|
||||||
|
conf.getInt("hbase.hbck.createznode.attempt.maxsleeptime",
|
||||||
|
DEFAULT_CREATE_ZNODE_ATTEMPT_MAX_SLEEP_TIME));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return Return the tmp dir this tool writes too.
|
||||||
|
*/
|
||||||
|
@VisibleForTesting
|
||||||
|
public static Path getTmpDir(Configuration conf) throws IOException {
|
||||||
|
return new Path(FSUtils.getRootDir(conf), HConstants.HBASE_TEMP_DIRECTORY);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class FileLockCallable implements Callable<FSDataOutputStream> {
|
||||||
|
RetryCounter retryCounter;
|
||||||
|
private final Configuration conf;
|
||||||
|
private Path hbckLockPath = null;
|
||||||
|
|
||||||
|
public FileLockCallable(Configuration conf, RetryCounter retryCounter) {
|
||||||
this.retryCounter = retryCounter;
|
this.retryCounter = retryCounter;
|
||||||
|
this.conf = conf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return Will be <code>null</code> unless you call {@link #call()}
|
||||||
|
*/
|
||||||
|
Path getHbckLockPath() {
|
||||||
|
return this.hbckLockPath;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FSDataOutputStream call() throws IOException {
|
public FSDataOutputStream call() throws IOException {
|
||||||
try {
|
try {
|
||||||
FileSystem fs = FSUtils.getCurrentFileSystem(getConf());
|
FileSystem fs = FSUtils.getCurrentFileSystem(this.conf);
|
||||||
FsPermission defaultPerms = FSUtils.getFilePermissions(fs, getConf(),
|
FsPermission defaultPerms = FSUtils.getFilePermissions(fs, this.conf,
|
||||||
HConstants.DATA_FILE_UMASK_KEY);
|
HConstants.DATA_FILE_UMASK_KEY);
|
||||||
Path tmpDir = new Path(FSUtils.getRootDir(getConf()), HConstants.HBASE_TEMP_DIRECTORY);
|
Path tmpDir = getTmpDir(conf);
|
||||||
|
this.hbckLockPath = new Path(tmpDir, HBCK_LOCK_FILE);
|
||||||
fs.mkdirs(tmpDir);
|
fs.mkdirs(tmpDir);
|
||||||
HBCK_LOCK_PATH = new Path(tmpDir, HBCK_LOCK_FILE);
|
final FSDataOutputStream out = createFileWithRetries(fs, this.hbckLockPath, defaultPerms);
|
||||||
final FSDataOutputStream out = createFileWithRetries(fs, HBCK_LOCK_PATH, defaultPerms);
|
|
||||||
out.writeBytes(InetAddress.getLocalHost().toString());
|
out.writeBytes(InetAddress.getLocalHost().toString());
|
||||||
|
// Add a note into the file we write on why hbase2 is writing out an hbck1 lock file.
|
||||||
|
out.writeBytes(" Written by an hbase-2.x Master to block an " +
|
||||||
|
"attempt by an hbase-1.x HBCK tool making modification to state. " +
|
||||||
|
"See 'HBCK must match HBase server version' in the hbase refguide.");
|
||||||
out.flush();
|
out.flush();
|
||||||
return out;
|
return out;
|
||||||
} catch(RemoteException e) {
|
} catch(RemoteException e) {
|
||||||
|
@ -417,7 +461,6 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
private FSDataOutputStream createFileWithRetries(final FileSystem fs,
|
private FSDataOutputStream createFileWithRetries(final FileSystem fs,
|
||||||
final Path hbckLockFilePath, final FsPermission defaultPerms)
|
final Path hbckLockFilePath, final FsPermission defaultPerms)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
IOException exception = null;
|
IOException exception = null;
|
||||||
do {
|
do {
|
||||||
try {
|
try {
|
||||||
|
@ -449,13 +492,13 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
* @return FSDataOutputStream object corresponding to the newly opened lock file
|
* @return FSDataOutputStream object corresponding to the newly opened lock file
|
||||||
* @throws IOException if IO failure occurs
|
* @throws IOException if IO failure occurs
|
||||||
*/
|
*/
|
||||||
private FSDataOutputStream checkAndMarkRunningHbck() throws IOException {
|
public static Pair<Path, FSDataOutputStream> checkAndMarkRunningHbck(Configuration conf,
|
||||||
RetryCounter retryCounter = lockFileRetryCounterFactory.create();
|
RetryCounter retryCounter) throws IOException {
|
||||||
FileLockCallable callable = new FileLockCallable(retryCounter);
|
FileLockCallable callable = new FileLockCallable(conf, retryCounter);
|
||||||
ExecutorService executor = Executors.newFixedThreadPool(1);
|
ExecutorService executor = Executors.newFixedThreadPool(1);
|
||||||
FutureTask<FSDataOutputStream> futureTask = new FutureTask<>(callable);
|
FutureTask<FSDataOutputStream> futureTask = new FutureTask<>(callable);
|
||||||
executor.execute(futureTask);
|
executor.execute(futureTask);
|
||||||
final int timeoutInSeconds = getConf().getInt(
|
final int timeoutInSeconds = conf.getInt(
|
||||||
"hbase.hbck.lockfile.maxwaittime", DEFAULT_WAIT_FOR_LOCK_TIMEOUT);
|
"hbase.hbck.lockfile.maxwaittime", DEFAULT_WAIT_FOR_LOCK_TIMEOUT);
|
||||||
FSDataOutputStream stream = null;
|
FSDataOutputStream stream = null;
|
||||||
try {
|
try {
|
||||||
|
@ -472,7 +515,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
} finally {
|
} finally {
|
||||||
executor.shutdownNow();
|
executor.shutdownNow();
|
||||||
}
|
}
|
||||||
return stream;
|
return new Pair<Path, FSDataOutputStream>(callable.getHbckLockPath(), stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void unlockHbck() {
|
private void unlockHbck() {
|
||||||
|
@ -481,8 +524,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
do {
|
do {
|
||||||
try {
|
try {
|
||||||
IOUtils.closeQuietly(hbckOutFd);
|
IOUtils.closeQuietly(hbckOutFd);
|
||||||
FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()),
|
FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()), HBCK_LOCK_PATH, true);
|
||||||
HBCK_LOCK_PATH, true);
|
|
||||||
LOG.info("Finishing hbck");
|
LOG.info("Finishing hbck");
|
||||||
return;
|
return;
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
|
@ -511,7 +553,10 @@ public class HBaseFsck extends Configured implements Closeable {
|
||||||
|
|
||||||
if (isExclusive()) {
|
if (isExclusive()) {
|
||||||
// Grab the lock
|
// Grab the lock
|
||||||
hbckOutFd = checkAndMarkRunningHbck();
|
Pair<Path, FSDataOutputStream> pair =
|
||||||
|
checkAndMarkRunningHbck(getConf(), this.lockFileRetryCounterFactory.create());
|
||||||
|
HBCK_LOCK_PATH = pair.getFirst();
|
||||||
|
this.hbckOutFd = pair.getSecond();
|
||||||
if (hbckOutFd == null) {
|
if (hbckOutFd == null) {
|
||||||
setRetCode(-1);
|
setRetCode(-1);
|
||||||
LOG.error("Another instance of hbck is fixing HBase, exiting this instance. " +
|
LOG.error("Another instance of hbck is fixing HBase, exiting this instance. " +
|
||||||
|
|
|
@ -19,11 +19,15 @@ package org.apache.hadoop.hbase.master;
|
||||||
|
|
||||||
import static org.junit.Assert.assertArrayEquals;
|
import static org.junit.Assert.assertArrayEquals;
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
import static org.junit.Assert.fail;
|
import static org.junit.Assert.fail;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||||
import org.apache.hadoop.hbase.HColumnDescriptor;
|
import org.apache.hadoop.hbase.HColumnDescriptor;
|
||||||
|
@ -40,10 +44,10 @@ import org.apache.hadoop.hbase.client.RegionInfo;
|
||||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||||
import org.apache.hadoop.hbase.client.Table;
|
import org.apache.hadoop.hbase.client.Table;
|
||||||
import org.apache.hadoop.hbase.client.TableState;
|
import org.apache.hadoop.hbase.client.TableState;
|
||||||
import org.apache.hadoop.hbase.master.assignment.RegionStates;
|
|
||||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||||
import org.apache.hadoop.hbase.util.Bytes;
|
import org.apache.hadoop.hbase.util.Bytes;
|
||||||
|
import org.apache.hadoop.hbase.util.HBaseFsck;
|
||||||
import org.apache.hadoop.hbase.util.Pair;
|
import org.apache.hadoop.hbase.util.Pair;
|
||||||
import org.apache.hadoop.util.StringUtils;
|
import org.apache.hadoop.util.StringUtils;
|
||||||
import org.junit.AfterClass;
|
import org.junit.AfterClass;
|
||||||
|
@ -192,5 +196,42 @@ public class TestMaster {
|
||||||
TEST_UTIL.deleteTable(tableName);
|
TEST_UTIL.deleteTable(tableName);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBlockingHbkc1WithLockFile() throws IOException {
|
||||||
|
// This is how the patch to the lock file is created inside in HBaseFsck. Too hard to use its
|
||||||
|
// actual method without disturbing HBaseFsck... Do the below mimic instead.
|
||||||
|
Path hbckLockPath = new Path(HBaseFsck.getTmpDir(TEST_UTIL.getConfiguration()),
|
||||||
|
HBaseFsck.HBCK_LOCK_FILE);
|
||||||
|
FileSystem fs = TEST_UTIL.getTestFileSystem();
|
||||||
|
assertTrue(fs.exists(hbckLockPath));
|
||||||
|
TEST_UTIL.getMiniHBaseCluster().
|
||||||
|
killMaster(TEST_UTIL.getMiniHBaseCluster().getMaster().getServerName());
|
||||||
|
assertTrue(fs.exists(hbckLockPath));
|
||||||
|
TEST_UTIL.getMiniHBaseCluster().startMaster();
|
||||||
|
TEST_UTIL.waitFor(30000, () -> TEST_UTIL.getMiniHBaseCluster().getMaster() != null &&
|
||||||
|
TEST_UTIL.getMiniHBaseCluster().getMaster().isInitialized());
|
||||||
|
assertTrue(fs.exists(hbckLockPath));
|
||||||
|
// Start a second Master. Should be fine.
|
||||||
|
TEST_UTIL.getMiniHBaseCluster().startMaster();
|
||||||
|
assertTrue(fs.exists(hbckLockPath));
|
||||||
|
fs.delete(hbckLockPath, true);
|
||||||
|
assertFalse(fs.exists(hbckLockPath));
|
||||||
|
// Kill all Masters.
|
||||||
|
TEST_UTIL.getMiniHBaseCluster().getLiveMasterThreads().stream().
|
||||||
|
map(sn -> sn.getMaster().getServerName()).forEach(sn -> {
|
||||||
|
try {
|
||||||
|
TEST_UTIL.getMiniHBaseCluster().killMaster(sn);
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
// Start a new one.
|
||||||
|
TEST_UTIL.getMiniHBaseCluster().startMaster();
|
||||||
|
TEST_UTIL.waitFor(30000, () -> TEST_UTIL.getMiniHBaseCluster().getMaster() != null &&
|
||||||
|
TEST_UTIL.getMiniHBaseCluster().getMaster().isInitialized());
|
||||||
|
// Assert lock gets put in place again.
|
||||||
|
assertTrue(fs.exists(hbckLockPath));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue