diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index bf0c7904255..2a92104e7ec 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -185,6 +185,7 @@ import org.apache.hadoop.hbase.util.Addressing; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.CompressionTest; import org.apache.hadoop.hbase.util.EncryptionTest; +import org.apache.hadoop.hbase.util.HBaseFsck; import org.apache.hadoop.hbase.util.HFileArchiveUtil; import org.apache.hadoop.hbase.util.HasThread; import org.apache.hadoop.hbase.util.IdLock; @@ -864,7 +865,13 @@ public class HMaster extends HRegionServer implements MasterServices { ZKClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId()); this.clusterId = clusterId.toString(); - + // Precaution. Put in place the old hbck1 lock file to fence out old hbase1s running their + // hbck1s against an hbase2 cluster; it could do damage. To skip this behavior, set + // hbase.write.hbck1.lock.file to false. + if (this.conf.getBoolean("hbase.write.hbck1.lock.file", true)) { + HBaseFsck.checkAndMarkRunningHbck(this.conf, + HBaseFsck.createLockRetryCounterFactory(this.conf).create()); + } status.setStatus("Initialze ServerManager and schedule SCP for crash servers"); this.serverManager = createServerManager(this); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java index c479759fa72..992c9867e2c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java @@ -144,6 +144,7 @@ import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; +import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting; import org.apache.hbase.thirdparty.com.google.common.collect.Sets; import org.apache.yetus.audience.InterfaceAudience; import org.apache.yetus.audience.InterfaceStability; @@ -164,7 +165,10 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminServic /** * HBaseFsck (hbck) is a tool for checking and repairing region consistency and - * table integrity problems in a corrupted HBase. + * table integrity problems in a corrupted HBase. This tool was written for hbase-1.x. It does not + * work with hbase-2.x; it can read state but is not allowed to change state; i.e. effect 'repair'. + * See hbck2 (HBASE-19121) for a hbck tool for hbase2. + * *

* Region consistency checks verify that hbase:meta, region deployment on region * servers and the state of data in HDFS (.regioninfo files) all are in @@ -217,7 +221,12 @@ public class HBaseFsck extends Configured implements Closeable { private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2; private static final int DEFAULT_MAX_MERGE = 5; private static final String TO_BE_LOADED = "to_be_loaded"; - private static final String HBCK_LOCK_FILE = "hbase-hbck.lock"; + /** + * Here is where hbase-1.x used to default the lock for hbck1. + * It puts in place a lock when it goes to write/make changes. + */ + @VisibleForTesting + public static final String HBCK_LOCK_FILE = "hbase-hbck.lock"; private static final int DEFAULT_MAX_LOCK_FILE_ATTEMPTS = 5; private static final int DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL = 200; // milliseconds private static final int DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME = 5000; // milliseconds @@ -369,40 +378,75 @@ public class HBaseFsck extends Configured implements Closeable { super(conf); errors = getErrorReporter(getConf()); this.executor = exec; - lockFileRetryCounterFactory = new RetryCounterFactory( - getConf().getInt("hbase.hbck.lockfile.attempts", DEFAULT_MAX_LOCK_FILE_ATTEMPTS), - getConf().getInt( - "hbase.hbck.lockfile.attempt.sleep.interval", DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL), - getConf().getInt( - "hbase.hbck.lockfile.attempt.maxsleeptime", DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME)); - createZNodeRetryCounterFactory = new RetryCounterFactory( - getConf().getInt("hbase.hbck.createznode.attempts", DEFAULT_MAX_CREATE_ZNODE_ATTEMPTS), - getConf().getInt( - "hbase.hbck.createznode.attempt.sleep.interval", - DEFAULT_CREATE_ZNODE_ATTEMPT_SLEEP_INTERVAL), - getConf().getInt( - "hbase.hbck.createznode.attempt.maxsleeptime", - DEFAULT_CREATE_ZNODE_ATTEMPT_MAX_SLEEP_TIME)); + lockFileRetryCounterFactory = createLockRetryCounterFactory(getConf()); + createZNodeRetryCounterFactory = createZnodeRetryCounterFactory(getConf()); zkw = createZooKeeperWatcher(); } - private class FileLockCallable implements Callable { - RetryCounter retryCounter; + /** + * @return A retry counter factory configured for retrying lock file creation. + */ + public static RetryCounterFactory createLockRetryCounterFactory(Configuration conf) { + return new RetryCounterFactory( + conf.getInt("hbase.hbck.lockfile.attempts", DEFAULT_MAX_LOCK_FILE_ATTEMPTS), + conf.getInt("hbase.hbck.lockfile.attempt.sleep.interval", + DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL), + conf.getInt("hbase.hbck.lockfile.attempt.maxsleeptime", + DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME)); + } - public FileLockCallable(RetryCounter retryCounter) { + /** + * @return A retry counter factory configured for retrying znode creation. + */ + private static RetryCounterFactory createZnodeRetryCounterFactory(Configuration conf) { + return new RetryCounterFactory( + conf.getInt("hbase.hbck.createznode.attempts", DEFAULT_MAX_CREATE_ZNODE_ATTEMPTS), + conf.getInt("hbase.hbck.createznode.attempt.sleep.interval", + DEFAULT_CREATE_ZNODE_ATTEMPT_SLEEP_INTERVAL), + conf.getInt("hbase.hbck.createznode.attempt.maxsleeptime", + DEFAULT_CREATE_ZNODE_ATTEMPT_MAX_SLEEP_TIME)); + } + + /** + * @return Return the tmp dir this tool writes too. + */ + @VisibleForTesting + public static Path getTmpDir(Configuration conf) throws IOException { + return new Path(FSUtils.getRootDir(conf), HConstants.HBASE_TEMP_DIRECTORY); + } + + private static class FileLockCallable implements Callable { + RetryCounter retryCounter; + private final Configuration conf; + private Path hbckLockPath = null; + + public FileLockCallable(Configuration conf, RetryCounter retryCounter) { this.retryCounter = retryCounter; + this.conf = conf; } + + /** + * @return Will be null unless you call {@link #call()} + */ + Path getHbckLockPath() { + return this.hbckLockPath; + } + @Override public FSDataOutputStream call() throws IOException { try { - FileSystem fs = FSUtils.getCurrentFileSystem(getConf()); - FsPermission defaultPerms = FSUtils.getFilePermissions(fs, getConf(), + FileSystem fs = FSUtils.getCurrentFileSystem(this.conf); + FsPermission defaultPerms = FSUtils.getFilePermissions(fs, this.conf, HConstants.DATA_FILE_UMASK_KEY); - Path tmpDir = new Path(FSUtils.getRootDir(getConf()), HConstants.HBASE_TEMP_DIRECTORY); + Path tmpDir = getTmpDir(conf); + this.hbckLockPath = new Path(tmpDir, HBCK_LOCK_FILE); fs.mkdirs(tmpDir); - HBCK_LOCK_PATH = new Path(tmpDir, HBCK_LOCK_FILE); - final FSDataOutputStream out = createFileWithRetries(fs, HBCK_LOCK_PATH, defaultPerms); + final FSDataOutputStream out = createFileWithRetries(fs, this.hbckLockPath, defaultPerms); out.writeBytes(InetAddress.getLocalHost().toString()); + // Add a note into the file we write on why hbase2 is writing out an hbck1 lock file. + out.writeBytes(" Written by an hbase-2.x Master to block an " + + "attempt by an hbase-1.x HBCK tool making modification to state. " + + "See 'HBCK must match HBase server version' in the hbase refguide."); out.flush(); return out; } catch(RemoteException e) { @@ -417,7 +461,6 @@ public class HBaseFsck extends Configured implements Closeable { private FSDataOutputStream createFileWithRetries(final FileSystem fs, final Path hbckLockFilePath, final FsPermission defaultPerms) throws IOException { - IOException exception = null; do { try { @@ -449,13 +492,13 @@ public class HBaseFsck extends Configured implements Closeable { * @return FSDataOutputStream object corresponding to the newly opened lock file * @throws IOException if IO failure occurs */ - private FSDataOutputStream checkAndMarkRunningHbck() throws IOException { - RetryCounter retryCounter = lockFileRetryCounterFactory.create(); - FileLockCallable callable = new FileLockCallable(retryCounter); + public static Pair checkAndMarkRunningHbck(Configuration conf, + RetryCounter retryCounter) throws IOException { + FileLockCallable callable = new FileLockCallable(conf, retryCounter); ExecutorService executor = Executors.newFixedThreadPool(1); FutureTask futureTask = new FutureTask<>(callable); executor.execute(futureTask); - final int timeoutInSeconds = getConf().getInt( + final int timeoutInSeconds = conf.getInt( "hbase.hbck.lockfile.maxwaittime", DEFAULT_WAIT_FOR_LOCK_TIMEOUT); FSDataOutputStream stream = null; try { @@ -472,7 +515,7 @@ public class HBaseFsck extends Configured implements Closeable { } finally { executor.shutdownNow(); } - return stream; + return new Pair(callable.getHbckLockPath(), stream); } private void unlockHbck() { @@ -481,8 +524,7 @@ public class HBaseFsck extends Configured implements Closeable { do { try { IOUtils.closeQuietly(hbckOutFd); - FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()), - HBCK_LOCK_PATH, true); + FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()), HBCK_LOCK_PATH, true); LOG.info("Finishing hbck"); return; } catch (IOException ioe) { @@ -511,7 +553,10 @@ public class HBaseFsck extends Configured implements Closeable { if (isExclusive()) { // Grab the lock - hbckOutFd = checkAndMarkRunningHbck(); + Pair pair = + checkAndMarkRunningHbck(getConf(), this.lockFileRetryCounterFactory.create()); + HBCK_LOCK_PATH = pair.getFirst(); + this.hbckOutFd = pair.getSecond(); if (hbckOutFd == null) { setRetCode(-1); LOG.error("Another instance of hbck is fixing HBase, exiting this instance. " + diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMaster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMaster.java index 11df31383d2..50a83d864c1 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMaster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMaster.java @@ -19,11 +19,15 @@ package org.apache.hadoop.hbase.master; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; import java.util.List; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.HColumnDescriptor; @@ -40,10 +44,10 @@ import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.client.TableState; -import org.apache.hadoop.hbase.master.assignment.RegionStates; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.HBaseFsck; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.util.StringUtils; import org.junit.AfterClass; @@ -192,5 +196,42 @@ public class TestMaster { TEST_UTIL.deleteTable(tableName); } } + + @Test + public void testBlockingHbkc1WithLockFile() throws IOException { + // This is how the patch to the lock file is created inside in HBaseFsck. Too hard to use its + // actual method without disturbing HBaseFsck... Do the below mimic instead. + Path hbckLockPath = new Path(HBaseFsck.getTmpDir(TEST_UTIL.getConfiguration()), + HBaseFsck.HBCK_LOCK_FILE); + FileSystem fs = TEST_UTIL.getTestFileSystem(); + assertTrue(fs.exists(hbckLockPath)); + TEST_UTIL.getMiniHBaseCluster(). + killMaster(TEST_UTIL.getMiniHBaseCluster().getMaster().getServerName()); + assertTrue(fs.exists(hbckLockPath)); + TEST_UTIL.getMiniHBaseCluster().startMaster(); + TEST_UTIL.waitFor(30000, () -> TEST_UTIL.getMiniHBaseCluster().getMaster() != null && + TEST_UTIL.getMiniHBaseCluster().getMaster().isInitialized()); + assertTrue(fs.exists(hbckLockPath)); + // Start a second Master. Should be fine. + TEST_UTIL.getMiniHBaseCluster().startMaster(); + assertTrue(fs.exists(hbckLockPath)); + fs.delete(hbckLockPath, true); + assertFalse(fs.exists(hbckLockPath)); + // Kill all Masters. + TEST_UTIL.getMiniHBaseCluster().getLiveMasterThreads().stream(). + map(sn -> sn.getMaster().getServerName()).forEach(sn -> { + try { + TEST_UTIL.getMiniHBaseCluster().killMaster(sn); + } catch (IOException e) { + e.printStackTrace(); + } + }); + // Start a new one. + TEST_UTIL.getMiniHBaseCluster().startMaster(); + TEST_UTIL.waitFor(30000, () -> TEST_UTIL.getMiniHBaseCluster().getMaster() != null && + TEST_UTIL.getMiniHBaseCluster().getMaster().isInitialized()); + // Assert lock gets put in place again. + assertTrue(fs.exists(hbckLockPath)); + } }