diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java index 017410a6dfe..6d3f2e1ac8a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.InterruptedIOException; import java.io.PrintWriter; import java.io.StringWriter; +import java.net.InetAddress; import java.net.URI; import java.util.ArrayList; import java.util.Arrays; @@ -45,18 +46,22 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.ScheduledThreadPoolExecutor; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hbase.Abortable; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.ClusterStatus; @@ -109,7 +114,10 @@ import org.apache.hadoop.hbase.zookeeper.MetaTableLocator; import org.apache.hadoop.hbase.zookeeper.ZKTableStateClientSideReader; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; import org.apache.hadoop.hbase.security.AccessDeniedException; +import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException; import org.apache.hadoop.io.IOUtils; +import 
org.apache.hadoop.ipc.RemoteException; +import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.Tool; @@ -178,6 +186,8 @@ public class HBaseFsck extends Configured { private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2; private static final int DEFAULT_MAX_MERGE = 5; private static final String TO_BE_LOADED = "to_be_loaded"; + private static final String HBCK_LOCK_FILE = "hbase-hbck.lock"; + /********************** * Internal resources @@ -192,6 +202,11 @@ public class HBaseFsck extends Configured { private long startMillis = System.currentTimeMillis(); private HFileCorruptionChecker hfcc; private int retcode = 0; + private static Path HBCK_LOCK_PATH; + private FSDataOutputStream hbckOutFd; + // This flag is to prevent cleanup of the hbck lock file twice between + // ShutdownHook and the main code. + private static AtomicBoolean hbckLockCleanup = new AtomicBoolean(false); /*********** * Options @@ -300,11 +315,79 @@ public class HBaseFsck extends Configured { this.executor = exec; } + /** + * This method maintains a lock using a file. 
If the creation fails we return null + * + * @return FSDataOutputStream object corresponding to the newly opened lock file + * @throws IOException + */ + private FSDataOutputStream checkAndMarkRunningHbck() throws IOException { + try { + FileSystem fs = FSUtils.getCurrentFileSystem(getConf()); + FsPermission defaultPerms = FSUtils.getFilePermissions(fs, getConf(), + HConstants.DATA_FILE_UMASK_KEY); + Path tmpDir = new Path(FSUtils.getRootDir(getConf()), HConstants.HBASE_TEMP_DIRECTORY); + fs.mkdirs(tmpDir); + HBCK_LOCK_PATH = new Path(tmpDir, HBCK_LOCK_FILE); + final FSDataOutputStream out = FSUtils.create(fs, HBCK_LOCK_PATH, defaultPerms, false); + out.writeBytes(InetAddress.getLocalHost().toString()); + out.flush(); + return out; + } catch(RemoteException e) { + if(AlreadyBeingCreatedException.class.getName().equals(e.getClassName())){ + return null; + } else { + throw e; + } + } + } + + private void unlockHbck() throws IOException { + if(hbckLockCleanup.compareAndSet(false, true)){ + IOUtils.closeStream(hbckOutFd); + try{ + FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()), HBCK_LOCK_PATH, true); + // Reset hbckLockCleanup to false so that subsequent calls using the same + // HBaseFsck object succeed. This is added for tests, which keep re-using the same + // objects. NOTE(review): after a failed delete the flag stays true. + hbckLockCleanup.set(false); + } catch(IOException ioe) { + LOG.warn("Failed to delete " + HBCK_LOCK_PATH); + LOG.debug(ioe); + } + } + } + /** * To repair region consistency, one must call connect() in order to repair * online state. 
*/ public void connect() throws IOException { + + // Check if another instance of hbck is running + hbckOutFd = checkAndMarkRunningHbck(); + if (hbckOutFd == null) { + setRetCode(-1); + LOG.error("Another instance of hbck is running, exiting this instance.[If you are sure" + + " no other instance is running, delete the lock file " + + HBCK_LOCK_PATH + " and rerun the tool]"); + throw new IOException("Duplicate hbck - Abort"); + } + + // Add a JVM shutdown hook so that if the user tries to + // kill the hbck with a ctrl-c, we clean up the lock and + // it is available for further calls + Runtime.getRuntime().addShutdownHook(new Thread() { + public void run() { + try{ + unlockHbck(); + } catch(Exception e){ + LOG.debug("Error while removing hbck lock " + e.getMessage()); + } + } + }); + LOG.debug("Launching hbck"); + connection = HConnectionManager.createConnection(getConf()); admin = new HBaseAdmin(connection); meta = new HTable(TableName.META_TABLE_NAME, connection); @@ -499,6 +582,9 @@ public class HBaseFsck extends Configured { checkAndFixTableLocks(); + // Remove the hbck lock + unlockHbck(); + // Print table summary printTableSummary(tablesInfo); return errors.summarize(); @@ -3842,7 +3928,6 @@ public class HBaseFsck extends Configured { Path hbasedir = FSUtils.getRootDir(conf); URI defaultFs = hbasedir.getFileSystem(conf).getUri(); FSUtils.setFsDefault(conf, new Path(defaultFs)); - int ret = ToolRunner.run(new HBaseFsckTool(conf), args); System.exit(ret); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java index b464de0451f..fe068c91fec 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java @@ -36,8 +36,13 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import 
java.util.NavigableMap; +import java.util.Set; +import java.util.concurrent.Callable; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadPoolExecutor; @@ -526,6 +531,49 @@ public class TestHBaseFsck { } } + /** + * This test makes sure that parallel instances of Hbck are disabled. + * + * @throws Exception + */ + @Test + public void testParallelHbck() throws Exception { + final ExecutorService service; + final Future hbck1,hbck2; + + class RunHbck implements Callable{ + boolean fail = true; + public HBaseFsck call(){ + try{ + return doFsck(conf, false); + } catch(Exception e){ + if (e.getMessage().contains("Duplicate hbck")) { + fail = false; + } + } + // If we reach here, then an exception was caught + if (fail) fail(); + return null; + } + } + service = Executors.newFixedThreadPool(2); + hbck1 = service.submit(new RunHbck()); + hbck2 = service.submit(new RunHbck()); + service.shutdown(); + // wait till hbck calls finish + service.awaitTermination(Integer.MAX_VALUE, TimeUnit.SECONDS); + HBaseFsck h1 = hbck1.get(); + HBaseFsck h2 = hbck2.get(); + // Make sure at most one of the calls was successful (plain assert: only checked with -ea) + assert(h1 == null || h2 == null); + if (h1 != null) { + assert(h1.getRetCode() >= 0); + } + if (h2 != null) { + assert(h2.getRetCode() >= 0); + } + } + /** * This create and fixes a bad table with regions that have a duplicate * start key