HBASE-11405 Multiple invocations of hbck in parallel disables balancer permanently (bharath v)
This commit is contained in:
parent
257562036b
commit
4d29a21ff7
|
@ -21,6 +21,7 @@ import java.io.FileNotFoundException;
|
|||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.io.StringWriter;
|
||||
import java.net.InetAddress;
|
||||
import java.net.URI;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
@ -44,18 +45,22 @@ import java.util.concurrent.ExecutionException;
|
|||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.ScheduledThreadPoolExecutor;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.conf.Configured;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.permission.FsAction;
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.hbase.Abortable;
|
||||
import org.apache.hadoop.hbase.Cell;
|
||||
import org.apache.hadoop.hbase.ClusterStatus;
|
||||
|
@ -111,7 +116,10 @@ import org.apache.hadoop.hbase.util.hbck.TableLockChecker;
|
|||
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||
import org.apache.hadoop.hbase.security.AccessDeniedException;
|
||||
import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.ipc.RemoteException;
|
||||
import org.apache.hadoop.security.AccessControlException;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.util.ReflectionUtils;
|
||||
import org.apache.hadoop.util.Tool;
|
||||
|
@ -180,6 +188,8 @@ public class HBaseFsck extends Configured {
|
|||
private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2;
|
||||
private static final int DEFAULT_MAX_MERGE = 5;
|
||||
private static final String TO_BE_LOADED = "to_be_loaded";
|
||||
private static final String HBCK_LOCK_FILE = "hbase-hbck.lock";
|
||||
|
||||
|
||||
/**********************
|
||||
* Internal resources
|
||||
|
@ -194,6 +204,12 @@ public class HBaseFsck extends Configured {
|
|||
private long startMillis = System.currentTimeMillis();
|
||||
private HFileCorruptionChecker hfcc;
|
||||
private int retcode = 0;
|
||||
private Path HBCK_LOCK_PATH;
|
||||
private FSDataOutputStream hbckOutFd;
|
||||
// This lock is to prevent cleanup of balancer resources twice between
|
||||
// ShutdownHook and the main code. We cleanup only if the connect() is
|
||||
// successful
|
||||
private final AtomicBoolean hbckLockCleanup = new AtomicBoolean(false);
|
||||
|
||||
/***********
|
||||
* Options
|
||||
|
@ -302,11 +318,74 @@ public class HBaseFsck extends Configured {
|
|||
this.executor = exec;
|
||||
}
|
||||
|
||||
/**
|
||||
* This method maintains a lock using a file. If the creation fails we return null
|
||||
*
|
||||
* @return FSDataOutputStream object corresponding to the newly opened lock file
|
||||
* @throws IOException
|
||||
*/
|
||||
private FSDataOutputStream checkAndMarkRunningHbck() throws IOException {
|
||||
try {
|
||||
FileSystem fs = FSUtils.getCurrentFileSystem(getConf());
|
||||
FsPermission defaultPerms = FSUtils.getFilePermissions(fs, getConf(),
|
||||
HConstants.DATA_FILE_UMASK_KEY);
|
||||
Path tmpDir = new Path(FSUtils.getRootDir(getConf()), HConstants.HBASE_TEMP_DIRECTORY);
|
||||
fs.mkdirs(tmpDir);
|
||||
HBCK_LOCK_PATH = new Path(tmpDir, HBCK_LOCK_FILE);
|
||||
final FSDataOutputStream out = FSUtils.create(fs, HBCK_LOCK_PATH, defaultPerms, false);
|
||||
out.writeBytes(InetAddress.getLocalHost().toString());
|
||||
out.flush();
|
||||
return out;
|
||||
} catch(RemoteException e) {
|
||||
if(AlreadyBeingCreatedException.class.getName().equals(e.getClassName())){
|
||||
return null;
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void unlockHbck() {
|
||||
if(hbckLockCleanup.compareAndSet(true, false)){
|
||||
IOUtils.closeStream(hbckOutFd);
|
||||
try{
|
||||
FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()), HBCK_LOCK_PATH, true);
|
||||
} catch(IOException ioe) {
|
||||
LOG.warn("Failed to delete " + HBCK_LOCK_PATH);
|
||||
LOG.debug(ioe);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* To repair region consistency, one must call connect() in order to repair
|
||||
* online state.
|
||||
*/
|
||||
public void connect() throws IOException {
|
||||
|
||||
// Check if another instance of hbck is running
|
||||
hbckOutFd = checkAndMarkRunningHbck();
|
||||
if (hbckOutFd == null) {
|
||||
setRetCode(-1);
|
||||
LOG.error("Another instance of hbck is running, exiting this instance.[If you are sure" +
|
||||
" no other instance is running, delete the lock file " +
|
||||
HBCK_LOCK_PATH + " and rerun the tool]");
|
||||
throw new IOException("Duplicate hbck - Abort");
|
||||
}
|
||||
|
||||
// Make sure to cleanup the lock
|
||||
hbckLockCleanup.set(true);
|
||||
|
||||
// Add a shutdown hook to this thread, in case user tries to
|
||||
// kill the hbck with a ctrl-c, we want to cleanup the lock so that
|
||||
// it is available for further calls
|
||||
Runtime.getRuntime().addShutdownHook(new Thread() {
|
||||
public void run() {
|
||||
unlockHbck();
|
||||
}
|
||||
});
|
||||
LOG.debug("Launching hbck");
|
||||
|
||||
connection = HConnectionManager.createConnection(getConf());
|
||||
admin = new HBaseAdmin(connection);
|
||||
meta = new HTable(TableName.META_TABLE_NAME, connection);
|
||||
|
@ -501,6 +580,9 @@ public class HBaseFsck extends Configured {
|
|||
|
||||
checkAndFixTableLocks();
|
||||
|
||||
// Remove the hbck lock
|
||||
unlockHbck();
|
||||
|
||||
// Print table summary
|
||||
printTableSummary(tablesInfo);
|
||||
return errors.summarize();
|
||||
|
@ -3962,7 +4044,6 @@ public class HBaseFsck extends Configured {
|
|||
Path hbasedir = FSUtils.getRootDir(conf);
|
||||
URI defaultFs = hbasedir.getFileSystem(conf).getUri();
|
||||
FSUtils.setFsDefault(conf, new Path(defaultFs));
|
||||
|
||||
int ret = ToolRunner.run(new HBaseFsckTool(conf), args);
|
||||
System.exit(ret);
|
||||
}
|
||||
|
|
|
@ -40,8 +40,11 @@ import java.util.Map;
|
|||
import java.util.Map.Entry;
|
||||
import java.util.NavigableMap;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.ScheduledThreadPoolExecutor;
|
||||
import java.util.concurrent.SynchronousQueue;
|
||||
import java.util.concurrent.ThreadPoolExecutor;
|
||||
|
@ -535,6 +538,49 @@ public class TestHBaseFsck {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This test makes sure that parallel instances of Hbck is disabled.
|
||||
*
|
||||
* @throws Exception
|
||||
*/
|
||||
@Test
|
||||
public void testParallelHbck() throws Exception {
|
||||
final ExecutorService service;
|
||||
final Future<HBaseFsck> hbck1,hbck2;
|
||||
|
||||
class RunHbck implements Callable<HBaseFsck>{
|
||||
boolean fail = true;
|
||||
public HBaseFsck call(){
|
||||
try{
|
||||
return doFsck(conf, false);
|
||||
} catch(Exception e){
|
||||
if (e.getMessage().contains("Duplicate hbck")) {
|
||||
fail = false;
|
||||
}
|
||||
}
|
||||
// If we reach here, then an exception was caught
|
||||
if (fail) fail();
|
||||
return null;
|
||||
}
|
||||
}
|
||||
service = Executors.newFixedThreadPool(2);
|
||||
hbck1 = service.submit(new RunHbck());
|
||||
hbck2 = service.submit(new RunHbck());
|
||||
service.shutdown();
|
||||
//wait for 15 seconds, for both hbck calls finish
|
||||
service.awaitTermination(15, TimeUnit.SECONDS);
|
||||
HBaseFsck h1 = hbck1.get();
|
||||
HBaseFsck h2 = hbck2.get();
|
||||
// Make sure only one of the calls was successful
|
||||
assert(h1 == null || h2 == null);
|
||||
if (h1 != null) {
|
||||
assert(h1.getRetCode() >= 0);
|
||||
}
|
||||
if (h2 != null) {
|
||||
assert(h2.getRetCode() >= 0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This create and fixes a bad table with regions that have a duplicate
|
||||
* start key
|
||||
|
|
Loading…
Reference in New Issue