Revert "HBASE-11405 Multiple invocations of hbck in parallel disables balancer permanently (Bharath Vissapragada)"

This reverts commit f4ecb1816b.

More comments came in; reverting until addressed.
Jonathan M Hsieh 2014-09-17 12:58:42 -07:00
parent f4ecb1816b
commit 2e4218648a
2 changed files with 1 addition and 132 deletions
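
For context: the reverted change serialized hbck runs by exclusively creating a lock file on the HBase root filesystem, as the diff below shows. Here is a condensed, self-contained sketch of that pattern; the class name, method name, and use of plain FileSystem.create in place of the FSUtils helpers are illustrative, not the HBase source.

    import java.io.IOException;
    import java.net.InetAddress;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataOutputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
    import org.apache.hadoop.ipc.RemoteException;

    /** Illustrative sketch of the lock-file pattern, not the HBase source. */
    public class HbckLockSketch {
      /** Returns the open lock stream, or null if another instance holds the lock. */
      static FSDataOutputStream tryLock(Configuration conf, Path lockPath) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        try {
          // overwrite=false makes the create exclusive: on HDFS a concurrent
          // second creator fails with AlreadyBeingCreatedException, which
          // arrives wrapped in a RemoteException.
          FSDataOutputStream out = fs.create(lockPath, false);
          out.writeBytes(InetAddress.getLocalHost().toString()); // record the holder
          out.flush();
          return out;
        } catch (RemoteException e) {
          if (AlreadyBeingCreatedException.class.getName().equals(e.getClassName())) {
            return null; // lost the race: another hbck is already running
          }
          throw e;
        }
      }
    }

The holder keeps the stream open for the duration of the run and deletes the file on exit, which is why the diff also removes a shutdown hook: without cleanup on ctrl-c, a stale lock file would block all later runs.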

hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java

@@ -21,7 +21,6 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.io.StringWriter;
-import java.net.InetAddress;
 import java.net.URI;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -45,22 +44,18 @@ import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
 import java.util.concurrent.ScheduledThreadPoolExecutor;
-import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsAction;
-import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.hbase.Abortable;
 import org.apache.hadoop.hbase.Cell;
 import org.apache.hadoop.hbase.ClusterStatus;
@@ -116,10 +111,7 @@ import org.apache.hadoop.hbase.util.hbck.TableLockChecker;
 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import org.apache.hadoop.hbase.security.AccessDeniedException;
-import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
 import org.apache.hadoop.io.IOUtils;
-import org.apache.hadoop.ipc.RemoteException;
-import org.apache.hadoop.security.AccessControlException;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.util.ReflectionUtils;
 import org.apache.hadoop.util.Tool;
@@ -188,8 +180,6 @@ public class HBaseFsck extends Configured {
   private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2;
   private static final int DEFAULT_MAX_MERGE = 5;
   private static final String TO_BE_LOADED = "to_be_loaded";
-  private static final String HBCK_LOCK_FILE = "hbase-hbck.lock";
-
 
   /**********************
    * Internal resources
@@ -204,11 +194,6 @@ public class HBaseFsck extends Configured {
   private long startMillis = System.currentTimeMillis();
   private HFileCorruptionChecker hfcc;
   private int retcode = 0;
-  private static Path HBCK_LOCK_PATH;
-  private FSDataOutputStream hbckOutFd;
-  // This flag prevents the hbck lock from being cleaned up twice, once by the
-  // ShutdownHook and once by the main code.
-  private static AtomicBoolean hbckLockCleanup = new AtomicBoolean(false);
 
   /***********
    * Options
@@ -317,79 +302,11 @@ public class HBaseFsck extends Configured {
     this.executor = exec;
   }
 
-  /**
-   * This method maintains a lock using a file. If the creation fails we return null.
-   *
-   * @return FSDataOutputStream object corresponding to the newly opened lock file
-   * @throws IOException
-   */
-  private FSDataOutputStream checkAndMarkRunningHbck() throws IOException {
-    try {
-      FileSystem fs = FSUtils.getCurrentFileSystem(getConf());
-      FsPermission defaultPerms = FSUtils.getFilePermissions(fs, getConf(),
-          HConstants.DATA_FILE_UMASK_KEY);
-      Path tmpDir = new Path(FSUtils.getRootDir(getConf()), HConstants.HBASE_TEMP_DIRECTORY);
-      fs.mkdirs(tmpDir);
-      HBCK_LOCK_PATH = new Path(tmpDir, HBCK_LOCK_FILE);
-      final FSDataOutputStream out = FSUtils.create(fs, HBCK_LOCK_PATH, defaultPerms, false);
-      out.writeBytes(InetAddress.getLocalHost().toString());
-      out.flush();
-      return out;
-    } catch (RemoteException e) {
-      if (AlreadyBeingCreatedException.class.getName().equals(e.getClassName())) {
-        return null;
-      } else {
-        throw e;
-      }
-    }
-  }
-
-  private void unlockHbck() throws IOException {
-    if (hbckLockCleanup.compareAndSet(false, true)) {
-      IOUtils.closeStream(hbckOutFd);
-      try {
-        FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()), HBCK_LOCK_PATH, true);
-        // Reset hbckLockCleanup to false so that subsequent calls using the same
-        // HBaseFsck object succeed. This is added for tests, which keep re-using
-        // the same objects.
-        hbckLockCleanup.set(false);
-      } catch (IOException ioe) {
-        LOG.warn("Failed to delete " + HBCK_LOCK_PATH);
-        LOG.debug(ioe);
-      }
-    }
-  }
-
   /**
    * To repair region consistency, one must call connect() in order to repair
    * online state.
    */
   public void connect() throws IOException {
-    // Check if another instance of hbck is running
-    hbckOutFd = checkAndMarkRunningHbck();
-    if (hbckOutFd == null) {
-      setRetCode(-1);
-      LOG.error("Another instance of hbck is running, exiting this instance. [If you are sure" +
-          " no other instance is running, delete the lock file " +
-          HBCK_LOCK_PATH + " and rerun the tool]");
-      throw new IOException("Duplicate hbck - Abort");
-    }
-
-    // Add a shutdown hook to this thread, in case the user tries to kill hbck
-    // with a ctrl-c; we want to clean up the lock so that it is available for
-    // further calls.
-    Runtime.getRuntime().addShutdownHook(new Thread() {
-      public void run() {
-        try {
-          unlockHbck();
-        } catch (Exception e) {
-          LOG.debug("Error while removing hbck lock: " + e.getMessage());
-        }
-      }
-    });
-
-    LOG.debug("Launching hbck");
-
     connection = HConnectionManager.createConnection(getConf());
     admin = new HBaseAdmin(connection);
     meta = new HTable(TableName.META_TABLE_NAME, connection);
@@ -584,9 +501,6 @@ public class HBaseFsck extends Configured {
 
     checkAndFixTableLocks();
 
-    // Remove the hbck lock
-    unlockHbck();
-
     // Print table summary
     printTableSummary(tablesInfo);
     return errors.summarize();
@@ -4048,6 +3962,7 @@ public class HBaseFsck extends Configured {
     Path hbasedir = FSUtils.getRootDir(conf);
     URI defaultFs = hbasedir.getFileSystem(conf).getUri();
     FSUtils.setFsDefault(conf, new Path(defaultFs));
     int ret = ToolRunner.run(new HBaseFsckTool(conf), args);
     System.exit(ret);
   }
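
The connect() code removed above pairs the shutdown hook with an AtomicBoolean (hbckLockCleanup) so the lock is released exactly once, whether the process exits normally or is killed with ctrl-c. A minimal runnable sketch of that run-once guard, with illustrative names and a println standing in for the actual lock deletion:

    import java.util.concurrent.atomic.AtomicBoolean;

    /** Illustrative sketch of the run-once cleanup guard, not the HBase source. */
    public class CleanupGuardSketch {
      private static final AtomicBoolean cleanedUp = new AtomicBoolean(false);

      public static void main(String[] args) {
        // The hook runs on normal exit and on SIGINT/ctrl-c, so cleanup can be
        // reached twice: once from the main path and once from the hook.
        Runtime.getRuntime().addShutdownHook(new Thread() {
          public void run() {
            cleanup();
          }
        });

        // ... real work would happen here ...

        cleanup(); // main-path cleanup; the hook's later call becomes a no-op
      }

      static void cleanup() {
        // compareAndSet makes the release idempotent across both callers.
        if (cleanedUp.compareAndSet(false, true)) {
          System.out.println("releasing lock file"); // stand-in for deleting the lock
        }
      }
    }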

hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java

@@ -40,11 +40,8 @@ import java.util.Map;
 import java.util.Map.Entry;
 import java.util.NavigableMap;
 import java.util.Set;
-import java.util.concurrent.Callable;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
 import java.util.concurrent.ScheduledThreadPoolExecutor;
 import java.util.concurrent.SynchronousQueue;
 import java.util.concurrent.ThreadPoolExecutor;
@@ -538,49 +535,6 @@ public class TestHBaseFsck {
     }
   }
 
-  /**
-   * This test makes sure that parallel instances of hbck are disallowed.
-   *
-   * @throws Exception
-   */
-  @Test
-  public void testParallelHbck() throws Exception {
-    final ExecutorService service;
-    final Future<HBaseFsck> hbck1, hbck2;
-    class RunHbck implements Callable<HBaseFsck> {
-      boolean fail = true;
-      public HBaseFsck call() {
-        try {
-          return doFsck(conf, false);
-        } catch (Exception e) {
-          if (e.getMessage().contains("Duplicate hbck")) {
-            fail = false;
-          }
-        }
-        // If we reach here, then an exception was caught
-        if (fail) fail();
-        return null;
-      }
-    }
-
-    service = Executors.newFixedThreadPool(2);
-    hbck1 = service.submit(new RunHbck());
-    hbck2 = service.submit(new RunHbck());
-    service.shutdown();
-    // Wait till both hbck calls finish
-    service.awaitTermination(Integer.MAX_VALUE, TimeUnit.SECONDS);
-
-    HBaseFsck h1 = hbck1.get();
-    HBaseFsck h2 = hbck2.get();
-    // Make sure only one of the calls was successful
-    assert (h1 == null || h2 == null);
-    if (h1 != null) {
-      assert (h1.getRetCode() >= 0);
-    }
-    if (h2 != null) {
-      assert (h2.getRetCode() >= 0);
-    }
-  }
   /**
    * This creates and fixes a bad table with regions that have a duplicate
    * start key