HADOOP-1966 Make HBase unit tests more reliable in the Hudson environment.

Set hbase.rootdir in test/hbase-site.xml; when running a test, the default value does not work consistently.
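As a rough illustration (not part of the commit), a test that builds an HBaseConfiguration should now see the fixed root directory. HBaseConfiguration and the hbase.rootdir property come from this codebase and the hbase-site.xml hunk near the end of the diff; the class name and main() below are only a sketch:

import org.apache.hadoop.hbase.HBaseConfiguration;

// Illustrative sketch: with the src/test/hbase-site.xml change in this diff,
// hbase.rootdir resolves to "/hbase" instead of an unreliable default.
public class ShowTestRootDir {
  public static void main(String[] args) {
    HBaseConfiguration conf = new HBaseConfiguration(); // loads hbase-default.xml and hbase-site.xml
    System.out.println("hbase.rootdir = " + conf.get("hbase.rootdir", "<unset>"));
  }
}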

When an HBase mini cluster is started on top of an existing mini DFS cluster, shutting down the HBase mini cluster no longer shuts down the underlying mini DFS cluster.
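In outline, MiniHBaseCluster now records whether it started the DFS cluster itself and only stops DFS when it did. A simplified sketch of that ownership flag follows; the shutdownDFS name mirrors the MiniHBaseCluster hunk below, while the class and helper names are hypothetical and everything else is trimmed:

// Sketch of the shutdownDFS ownership guard; not the real MiniHBaseCluster.
class MiniClusterSketch {
  private final Object dfsCluster;    // stands in for MiniDFSCluster
  private final boolean shutdownDFS;  // true only if this instance started DFS

  // Caller supplies an already-running DFS cluster: we must not stop it later.
  MiniClusterSketch(Object existingDfsCluster) {
    this.dfsCluster = existingDfsCluster;
    this.shutdownDFS = false;
  }

  // No DFS cluster supplied: start one and take ownership of its shutdown.
  MiniClusterSketch() {
    this.dfsCluster = startMiniDfs();
    this.shutdownDFS = true;
  }

  void shutdown() {
    // ... shut down the HBase master and region server threads first ...
    if (shutdownDFS && dfsCluster != null) {
      stopMiniDfs(dfsCluster); // only touch DFS if we created it
    }
  }

  private static Object startMiniDfs() { return new Object(); }   // placeholder
  private static void stopMiniDfs(Object dfs) { /* placeholder */ }
}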

TestDFSAbort now catches exceptions, prints the stack trace, and re-throws them, so the log shows where in the test the exception occurred.

Catch runtime exceptions that were escaping from FSUtils.isFileSystemAvailable, making detection of DFS failure more reliable. HRegionServer also now verifies, before servicing each client request, that it is still running and that its file system is available.
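Roughly, every client-facing RPC now calls a checkOpen() guard first, and the filesystem probe is wrapped so runtime exceptions mark the server as aborting rather than escaping. A compressed sketch of the pattern follows; the field names stopRequested, abortRequested, and fsOk and the methods checkOpen()/checkFileSystem() mirror the HRegionServer diff below, while the class name, probeFileSystem(), and the get() wrapper are placeholders:

import java.io.IOException;
import java.util.concurrent.atomic.AtomicBoolean;

// Sketch of the checkOpen()/checkFileSystem() pattern; not the real HRegionServer.
class RegionServerGuardSketch {
  private final AtomicBoolean stopRequested = new AtomicBoolean(false);
  private volatile boolean abortRequested = false;
  private volatile boolean fsOk = true;

  // Every client RPC calls this first so requests fail fast once shutdown begins.
  private void checkOpen() throws IOException {
    if (stopRequested.get() || abortRequested) {
      throw new IOException("Server not running");
    }
    if (!fsOk) {
      throw new IOException("File system not available");
    }
  }

  // Probe the filesystem; any exception (including runtime exceptions that
  // previously escaped) is treated as "filesystem gone" and aborts the server.
  protected boolean checkFileSystem() {
    if (fsOk) {
      boolean available;
      try {
        available = probeFileSystem();  // stand-in for FSUtils.isFileSystemAvailable(fs)
      } catch (Exception e) {
        available = false;
      }
      if (!available) {
        fsOk = false;
        abortRequested = true;
        stopRequested.set(true);
      }
    }
    return fsOk;
  }

  private boolean probeFileSystem() { return true; } // placeholder probe

  // Example RPC wrapper: guard first, then do the work.
  public byte[] get(String row) throws IOException {
    checkOpen();
    return new byte[0]; // real region lookup omitted
  }
}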



git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk/src/contrib/hbase@580745 13f79535-47bb-0310-9956-ffa450edef68
Jim Kellerman 2007-09-30 16:09:38 +00:00
parent 354c848546
commit 8a3bc9c23e
8 changed files with 148 additions and 102 deletions


@ -62,6 +62,7 @@ Trunk (unreleased changes)
down is inconsistent b) TestDFSAbort failed in nightly #242
HADOOP-1929 Add hbase-default.xml to hbase jar
HADOOP-1941 StopRowFilter throws NPE when passed null row
HADOOP-1966 Make HBase unit tests more reliable in the Hudson environment.
IMPROVEMENTS
HADOOP-1737 Make HColumnDescriptor data publically members settable


@ -71,9 +71,6 @@ public interface HConstants {
/** Used to construct the name of the directory in which a HRegion resides */
static final String HREGIONDIR_PREFIX = "hregion_";
// TODO: Someone may try to name a column family 'log'. If they
// do, it will clash with the HREGION log dir subdirectory. FIX.
/** Used to construct the name of the log directory for a region server */
static final String HREGION_LOGDIR_NAME = "log";


@ -504,7 +504,7 @@ HMasterRegionInterface {
LOG.error("Scan ROOT region", e);
if (tries == numRetries - 1) {
// We ran out of tries. Make sure the file system is still available
if (checkFileSystem()) {
if (!checkFileSystem()) {
continue; // Avoid sleeping.
}
}
@ -654,7 +654,7 @@ HMasterRegionInterface {
if (tries == numRetries - 1) {
// We ran out of tries. Make sure the file system is still
// available
if (checkFileSystem()) {
if (!checkFileSystem()) {
continue; // avoid sleeping
}
}
@ -941,7 +941,7 @@ HMasterRegionInterface {
*/
protected boolean checkFileSystem() {
if (fsOk) {
if (!FSUtils.isFileSystemAvailable(fs, closed)) {
if (!FSUtils.isFileSystemAvailable(fs)) {
LOG.fatal("Shutting down HBase cluster: file system not available");
closed.set(true);
fsOk = false;


@ -139,6 +139,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
30 * 1000), stop);
}
/** {@inheritDoc} */
public void closing(final Text regionName) {
lock.writeLock().lock();
try {
@ -154,6 +155,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
}
}
/** {@inheritDoc} */
public void closed(final Text regionName) {
lock.writeLock().lock();
try {
@ -458,9 +460,17 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
// get it when the master is panicing because for instance
// the HDFS has been yanked out from under it. Be wary of
// this message.
if (checkFileSystem()) {
closeAllRegions();
restart = true;
try {
if (checkFileSystem()) {
closeAllRegions();
restart = true;
}
} catch (Exception e) {
LOG.fatal("file system available check failed. " +
"Shutting down server.", e);
this.stopRequested.set(true);
this.fsOk = false;
this.abortRequested = true;
}
break;
@ -945,6 +955,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
public byte [] get(final Text regionName, final Text row,
final Text column) throws IOException {
checkOpen();
requestCount.incrementAndGet();
try {
return getRegion(regionName).get(row, column);
@ -959,6 +970,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
public byte [][] get(final Text regionName, final Text row,
final Text column, final int numVersions) throws IOException {
checkOpen();
requestCount.incrementAndGet();
try {
return getRegion(regionName).get(row, column, numVersions);
@ -973,6 +985,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
public byte [][] get(final Text regionName, final Text row, final Text column,
final long timestamp, final int numVersions) throws IOException {
checkOpen();
requestCount.incrementAndGet();
try {
return getRegion(regionName).get(row, column, timestamp, numVersions);
@ -987,6 +1000,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
public MapWritable getRow(final Text regionName, final Text row)
throws IOException {
checkOpen();
requestCount.incrementAndGet();
try {
HRegion region = getRegion(regionName);
@ -1007,6 +1021,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
/** {@inheritDoc} */
public MapWritable next(final long scannerId) throws IOException {
checkOpen();
requestCount.incrementAndGet();
try {
String scannerName = String.valueOf(scannerId);
@ -1044,7 +1059,9 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
/** {@inheritDoc} */
public void batchUpdate(Text regionName, long timestamp, BatchUpdate b)
throws IOException {
throws IOException {
checkOpen();
requestCount.incrementAndGet();
// If timestamp == LATEST_TIMESTAMP and we have deletes, then they need
// special treatment. For these we need to first find the latest cell so
@ -1093,9 +1110,12 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
// remote scanner interface
//
/** {@inheritDoc} */
public long openScanner(Text regionName, Text[] cols, Text firstRow,
final long timestamp, final RowFilterInterface filter)
throws IOException {
final long timestamp, final RowFilterInterface filter)
throws IOException {
checkOpen();
requestCount.incrementAndGet();
try {
HRegion r = getRegion(regionName);
@ -1110,7 +1130,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
leases.createLease(scannerId, scannerId, new ScannerListener(scannerName));
return scannerId;
} catch (IOException e) {
LOG.error("Opening scanner (fsOk: " + this.fsOk + ")",
LOG.error("Error opening scanner (fsOk: " + this.fsOk + ")",
RemoteExceptionHandler.checkIOException(e));
checkFileSystem();
throw e;
@ -1119,6 +1139,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
/** {@inheritDoc} */
public void close(final long scannerId) throws IOException {
checkOpen();
requestCount.incrementAndGet();
try {
String scannerName = String.valueOf(scannerId);
@ -1254,6 +1275,20 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
}
}
/**
* Called to verify that this server is up and running.
*
* @throws IOException
*/
private void checkOpen() throws IOException {
if (stopRequested.get() || abortRequested) {
throw new IOException("Server not running");
}
if (!fsOk) {
throw new IOException("File system not available");
}
}
/**
* Checks to see if the file system is still accessible.
* If not, sets abortRequested and stopRequested
@ -1265,10 +1300,14 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
FileSystem fs = null;
try {
fs = FileSystem.get(this.conf);
} catch (IOException e) {
if (fs != null && !FSUtils.isFileSystemAvailable(fs)) {
LOG.fatal("Shutting down HRegionServer: file system not available");
this.abortRequested = true;
this.stopRequested.set(true);
fsOk = false;
}
} catch (Exception e) {
LOG.error("Failed get of filesystem", e);
}
if (fs != null && !FSUtils.isFileSystemAvailable(fs, stopRequested)) {
LOG.fatal("Shutting down HRegionServer: file system not available");
this.abortRequested = true;
this.stopRequested.set(true);
@ -1301,6 +1340,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
return regionsToCheck;
}
/** {@inheritDoc} */
public long getProtocolVersion(final String protocol,
@SuppressWarnings("unused") final long clientVersion)
throws IOException {


@ -26,7 +26,6 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.dfs.DistributedFileSystem;
/**
@ -38,48 +37,37 @@ public class FSUtils {
/**
* Not instantiable
*/
private FSUtils() {super();}
private FSUtils() {}
/**
* Checks to see if the specified file system is available
*
* @param fs
* @param closed Optional flag. If non-null and set, will abort test of
* filesytem. Presumption is a flag shared by multiple threads. Another
* may have already determined the filesystem -- or something else -- bad.
* @return true if the specified file system is available.
*/
public static boolean isFileSystemAvailable(final FileSystem fs,
final AtomicBoolean closed) {
public static boolean isFileSystemAvailable(final FileSystem fs) {
if (!(fs instanceof DistributedFileSystem)) {
return true;
}
String exception = "";
boolean available = false;
DistributedFileSystem dfs = (DistributedFileSystem) fs;
int maxTries = dfs.getConf().getInt("hbase.client.retries.number", 3);
Path root =
fs.makeQualified(new Path(dfs.getConf().get(HConstants.HBASE_DIR, "/")));
for (int i = 0; i < maxTries && (closed == null || !closed.get()); i++) {
IOException ex = null;
try {
if (dfs.exists(root)) {
available = true;
break;
}
} catch (IOException e) {
ex = e;
try {
if (dfs.exists(new Path("/"))) {
available = true;
}
String exception = (ex == null)? "": ": " + ex.getMessage();
LOG.info("Failed exists test on " + root + " by thread " +
Thread.currentThread().getName() + " (Attempt " + i + " of " +
maxTries +"): " + exception);
} catch (IOException e) {
exception = e.getMessage();
}
LOG.info("Failed file system available test. Thread: " +
Thread.currentThread().getName() + ": " + exception);
try {
if (!available) {
fs.close();
}
} catch (IOException e) {
} catch (Exception e) {
LOG.error("file system close failed: ", e);
}
return available;


@ -75,4 +75,8 @@
the master will notice a dead region server sooner. The default is 15 seconds.
</description>
</property>
<property>
<name>hbase.rootdir</name>
<value>/hbase</value>
<description>location of HBase instance in dfs</description></property>
</configuration>


@ -48,6 +48,7 @@ public class MiniHBaseCluster implements HConstants {
private Configuration conf;
private MiniDFSCluster cluster;
private FileSystem fs;
private boolean shutdownDFS;
private Path parentdir;
private MasterThread masterThread = null;
ArrayList<RegionServerThread> regionThreads =
@ -84,8 +85,14 @@ public class MiniHBaseCluster implements HConstants {
/**
* Starts a MiniHBaseCluster on top of an existing HDFSCluster
*
* Note that if you use this constructor, you should shut down the mini dfs
* cluster in your test case.
****************************************************************************
* * * * * * N O T E * * * * *
*
* If you use this constructor, you should shut down the mini dfs cluster
* in your test case.
*
* * * * * * N O T E * * * * *
****************************************************************************
*
* @param conf
* @param nRegionNodes
@ -98,6 +105,7 @@ public class MiniHBaseCluster implements HConstants {
this.conf = conf;
this.fs = dfsCluster.getFileSystem();
this.cluster = dfsCluster;
this.shutdownDFS = false;
init(nRegionNodes);
}
@ -118,9 +126,11 @@ public class MiniHBaseCluster implements HConstants {
this.conf = conf;
this.deleteOnExit = deleteOnExit;
this.shutdownDFS = false;
if (miniHdfsFilesystem) {
this.cluster = new MiniDFSCluster(this.conf, 2, format, (String[])null);
this.fs = cluster.getFileSystem();
this.shutdownDFS = true;
} else {
this.cluster = null;
this.fs = FileSystem.get(conf);
@ -390,11 +400,14 @@ public class MiniHBaseCluster implements HConstants {
regionServerThreads.size() + " region server(s)");
}
/**
* Shut down the mini HBase cluster
*/
public void shutdown() {
MiniHBaseCluster.shutdown(this.masterThread, this.regionThreads);
try {
if (cluster != null) {
if (shutdownDFS && cluster != null) {
FileSystem fs = cluster.getFileSystem();
LOG.info("Shutting down Mini DFS cluster");


@ -30,32 +30,35 @@ import org.apache.log4j.Logger;
*/
public class TestDFSAbort extends HBaseClusterTestCase {
/** constructor */
public TestDFSAbort() {
super();
Logger.getRootLogger().setLevel(Level.WARN);
Logger.getLogger(this.getClass().getPackage().getName()).setLevel(Level.DEBUG);
}
/** {@inheritDoc} */
@Override
public void setUp() throws Exception {
super.setUp();
HTableDescriptor desc = new HTableDescriptor(getName());
desc.addFamily(new HColumnDescriptor(HConstants.COLUMN_FAMILY_STR));
HBaseAdmin admin = new HBaseAdmin(conf);
admin.createTable(desc);
try {
super.setUp();
HTableDescriptor desc = new HTableDescriptor(getName());
desc.addFamily(new HColumnDescriptor(HConstants.COLUMN_FAMILY_STR));
HBaseAdmin admin = new HBaseAdmin(conf);
admin.createTable(desc);
} catch (Exception e) {
e.printStackTrace();
throw e;
}
}
/**
* @throws Exception
*/
public void testDFSAbort() throws Exception {
// By now the Mini DFS is running, Mini HBase is running and we have
// created a table. Now let's yank the rug out from HBase
cluster.getDFSCluster().shutdown();
// Now wait for Mini HBase Cluster to shut down
cluster.join();
try {
// By now the Mini DFS is running, Mini HBase is running and we have
// created a table. Now let's yank the rug out from HBase
cluster.getDFSCluster().shutdown();
// Now wait for Mini HBase Cluster to shut down
cluster.join();
} catch (Exception e) {
e.printStackTrace();
throw e;
}
}
/**