HBASE-5128 [uber hbck] Online automated repair of table integrity and region consistency problems

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1304665 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jonathan Hsieh 2012-03-23 23:53:55 +00:00
parent 560173f756
commit f2d637ffa5
14 changed files with 2254 additions and 432 deletions

View File

@ -509,8 +509,16 @@ public class HFile {
preferredEncodingInCache, hfs);
}
/**
 * Creates a reader for the given file by delegating to
 * {@code createReaderWithEncoding} with {@link DataBlockEncoding#NONE}.
 *
 * @param fs filesystem
 * @param path path of the file to read
 * @param cacheConf cache configuration; must not be null (it is checked with
 *          {@code Preconditions.checkNotNull} before anything else happens), see
 *          {@link org.apache.hadoop.hbase.io.hfile.CacheConfig#CacheConfig(Configuration)}
 * @return an active Reader instance
 * @throws IOException propagated from {@code createReaderWithEncoding}
 */
public static Reader createReader(FileSystem fs, Path path, CacheConfig cacheConf)
    throws IOException {
  final String nullConfMessage = "Cannot create Reader with null CacheConf";
  Preconditions.checkNotNull(cacheConf, nullConfMessage);
  final DataBlockEncoding encoding = DataBlockEncoding.NONE;
  return createReaderWithEncoding(fs, path, cacheConf, encoding);
}

View File

@ -218,7 +218,6 @@ public interface HMasterInterface extends VersionedProtocol {
public void unassign(final byte [] regionName, final boolean force)
throws IOException;
/**
* Offline a region from the assignment manager's in-memory state. The
* region should be in a closed state and there will be no attempt to

View File

@ -1034,8 +1034,9 @@ public class AssignmentManager extends ZooKeeperListener {
regionInfo = regionState.getRegion();
} else {
try {
regionInfo = MetaReader.getRegion(catalogTracker,
data.getRegionName()).getFirst();
byte[] name = data.getRegionName();
Pair<HRegionInfo, ServerName> p = MetaReader.getRegion(catalogTracker, name);
regionInfo = p.getFirst();
} catch (IOException e) {
LOG.info("Exception reading META doing HBCK repair operation", e);
return;

View File

@ -1904,18 +1904,17 @@ Server {
public double getAverageLoad() {
return this.assignmentManager.getAverageLoad();
}
/**
* Special method, only used by hbck.
*/
@Override
public void offline(final byte[] regionName)
throws IOException {
public void offline(final byte[] regionName) throws IOException {
Pair<HRegionInfo, ServerName> pair =
MetaReader.getRegion(this.catalogTracker, regionName);
if (pair == null) throw new UnknownRegionException(Bytes.toStringBinary(regionName));
HRegionInfo hri = pair.getFirst();
this.assignmentManager.regionOffline(hri);
this.assignmentManager.regionOffline(hri);
}
/**

File diff suppressed because it is too large Load Diff

View File

@ -21,44 +21,55 @@ package org.apache.hadoop.hbase.util;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.ipc.HRegionInterface;
import org.apache.hadoop.hbase.master.AssignmentManager.RegionState;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.wal.HLog;
import org.apache.zookeeper.KeeperException;
/**
* This class contains helper methods that repair parts of hbase's filesystem
* contents.
*/
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class HBaseFsckRepair {
public static final Log LOG = LogFactory.getLog(HBaseFsckRepair.class);
/**
* Fix dupe assignment by doing silent closes on each RS hosting the region
* Fix multiple assignment by doing silent closes on each RS hosting the region
* and then force ZK unassigned node to OFFLINE to trigger assignment by
* master.
* @param admin
* @param region
* @param servers
* @throws IOException
* @throws KeeperException
* @throws InterruptedException
*
* @param admin HBase admin used to undeploy
* @param region Region to undeploy
* @param servers list of Servers to undeploy from
*/
public static void fixDupeAssignment(HBaseAdmin admin, HRegionInfo region,
public static void fixMultiAssignment(HBaseAdmin admin, HRegionInfo region,
List<ServerName> servers)
throws IOException, KeeperException, InterruptedException {
HRegionInfo actualRegion = new HRegionInfo(region);
// Close region on the servers silently
for(ServerName server : servers) {
closeRegionSilentlyAndWait(admin.getConfiguration(), server, actualRegion);
closeRegionSilentlyAndWait(admin, server, actualRegion);
}
// Force ZK node to OFFLINE so master assigns
@ -67,58 +78,133 @@ public class HBaseFsckRepair {
/**
* Fix unassigned by creating/transition the unassigned ZK node for this
* region to OFFLINE state with a special flag to tell the master that this
* is a forced operation by HBCK.
* @param admin
* region to OFFLINE state with a special flag to tell the master that this is
* a forced operation by HBCK.
*
* This assumes that info is in META.
*
* @param conf
* @param region
* @throws IOException
* @throws KeeperException
*/
public static void fixUnassigned(HBaseAdmin admin, HRegionInfo region)
throws IOException, KeeperException {
throws IOException, KeeperException {
HRegionInfo actualRegion = new HRegionInfo(region);
// Force ZK node to OFFLINE so master assigns
forceOfflineInZK(admin, actualRegion);
}
/**
 * Pushes the region back into transition by asking the master, through
 * {@code admin}, to assign it.
 * <p>
 * In 0.90 this forced an HRI offline by setting the RegionTransitionData in ZK
 * to have HBCK_CODE_NAME as the server, a special case in the
 * AssignmentManager that attempts an assign call by the master. That doesn't
 * seem to work properly in the updated version of 0.92+'s hbck, so assign is
 * used to force the region into transition instead. This has the side-effect
 * of requiring a HRegionInfo that considers regionId (timestamp) in
 * comparators, which is addressed by HBASE-5563.
 *
 * @param admin HBase admin used to issue the assign request
 * @param region region whose name is passed to the assign request
 * @see org.apache.hadoop.hbase.master.AssignmentManager#handleHBCK
 */
private static void forceOfflineInZK(HBaseAdmin admin, final HRegionInfo region)
    throws ZooKeeperConnectionException, KeeperException, IOException {
  final byte[] regionName = region.getRegionName();
  admin.assign(regionName);
}
private static void closeRegionSilentlyAndWait(Configuration conf,
ServerName server, HRegionInfo region) throws IOException,
InterruptedException {
HConnection connection = HConnectionManager.getConnection(conf);
boolean success = false;
/*
* Should we check all assignments or just not in RIT?
*/
public static void waitUntilAssigned(HBaseAdmin admin,
HRegionInfo region) throws IOException, InterruptedException {
HConnection connection = admin.getConnection();
try {
HRegionInterface rs =
connection.getHRegionConnection(server.getHostname(), server.getPort());
rs.closeRegion(region, false);
long timeout = conf.getLong("hbase.hbck.close.timeout", 120000);
long timeout = admin.getConfiguration().getLong("hbase.hbck.assign.timeout", 120000);
long expiration = timeout + System.currentTimeMillis();
while (System.currentTimeMillis() < expiration) {
try {
HRegionInfo rsRegion = rs.getRegionInfo(region.getRegionName());
if (rsRegion == null)
throw new NotServingRegionException();
} catch (Exception e) {
success = true;
return;
Map<String, RegionState> rits=
admin.getClusterStatus().getRegionsInTransition();
if (rits.keySet() != null && !rits.keySet().contains(region.getEncodedName())) {
// yay! no longer RIT
return;
}
// still in rit
LOG.info("Region still in transition, waiting for "
+ "it to become assigned: " + region);
} catch (IOException e) {
LOG.warn("Exception when waiting for region to become assigned,"
+ " retrying", e);
}
Thread.sleep(1000);
}
throw new IOException("Region " + region + " failed to close within"
+ " timeout " + timeout);
throw new IOException("Region " + region + " failed to move out of " +
"transition within timeout " + timeout + "ms");
} finally {
try {
connection.close();
} catch (IOException ioe) {
if (success) {
throw ioe;
}
throw ioe;
}
}
}
/**
 * Asks a region server directly to close a region and then polls it, once a
 * second, until the region is gone. This bypasses the active hmaster.
 * <p>
 * The wait is bounded by {@code hbase.hbck.close.timeout} milliseconds
 * (default 120000). The region counts as closed as soon as the region server
 * either returns no region info for it or answers the query with an
 * IOException.
 *
 * @param admin HBase admin supplying the connection and configuration
 * @param server region server to contact
 * @param region region to close
 * @throws IOException if the close request fails or the region is still
 *           reported when the timeout expires
 * @throws InterruptedException if interrupted while sleeping between polls
 */
public static void closeRegionSilentlyAndWait(HBaseAdmin admin,
    ServerName server, HRegionInfo region) throws IOException, InterruptedException {
  HRegionInterface regionServer = admin.getConnection()
      .getHRegionConnection(server.getHostname(), server.getPort());
  regionServer.closeRegion(region, false);

  final long timeout = admin.getConfiguration()
      .getLong("hbase.hbck.close.timeout", 120000);
  final long deadline = timeout + System.currentTimeMillis();
  while (System.currentTimeMillis() < deadline) {
    HRegionInfo stillHosted;
    try {
      stillHosted = regionServer.getRegionInfo(region.getRegionName());
    } catch (IOException ioe) {
      // the region server no longer answers for this region: treat as closed
      return;
    }
    if (stillHosted == null) {
      return;
    }
    Thread.sleep(1000);
  }
  throw new IOException("Region " + region + " failed to close within"
      + " timeout " + timeout);
}
/**
 * Puts the specified HRegionInfo into META.
 * <p>
 * A single Put keyed by the region name writes the serialized HRegionInfo to
 * the catalog family's regioninfo qualifier. The META table handle opened for
 * the write is always closed, even when the put fails.
 *
 * @param conf configuration used to open the META table
 * @param hri region info to write
 * @throws IOException if serializing the region info, opening META, writing
 *           the row, or closing the table fails
 */
public static void fixMetaHoleOnline(Configuration conf,
    HRegionInfo hri) throws IOException {
  Put p = new Put(hri.getRegionName());
  p.add(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER,
      Writables.getBytes(hri));
  HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
  try {
    meta.put(p);
  } finally {
    // release the table handle even if the put threw
    meta.close();
  }
}
/**
 * Creates a new region under the HBase root directory, then immediately
 * closes it so its contents are flushed, and closes and deletes its log.
 *
 * @param conf configuration used to locate the root directory
 * @param hri region info describing the region to create
 * @param htd descriptor of the table the region belongs to
 * @return the (already closed) region that was created
 * @throws IOException if creating or closing the region or its log fails
 */
public static HRegion createHDFSRegionDir(Configuration conf,
    HRegionInfo hri, HTableDescriptor htd) throws IOException {
  HRegion created = HRegion.createHRegion(hri, FSUtils.getRootDir(conf), conf, htd);
  HLog regionLog = created.getLog();

  // Closing the region flushes it to disk; the log file is closed afterwards.
  created.close();
  regionLog.closeAndDelete();
  return created;
}
}

View File

@ -44,7 +44,6 @@ import org.apache.hadoop.io.MultipleIOException;
@InterfaceStability.Evolving
public class OfflineMetaRepair {
private static final Log LOG = LogFactory.getLog(HBaseFsck.class.getName());
HBaseFsck fsck;
protected static void printUsageAndExit() {
System.err.println("Usage: OfflineMetaRepair [opts] ");
@ -52,6 +51,8 @@ public class OfflineMetaRepair {
System.err
.println(" -details Display full report of all regions.");
System.err.println(" -base <hdfs://> Base Hbase Data directory");
System.err.println(" -fix Auto fix as many problems as possible");
System.err.println(" -fixHoles Auto fix as region holes");
Runtime.getRuntime().exit(-2);
}
@ -67,18 +68,24 @@ public class OfflineMetaRepair {
Configuration conf = HBaseConfiguration.create();
conf.set("fs.defaultFS", conf.get(HConstants.HBASE_DIR));
HBaseFsck fsck = new HBaseFsck(conf);
boolean fixHoles = false;
// Process command-line args.
for (int i = 0; i < args.length; i++) {
String cmd = args[i];
if (cmd.equals("-details")) {
fsck.displayFullReport();
fsck.setDisplayFullReport();
} else if (cmd.equals("-base")) {
// update hbase root dir to user-specified base
i++;
String path = args[i];
conf.set(HConstants.HBASE_DIR, path);
conf.set("fs.defaultFS", conf.get(HConstants.HBASE_DIR));
} else if (cmd.equals("-fixHoles")) {
fixHoles = true;
} else if (cmd.equals("-fix")) {
// make all fix options true
fixHoles = true;
} else {
String str = "Unknown command line option : " + cmd;
LOG.info(str);
@ -91,7 +98,7 @@ public class OfflineMetaRepair {
// threads cleanly, so we do a System.exit.
boolean success = false;
try {
success = fsck.rebuildMeta();
success = fsck.rebuildMeta(fixHoles);
} catch (MultipleIOException mioes) {
for (IOException ioe : mioes.getExceptions()) {
LOG.error("Bailed out due to:", ioe);

View File

@ -1211,11 +1211,16 @@ public class HBaseTestingUtility {
List<byte[]> rows = new ArrayList<byte[]>();
ResultScanner s = t.getScanner(new Scan());
for (Result result : s) {
HRegionInfo info = Writables.getHRegionInfo(
result.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER));
byte[] val = result.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
if (val == null) {
LOG.error("No region info for row " + Bytes.toString(result.getRow()));
// TODO figure out what to do for this new hosed case.
continue;
}
HRegionInfo info = Writables.getHRegionInfo(val);
if (Bytes.compareTo(info.getTableName(), tableName) == 0) {
LOG.info("getMetaTableRows: row -> " +
Bytes.toStringBinary(result.getRow()));
Bytes.toStringBinary(result.getRow()) + info);
rows.add(result.getRow());
}
}

View File

@ -23,8 +23,12 @@ import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@ -32,16 +36,27 @@ import java.util.Map.Entry;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MediumTests;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.ipc.HRegionInterface;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
@ -54,16 +69,20 @@ import org.junit.experimental.categories.Category;
*/
@Category(MediumTests.class)
public class TestHBaseFsck {
final Log LOG = LogFactory.getLog(getClass());
final static Log LOG = LogFactory.getLog(TestHBaseFsck.class);
private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
private final static Configuration conf = TEST_UTIL.getConfiguration();
private final static byte[] FAM = Bytes.toBytes("fam");
// for the instance, reset every test run
private HTable tbl;
private final static byte[][] splits= new byte[][] { Bytes.toBytes("A"),
private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
Bytes.toBytes("B"), Bytes.toBytes("C") };
// one row per region.
private final static byte[][] ROWKEYS= new byte[][] {
Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };
@BeforeClass
public static void setUpBeforeClass() throws Exception {
TEST_UTIL.getConfiguration().setBoolean("hbase.master.distributed.log.splitting", false);
@ -117,8 +136,8 @@ public class TestHBaseFsck {
assertErrors(doFsck(conf, true), new ERROR_CODE[]{
ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
// fixing assignements require opening regions is not synchronous. To make
// the test pass consistentyl so for now we bake in some sleep to let it
// fixing assignments require opening regions is not synchronous. To make
// the test pass consistently so for now we bake in some sleep to let it
// finish. 1s seems sufficient.
Thread.sleep(1000);
@ -135,6 +154,9 @@ public class TestHBaseFsck {
meta.close();
}
/**
* Create a new region in META.
*/
private HRegionInfo createRegion(Configuration conf, final HTableDescriptor
htd, byte[] startKey, byte[] endKey)
throws IOException {
@ -147,47 +169,102 @@ public class TestHBaseFsck {
return hri;
}
public void dumpMeta(HTableDescriptor htd) throws IOException {
List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(htd.getName());
/**
 * Debugging aid: logs, at INFO level, every META row key that
 * {@code TEST_UTIL.getMetaTableRows} returns for the given table.
 *
 * @param tableName name of the table whose META rows are logged
 */
private void dumpMeta(byte[] tableName) throws IOException {
  List<byte[]> rowKeys = TEST_UTIL.getMetaTableRows(tableName);
  for (int i = 0; i < rowKeys.size(); i++) {
    LOG.info(Bytes.toString(rowKeys.get(i)));
  }
}
private void deleteRegion(Configuration conf, final HTableDescriptor htd,
byte[] startKey, byte[] endKey) throws IOException {
/**
 * Undeploys a region: closes it directly on the given region server and then
 * asks the master to offline it. An IOException from either step is logged as
 * a warning and not rethrown.
 *
 * @param admin admin used to reach the region server and the master
 * @param sn region server currently hosting the region
 * @param hri region to undeploy
 */
private void undeployRegion(HBaseAdmin admin, ServerName sn,
    HRegionInfo hri) throws IOException, InterruptedException {
  try {
    HBaseFsckRepair.closeRegionSilentlyAndWait(admin, sn, hri);
    final byte[] regionName = hri.getRegionName();
    admin.getMaster().offline(regionName);
  } catch (IOException ioe) {
    final String message = "Got exception when attempting to offline region "
        + Bytes.toString(hri.getRegionName());
    LOG.warn(message, ioe);
  }
}
/**
 * Convenience overload of the eight-argument {@code deleteRegion} that never
 * removes only the .regioninfo file (it passes {@code false} for
 * {@code regionInfoOnly}).
 *
 * @param unassign if true unassign region if assigned
 * @param metaRow if true remove region's row from META
 * @param hdfs if true remove region's dir in HDFS
 */
private void deleteRegion(Configuration conf, final HTableDescriptor htd,
    byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
    boolean hdfs) throws IOException, InterruptedException {
  final boolean regionInfoOnly = false;
  deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, regionInfoOnly);
}
LOG.info("Before delete:");
dumpMeta(htd);
/**
* Delete a region from assignments, meta, or completely from hdfs.
* @param unassign if true unassign region if assigned
* @param metaRow if true remove region's row from META
* @param hdfs if true remove region's dir in HDFS
* @param regionInfoOnly if true remove a region dir's .regioninfo file
*/
private void deleteRegion(Configuration conf, final HTableDescriptor htd,
byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
boolean hdfs, boolean regionInfoOnly) throws IOException, InterruptedException {
LOG.info("** Before delete:");
dumpMeta(htd.getName());
Map<HRegionInfo, HServerAddress> hris = tbl.getRegionsInfo();
for (Entry<HRegionInfo, HServerAddress> e: hris.entrySet()) {
Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
for (Entry<HRegionInfo, ServerName> e: hris.entrySet()) {
HRegionInfo hri = e.getKey();
HServerAddress hsa = e.getValue();
if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
ServerName hsa = e.getValue();
if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
&& Bytes.compareTo(hri.getEndKey(), endKey) == 0) {
LOG.info("RegionName: " +hri.getRegionNameAsString());
byte[] deleteRow = hri.getRegionName();
TEST_UTIL.getHBaseAdmin().unassign(deleteRow, true);
LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
Path rootDir = new Path(conf.get(HConstants.HBASE_DIR));
FileSystem fs = rootDir.getFileSystem(conf);
Path p = new Path(rootDir + "/" + htd.getNameAsString(), hri.getEncodedName());
fs.delete(p, true);
if (unassign) {
LOG.info("Undeploying region " + hri + " from server " + hsa);
undeployRegion(new HBaseAdmin(conf), hsa, hri);
}
HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
Delete delete = new Delete(deleteRow);
meta.delete(delete);
if (regionInfoOnly) {
LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
Path rootDir = new Path(conf.get(HConstants.HBASE_DIR));
FileSystem fs = rootDir.getFileSystem(conf);
Path p = new Path(rootDir + "/" + htd.getNameAsString(), hri.getEncodedName());
Path hriPath = new Path(p, HRegion.REGIONINFO_FILE);
fs.delete(hriPath, true);
}
if (hdfs) {
LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
Path rootDir = new Path(conf.get(HConstants.HBASE_DIR));
FileSystem fs = rootDir.getFileSystem(conf);
Path p = new Path(rootDir + "/" + htd.getNameAsString(), hri.getEncodedName());
HBaseFsck.debugLsr(conf, p);
boolean success = fs.delete(p, true);
LOG.info("Deleted " + p + " sucessfully? " + success);
HBaseFsck.debugLsr(conf, p);
}
if (metaRow) {
HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
Delete delete = new Delete(deleteRow);
meta.delete(delete);
}
}
LOG.info(hri.toString() + hsa.toString());
}
TEST_UTIL.getMetaTableRows(htd.getName());
LOG.info("After delete:");
dumpMeta(htd);
LOG.info("*** After delete:");
dumpMeta(htd.getName());
}
/**
@ -201,11 +278,32 @@ public class TestHBaseFsck {
HTableDescriptor desc = new HTableDescriptor(tablename);
HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
TEST_UTIL.getHBaseAdmin().createTable(desc, splits);
TEST_UTIL.getHBaseAdmin().createTable(desc, SPLITS);
tbl = new HTable(TEST_UTIL.getConfiguration(), tablename);
List<Put> puts = new ArrayList<Put>();
for (byte[] row : ROWKEYS) {
Put p = new Put(row);
p.add(FAM, Bytes.toBytes("val"), row);
puts.add(p);
}
tbl.put(puts);
tbl.flushCommits();
return tbl;
}
/**
 * Counts the number of rows in the test table ({@code tbl}) with a full scan,
 * to verify data loss or the absence of data loss.
 * <p>
 * The scanner is always closed, including when the scan throws.
 *
 * @return the number of rows the scan returned
 * @throws IOException if opening the scanner or fetching a row fails
 */
int countRows() throws IOException {
  ResultScanner rs = tbl.getScanner(new Scan());
  try {
    int count = 0;
    while (rs.next() != null) {
      count++;
    }
    return count;
  } finally {
    // release the scanner; the original left it open on every call
    rs.close();
  }
}
/**
* delete table in preparation for next test
@ -214,14 +312,21 @@ public class TestHBaseFsck {
* @throws IOException
*/
void deleteTable(String tablename) throws IOException {
HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
HBaseAdmin admin = new HBaseAdmin(conf);
admin.getConnection().clearRegionCache();
byte[] tbytes = Bytes.toBytes(tablename);
admin.disableTable(tbytes);
admin.disableTableAsync(tbytes);
while (!admin.isTableDisabled(tbytes)) {
try {
Thread.sleep(250);
} catch (InterruptedException e) {
e.printStackTrace();
fail("Interrupted when trying to disable table " + tablename);
}
}
admin.deleteTable(tbytes);
}
/**
* This creates a clean table and confirms that the table is clean.
*/
@ -234,18 +339,21 @@ public class TestHBaseFsck {
assertNoErrors(hbck);
setupTable(table);
assertEquals(ROWKEYS.length, countRows());
// We created 1 table, should be fine
hbck = doFsck(conf, false);
assertNoErrors(hbck);
assertEquals(0, hbck.getOverlapGroups(table).size());
assertEquals(ROWKEYS.length, countRows());
} finally {
deleteTable(table);
}
}
/**
* This creates a bad table with regions that have a duplicate start key
* This creates and fixes a bad table with regions that have a duplicate
* start key
*/
@Test
public void testDupeStartKey() throws Exception {
@ -253,6 +361,7 @@ public class TestHBaseFsck {
try {
setupTable(table);
assertNoErrors(doFsck(conf, false));
assertEquals(ROWKEYS.length, countRows());
// Now let's mess it up, by adding a region with a duplicate startkey
HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
@ -265,13 +374,112 @@ public class TestHBaseFsck {
assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
ERROR_CODE.DUPE_STARTKEYS});
assertEquals(2, hbck.getOverlapGroups(table).size());
assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
// fix the degenerate region.
doFsck(conf,true);
// check that the degenerate region is gone and no data loss
HBaseFsck hbck2 = doFsck(conf,false);
assertNoErrors(hbck2);
assertEquals(0, hbck2.getOverlapGroups(table).size());
assertEquals(ROWKEYS.length, countRows());
} finally {
deleteTable(table);
}
}
/**
* This creates a bad table with regions that has startkey == endkey
* Get region info from local cluster.
*/
Map<ServerName, List<String>> getDeployedHRIs(HBaseAdmin admin)
    throws IOException {
  // Servers as reported by the master's cluster status.
  Collection<ServerName> servers = admin.getMaster().getClusterStatus().getServers();
  Map<ServerName, List<String>> deployed = new HashMap<ServerName, List<String>>();
  HConnection conn = admin.getConnection();

  for (ServerName sn : servers) {
    HRegionInterface regionServer =
        conn.getHRegionConnection(sn.getHostname(), sn.getPort());

    // Collect the names of all regions this region server reports as online.
    List<String> onlineNames = new ArrayList<String>();
    for (HRegionInfo online : regionServer.getOnlineRegions()) {
      onlineNames.add(online.getRegionNameAsString());
    }
    deployed.put(sn, onlineNames);
  }
  return deployed;
}
/**
 * Looks up which server a region is deployed on.
 *
 * @param mm map from server to the names of the regions online on it, as
 *          built by {@code getDeployedHRIs}
 * @param hri region to look for, matched by its region name string
 * @return the first server in iteration order whose list contains the region
 *         name, or null if no server lists it
 */
ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
  for (Map.Entry<ServerName, List<String>> deployment : mm.entrySet()) {
    List<String> onlineNames = deployment.getValue();
    if (!onlineNames.contains(hri.getRegionNameAsString())) {
      continue;
    }
    return deployment.getKey();
  }
  return null;
}
/**
* This creates and fixes a bad table that has a duplicate region: a second
* region is added to META with the same start key ("A") and end key ("B") as
* one of the regions created by setupTable, and is then assigned.
*/
@Test
public void testDupeRegion() throws Exception {
String table = "tableDupeRegion";
try {
setupTable(table);
assertNoErrors(doFsck(conf, false));
assertEquals(ROWKEYS.length, countRows());
// Now let's mess it up, by adding a region with a duplicate start key and
// end key
HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
Bytes.toBytes("A"), Bytes.toBytes("B"));
TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
.waitForAssignment(hriDupe);
// Yikes! The assignment manager can't tell the difference between two
// different regions with the same start/endkeys since it doesn't
// differentiate on ts/regionId! We actually need to recheck
// deployments!
HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
ServerName hsi;
// Poll the region servers until one of them reports the dupe region online.
while ( (hsi = findDeployedHSI(getDeployedHRIs(admin), hriDupe)) == null) {
Thread.sleep(250);
}
LOG.debug("Finished assignment of dupe region");
// TODO why is dupe region different from dupe start keys?
HBaseFsck hbck = doFsck(conf, false);
assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
ERROR_CODE.DUPE_STARTKEYS});
assertEquals(2, hbck.getOverlapGroups(table).size());
assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
// fix the duplicate region.
doFsck(conf,true);
// check that the duplicate region is gone and no data loss
HBaseFsck hbck2 = doFsck(conf,false);
assertNoErrors(hbck2);
assertEquals(0, hbck2.getOverlapGroups(table).size());
assertEquals(ROWKEYS.length, countRows());
} finally {
deleteTable(table);
}
}
/**
* This creates and fixes a bad table with regions that has startkey == endkey
*/
@Test
public void testDegenerateRegions() throws Exception {
@ -279,6 +487,7 @@ public class TestHBaseFsck {
try {
setupTable(table);
assertNoErrors(doFsck(conf,false));
assertEquals(ROWKEYS.length, countRows());
// Now let's mess it up, by adding a region with a duplicate startkey
HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
@ -291,19 +500,111 @@ public class TestHBaseFsck {
assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION,
ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.DUPE_STARTKEYS});
assertEquals(2, hbck.getOverlapGroups(table).size());
assertEquals(ROWKEYS.length, countRows());
// fix the degenerate region.
doFsck(conf,true);
// check that the degenerate region is gone and no data loss
HBaseFsck hbck2 = doFsck(conf,false);
assertNoErrors(hbck2);
assertEquals(0, hbck2.getOverlapGroups(table).size());
assertEquals(ROWKEYS.length, countRows());
} finally {
deleteTable(table);
}
}
/**
* This creates a bad table where a start key contained in another region.
* This creates and fixes a bad table where a region is completely contained
* by another region.
*/
@Test
public void testContainedRegionOverlap() throws Exception {
String table = "tableContainedRegionOverlap";
try {
setupTable(table);
assertEquals(ROWKEYS.length, countRows());
// Mess it up by creating an overlap in the metadata: the new region
// ["A2", "B") lies inside the ["A", "B") range produced by the table's
// split keys.
HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
Bytes.toBytes("A2"), Bytes.toBytes("B"));
TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
.waitForAssignment(hriOverlap);
// Check only (second doFsck argument false): the overlap must be reported
// and no rows may be missing.
HBaseFsck hbck = doFsck(conf, false);
assertErrors(hbck, new ERROR_CODE[] {
ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
assertEquals(2, hbck.getOverlapGroups(table).size());
assertEquals(ROWKEYS.length, countRows());
// fix the problem.
doFsck(conf, true);
// verify that overlaps are fixed
HBaseFsck hbck2 = doFsck(conf,false);
assertNoErrors(hbck2);
assertEquals(0, hbck2.getOverlapGroups(table).size());
assertEquals(ROWKEYS.length, countRows());
} finally {
deleteTable(table);
}
}
/**
* This creates and fixes a bad table where a region is completely contained
* by another region, and there is a hole (sort of like a bad split).
*
* The ["A", "B") region is undeployed, removed from META and stripped of its
* .regioninfo file (its other HDFS data is kept), and a new region
* ["A2", "B") is then added to META and assigned.
*/
@Test
public void testOverlapAndOrphan() throws Exception {
String table = "tableOverlapAndOrphan";
try {
setupTable(table);
assertEquals(ROWKEYS.length, countRows());
// Mess it up: orphan the ["A", "B") region. Flags are unassign=true,
// metaRow=true, hdfs=false, regionInfoOnly=true.
TEST_UTIL.getHBaseAdmin().disableTable(table);
deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
Bytes.toBytes("B"), true, true, false, true);
TEST_UTIL.getHBaseAdmin().enableTable(table);
// Then add a region to META that only covers ["A2", "B") and assign it.
HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
Bytes.toBytes("A2"), Bytes.toBytes("B"));
TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
.waitForAssignment(hriOverlap);
HBaseFsck hbck = doFsck(conf, false);
assertErrors(hbck, new ERROR_CODE[] {
ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
ERROR_CODE.HOLE_IN_REGION_CHAIN});
// fix the problem.
doFsck(conf, true);
// verify that overlaps are fixed
HBaseFsck hbck2 = doFsck(conf,false);
assertNoErrors(hbck2);
assertEquals(0, hbck2.getOverlapGroups(table).size());
assertEquals(ROWKEYS.length, countRows());
} finally {
deleteTable(table);
}
}
/**
* This creates and fixes a bad table where a region overlaps two regions --
* a start key contained in another region and its end key is contained in
* yet another region.
*/
@Test
public void testCoveredStartKey() throws Exception {
String table = "tableCoveredStartKey";
try {
setupTable(table);
assertEquals(ROWKEYS.length, countRows());
// Mess it up by creating an overlap in the metadata
HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
@ -317,40 +618,239 @@ public class TestHBaseFsck {
ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
assertEquals(3, hbck.getOverlapGroups(table).size());
assertEquals(ROWKEYS.length, countRows());
// fix the problem.
doFsck(conf, true);
// verify that overlaps are fixed
HBaseFsck hbck2 = doFsck(conf, false);
assertErrors(hbck2, new ERROR_CODE[0]);
assertEquals(0, hbck2.getOverlapGroups(table).size());
assertEquals(ROWKEYS.length, countRows());
} finally {
deleteTable(table);
}
}
/**
 * This creates a bad table with a hole in meta.
 * This creates and fixes a bad table with a missing region -- hole in meta
 * and data missing in the fs.
 *
 * NOTE(review): two summaries and two method signatures (testMetaHole and
 * testRegionHole) appear back to back in this listing, as do two different
 * "mess it up" setups and two assertErrors calls. This looks like the
 * removed and added sides of a change rendered together -- confirm which
 * lines are current before relying on this block.
 */
@Test
public void testMetaHole() throws Exception {
String table = "tableMetaHole";
public void testRegionHole() throws Exception {
String table = "tableRegionHole";
try {
// Start from a table whose row count matches ROWKEYS.
setupTable(table);
assertEquals(ROWKEYS.length, countRows());
// Mess it up by leaving a hole in the meta data
HRegionInfo hriHole = createRegion(conf, tbl.getTableDescriptor(),
Bytes.toBytes("D"), Bytes.toBytes(""));
TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriHole);
TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
.waitForAssignment(hriHole);
// Mess it up by leaving a hole in the assignment, meta, and hdfs data
// The region deletions are bracketed by disabling and re-enabling the table.
TEST_UTIL.getHBaseAdmin().disableTable(table);
deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""));
deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
Bytes.toBytes("C"), true, true, true);
TEST_UTIL.getHBaseAdmin().enableTable(table);
// First hbck pass is called with fix == false.
HBaseFsck hbck = doFsck(conf, false);
assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.HOLE_IN_REGION_CHAIN });
assertErrors(hbck, new ERROR_CODE[] {
ERROR_CODE.HOLE_IN_REGION_CHAIN});
// holes are separate from overlap groups
assertEquals(0, hbck.getOverlapGroups(table).size());
// fix hole
doFsck(conf, true);
// check that hole fixed
assertNoErrors(doFsck(conf,false));
// NOTE(review): the trailing comment says one row is lost, but the expected
// count is ROWKEYS.length - 2 -- confirm how many rows the lost region held.
assertEquals(ROWKEYS.length - 2 , countRows()); // lost a region so lost a row
} finally {
// Always drop the table, whether or not the assertions passed.
deleteTable(table);
}
}
/**
 * Creates and then repairs a bad table with a missing region: there is a
 * hole in meta, and the region's data is still present in the fs but its
 * .regioninfo file is missing (an orphan hdfs region).
 */
@Test
public void testHDFSRegioninfoMissing() throws Exception {
  String tableName = "tableHDFSRegioininfoMissing";
  try {
    setupTable(tableName);
    assertEquals(ROWKEYS.length, countRows());

    // Break the region between keys "B" and "C" while the table is disabled.
    TEST_UTIL.getHBaseAdmin().disableTable(tableName);
    byte[] startKey = Bytes.toBytes("B");
    byte[] endKey = Bytes.toBytes("C");
    deleteRegion(conf, tbl.getTableDescriptor(), startKey, endKey,
        true, true, false, true);
    TEST_UTIL.getHBaseAdmin().enableTable(tableName);

    // A pass with fixing off should report exactly these problems.
    HBaseFsck report = doFsck(conf, false);
    ERROR_CODE[] expected = {
        ERROR_CODE.ORPHAN_HDFS_REGION,
        ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
        ERROR_CODE.HOLE_IN_REGION_CHAIN };
    assertErrors(report, expected);

    // Holes are tracked separately from overlap groups.
    int overlapGroups = report.getOverlapGroups(tableName).size();
    assertEquals(0, overlapGroups);

    // Repair, then verify a clean report and that every row is still there.
    doFsck(conf, true);
    HBaseFsck afterFix = doFsck(conf, false);
    assertNoErrors(afterFix);
    assertEquals(ROWKEYS.length, countRows());
  } finally {
    deleteTable(tableName);
  }
}
/**
 * Creates and then repairs a bad table containing a region that is missing
 * from meta and is not assigned to a region server.
 */
@Test
public void testNotInMetaOrDeployedHole() throws Exception {
  String tableName = "tableNotInMetaOrDeployedHole";
  try {
    setupTable(tableName);
    assertEquals(ROWKEYS.length, countRows());

    // Break the region between keys "B" and "C" while the table is disabled;
    // the last flag is false so the region is not removed from the fs.
    TEST_UTIL.getHBaseAdmin().disableTable(tableName);
    byte[] startKey = Bytes.toBytes("B");
    byte[] endKey = Bytes.toBytes("C");
    deleteRegion(conf, tbl.getTableDescriptor(), startKey, endKey,
        true, true, false);
    TEST_UTIL.getHBaseAdmin().enableTable(tableName);

    // A pass with fixing off should report exactly these problems.
    HBaseFsck report = doFsck(conf, false);
    ERROR_CODE[] expectedBeforeFix = {
        ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN };
    assertErrors(report, expectedBeforeFix);

    // Holes are tracked separately from overlap groups.
    int overlapGroups = report.getOverlapGroups(tableName).size();
    assertEquals(0, overlapGroups);

    // The fixing pass is expected to list the same problems it repairs.
    HBaseFsck fixRun = doFsck(conf, true);
    ERROR_CODE[] expectedDuringFix = {
        ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN };
    assertErrors(fixRun, expectedDuringFix);

    // Afterwards the report must be clean and every row still readable.
    HBaseFsck afterFix = doFsck(conf, false);
    assertNoErrors(afterFix);
    assertEquals(ROWKEYS.length, countRows());
  } finally {
    deleteTable(tableName);
  }
}
/**
 * Creates and then repairs a bad table that has a hole in meta.
 */
@Test
public void testNotInMetaHole() throws Exception {
  String tableName = "tableNotInMetaHole";
  try {
    setupTable(tableName);
    assertEquals(ROWKEYS.length, countRows());

    // Break the region between keys "B" and "C" while the table is disabled;
    // the last flag is false so the region is not removed from the fs.
    TEST_UTIL.getHBaseAdmin().disableTable(tableName);
    byte[] startKey = Bytes.toBytes("B");
    byte[] endKey = Bytes.toBytes("C");
    deleteRegion(conf, tbl.getTableDescriptor(), startKey, endKey,
        false, true, false);
    TEST_UTIL.getHBaseAdmin().enableTable(tableName);

    // A pass with fixing off should report exactly these problems.
    HBaseFsck report = doFsck(conf, false);
    ERROR_CODE[] expectedBeforeFix = {
        ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN };
    assertErrors(report, expectedBeforeFix);

    // Holes are tracked separately from overlap groups.
    int overlapGroups = report.getOverlapGroups(tableName).size();
    assertEquals(0, overlapGroups);

    // The fixing pass is expected to list the same problems it repairs.
    HBaseFsck fixRun = doFsck(conf, true);
    ERROR_CODE[] expectedDuringFix = {
        ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN };
    assertErrors(fixRun, expectedDuringFix);

    // Afterwards the report must be clean and every row still readable.
    HBaseFsck afterFix = doFsck(conf, false);
    assertNoErrors(afterFix);
    assertEquals(ROWKEYS.length, countRows());
  } finally {
    deleteTable(tableName);
  }
}
/**
 * Creates and then repairs a bad table containing a region that is listed in
 * meta but has no deployment or data in hdfs.
 */
@Test
public void testNotInHdfs() throws Exception {
  String tableName = "tableNotInHdfs";
  try {
    setupTable(tableName);
    assertEquals(ROWKEYS.length, countRows());

    // make sure data in regions, if in hlog only there is no data loss
    TEST_UTIL.getHBaseAdmin().flush(tableName);

    // Punch a hole in the hdfs data for the region between "B" and "C";
    // the meta row is left in place.
    byte[] startKey = Bytes.toBytes("B");
    byte[] endKey = Bytes.toBytes("C");
    deleteRegion(conf, tbl.getTableDescriptor(), startKey, endKey,
        false, false, true);

    // A pass with fixing off should report only the missing hdfs data.
    HBaseFsck report = doFsck(conf, false);
    ERROR_CODE[] expected = { ERROR_CODE.NOT_IN_HDFS };
    assertErrors(report, expected);

    // Holes are tracked separately from overlap groups.
    int overlapGroups = report.getOverlapGroups(tableName).size();
    assertEquals(0, overlapGroups);

    // Repair, then verify a clean report; two rows are expected to be gone.
    doFsck(conf, true);
    HBaseFsck afterFix = doFsck(conf, false);
    assertNoErrors(afterFix);
    assertEquals(ROWKEYS.length - 2, countRows());
  } finally {
    deleteTable(tableName);
  }
}
/**
 * Leaves the table's entries in META but with no hdfs data behind them.
 * Repairing this should cleanly remove the table.
 */
@Test
public void testNoHdfsTable() throws Exception {
  String tableName = "NoHdfsTable";
  setupTable(tableName);
  assertEquals(ROWKEYS.length, countRows());

  // make sure data in regions, if in hlog only there is no data loss
  TEST_UTIL.getHBaseAdmin().flush(tableName);

  // Remove the hdfs data of each region in turn -- ""..A, A..B, B..C, C..""
  // -- while leaving the meta rows alone (don't rm meta).
  String[] boundaries = { "", "A", "B", "C", "" };
  for (int i = 0; i + 1 < boundaries.length; i++) {
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(boundaries[i]),
        Bytes.toBytes(boundaries[i + 1]), false, false, true);
  }

  // A pass with fixing off should report one NOT_IN_HDFS per region.
  HBaseFsck report = doFsck(conf, false);
  ERROR_CODE[] expected = {
      ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
      ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS };
  assertErrors(report, expected);

  // Holes are tracked separately from overlap groups.
  int overlapGroups = report.getOverlapGroups(tableName).size();
  assertEquals(0, overlapGroups);

  // fix hole
  doFsck(conf, true); // in 0.92+, meta entries auto create regiondirs

  // check that hole fixed
  HBaseFsck afterFix = doFsck(conf, false);
  assertNoErrors(afterFix);

  boolean tableAlreadyGone = false;
  try {
    assertEquals(0, countRows());
  } catch (IOException expectedFailure) {
    // we've actually deleted the table already. :)
    tableAlreadyGone = true;
  }
  if (!tableAlreadyGone) {
    fail("Should have failed with IOException");
  }
}
// JUnit rule field, written with fully qualified names so no extra imports
// are needed. ResourceCheckerJUnitRule presumably checks resource usage
// around each test in this class -- TODO confirm against its implementation.
@org.junit.Rule
public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =
new org.apache.hadoop.hbase.ResourceCheckerJUnitRule();

View File

@ -96,14 +96,6 @@ public class TestHBaseFsckComparator {
assertTrue(HBaseFsck.cmp.compare(hi2, hi1) > 0);
}
/**
 * Two entries built for the same table and the same key range (keyA..keyC)
 * that differ only in the last genHbckInfo argument (0 versus 1) must be
 * ordered by HBaseFsck.cmp with the 0 entry first, in both directions.
 */
@Test
public void testTiebreaker() {
  HbckInfo first = genHbckInfo(table, keyA, keyC, 0);
  HbckInfo second = genHbckInfo(table, keyA, keyC, 1);

  int forward = HBaseFsck.cmp.compare(first, second);
  assertTrue(forward < 0);

  int backward = HBaseFsck.cmp.compare(second, first);
  assertTrue(backward > 0);
}
// JUnit rule field, written with fully qualified names so no extra imports
// are needed. ResourceCheckerJUnitRule presumably checks resource usage
// around each test in this class -- TODO confirm against its implementation.
@org.junit.Rule
public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =
new org.apache.hadoop.hbase.ResourceCheckerJUnitRule();

View File

@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.util.hbck;
import static org.junit.Assert.assertEquals;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -28,18 +29,29 @@ import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
public class HbckTestingUtil {
/**
 * Convenience overload that runs hbck with every fix option set to the same
 * value.
 *
 * @param conf configuration handed to the six-argument overload
 * @param fix value used for all five fix flags (assignments, meta, hdfs
 *          holes, hdfs overlaps, hdfs orphans)
 * @return the HBaseFsck instance returned by the six-argument overload
 * @throws Exception whatever the six-argument overload throws
 */
public static HBaseFsck doFsck(Configuration conf, boolean fix) throws Exception {
  return doFsck(
      conf,
      fix,  // fixAssignments
      fix,  // fixMeta
      fix,  // fixHdfsHoles
      fix,  // fixHdfsOverlaps
      fix); // fixHdfsOrphans
}
/**
 * Builds an HBaseFsck for {@code conf}, connects it, configures it, runs it
 * and returns it to the caller.
 *
 * NOTE(review): this listing shows displayFullReport() next to
 * setDisplayFullReport(), and setFixErrors(fix)/doWork() next to the
 * per-option setters and onlineHbck(); {@code fix} is not a parameter of
 * this overload. This looks like the removed and added sides of a change
 * rendered together -- confirm which calls are current.
 *
 * @param conf configuration passed to the HBaseFsck constructor
 * @param fixAssignments value passed to setFixAssignments
 * @param fixMeta value passed to setFixMeta
 * @param fixHdfsHoles value passed to setFixHdfsHoles
 * @param fixHdfsOverlaps value passed to setFixHdfsOverlaps
 * @param fixHdfsOrphans value passed to setFixHdfsOrphans
 * @return the HBaseFsck instance after it has been run
 */
public static HBaseFsck doFsck(Configuration conf, boolean fixAssignments,
boolean fixMeta, boolean fixHdfsHoles, boolean fixHdfsOverlaps,
boolean fixHdfsOrphans) throws Exception {
HBaseFsck fsck = new HBaseFsck(conf);
fsck.connect();
fsck.displayFullReport(); // i.e. -details
fsck.setDisplayFullReport(); // i.e. -details
fsck.setTimeLag(0);
fsck.setFixErrors(fix);
fsck.doWork();
// One setter per repair category, each fed from the matching parameter.
fsck.setFixAssignments(fixAssignments);
fsck.setFixMeta(fixMeta);
fsck.setFixHdfsHoles(fixHdfsHoles);
fsck.setFixHdfsOverlaps(fixHdfsOverlaps);
fsck.setFixHdfsOrphans(fixHdfsOrphans);
fsck.onlineHbck();
return fsck;
}
/**
 * Asserts that the given hbck run recorded no errors at all.
 *
 * @param fsck the HBaseFsck instance whose error list is checked
 * @throws Exception declared for callers; propagated from reading the errors
 */
public static void assertNoErrors(HBaseFsck fsck) throws Exception {
  List<ERROR_CODE> reported = fsck.getErrors().getErrorList();

  // Size check first, then a full comparison against an empty list so a
  // failure message shows which error codes were reported.
  assertEquals(0, reported.size());
  List<ERROR_CODE> none = new ArrayList<ERROR_CODE>();
  assertEquals(none, reported);
}
public static void assertErrors(HBaseFsck fsck, ERROR_CODE[] expectedErrors) {

View File

@ -61,7 +61,7 @@ public class TestOfflineMetaRebuildBase extends OfflineMetaRebuildTestCore {
// rebuild meta table from scratch
HBaseFsck fsck = new HBaseFsck(conf);
assertTrue(fsck.rebuildMeta());
assertTrue(fsck.rebuildMeta(false));
// bring up the minicluster
TEST_UTIL.startMiniZKCluster(); // tables seem enabled by default

View File

@ -21,6 +21,7 @@ import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.util.Arrays;
@ -64,7 +65,7 @@ public class TestOfflineMetaRebuildHole extends OfflineMetaRebuildTestCore {
// attempt to rebuild meta table from scratch
HBaseFsck fsck = new HBaseFsck(conf);
assertFalse(fsck.rebuildMeta());
assertFalse(fsck.rebuildMeta(false));
// bring up the minicluster
TEST_UTIL.startMiniZKCluster(); // tables seem enabled by default

View File

@ -69,7 +69,7 @@ public class TestOfflineMetaRebuildOverlap extends OfflineMetaRebuildTestCore {
// attempt to rebuild meta table from scratch
HBaseFsck fsck = new HBaseFsck(conf);
assertFalse(fsck.rebuildMeta());
assertFalse(fsck.rebuildMeta(false));
Multimap<byte[], HbckInfo> problems = fsck.getOverlapGroups(table);
assertEquals(1, problems.keySet().size());