HBASE-6392 UnknownRegionException blocks hbck from sidelining big overlap regions

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1363190 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
jxiang 2012-07-19 00:23:56 +00:00
parent 7743260acc
commit 0da654d45f
2 changed files with 119 additions and 16 deletions

View File

@ -1288,6 +1288,7 @@ public class HBaseFsck {
* the offline ipc call exposed on the master (<0.90.5, <0.92.0) a master
* restart or failover may be required.
*/
@SuppressWarnings("deprecation")
private void closeRegion(HbckInfo hi) throws IOException, InterruptedException {
if (hi.metaEntry == null && hi.hdfsEntry == null) {
undeployRegions(hi);
@ -1901,22 +1902,22 @@ public class HBaseFsck {
LOG.debug("Contained region dir before close");
debugLsr(hi.getHdfsRegionDir());
try {
LOG.info("Closing region: " + hi);
closeRegion(hi);
} catch (IOException ioe) {
// TODO exercise this
LOG.warn("Was unable to close region " + hi.getRegionNameAsString()
+ ". Just continuing... ");
LOG.warn("Was unable to close region " + hi
+ ". Just continuing... ", ioe);
} catch (InterruptedException e) {
// TODO exercise this
LOG.warn("Was unable to close region " + hi.getRegionNameAsString()
+ ". Just continuing... ");
LOG.warn("Was unable to close region " + hi
+ ". Just continuing... ", e);
}
try {
LOG.info("Offlining region: " + hi);
offline(hi.getRegionName());
} catch (IOException ioe) {
LOG.warn("Unable to offline region from master: " + hi, ioe);
LOG.warn("Unable to offline region from master: " + hi
+ ". Just continuing... ", ioe);
}
}
@ -1965,14 +1966,21 @@ public class HBaseFsck {
try {
LOG.info("Closing region: " + regionToSideline);
closeRegion(regionToSideline);
} catch (InterruptedException ie) {
LOG.warn("Was unable to close region " + regionToSideline.getRegionNameAsString()
+ ". Interrupted.");
throw new IOException(ie);
} catch (IOException ioe) {
LOG.warn("Was unable to close region " + regionToSideline
+ ". Just continuing... ", ioe);
} catch (InterruptedException e) {
LOG.warn("Was unable to close region " + regionToSideline
+ ". Just continuing... ", e);
}
LOG.info("Offlining region: " + regionToSideline);
offline(regionToSideline.getRegionName());
try {
LOG.info("Offlining region: " + regionToSideline);
offline(regionToSideline.getRegionName());
} catch (IOException ioe) {
LOG.warn("Unable to offline region from master: " + regionToSideline
+ ". Just continuing... ", ioe);
}
LOG.info("Before sideline big overlapped region: " + regionToSideline.toString());
Path sidelineRegionDir = sidelineRegionDir(fs, TO_BE_LOADED, regionToSideline);

View File

@ -24,6 +24,7 @@ import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@ -62,9 +63,11 @@ import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.executor.EventHandler.EventType;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
@ -74,6 +77,8 @@ import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import com.google.common.collect.Multimap;
/**
* This tests HBaseFsck's ability to detect reasons for inconsistent tables.
*/
@ -483,8 +488,7 @@ public class TestHBaseFsck {
// differentiate on ts/regionId! We actually need to recheck
// deployments!
HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
ServerName hsi;
while ( (hsi = findDeployedHSI(getDeployedHRIs(admin), hriDupe)) == null) {
while (findDeployedHSI(getDeployedHRIs(admin), hriDupe) == null) {
Thread.sleep(250);
}
@ -547,7 +551,6 @@ public class TestHBaseFsck {
}
}
/**
* This creates and fixes a bad table where a region is completely contained
* by another region.
@ -585,6 +588,98 @@ public class TestHBaseFsck {
}
}
/**
 * This creates and fixes a bad table that has an overlap group of
 * 3 regions. HBaseFsck.maxMerge is set to 2 to trigger sidelining of an
 * overlapped region. The meta data is messed up first so that
 * closeRegion/offlineRegion throws exceptions; the test then verifies that
 * hbck still repairs the table and that fewer rows remain because one
 * region was sidelined.
 *
 * @throws Exception if table setup, the cluster manipulation or hbck fails
 */
@Test
public void testSidelineOverlapRegion() throws Exception {
  String table = "testSidelineOverlapRegion";
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());

    // Mess it up by creating an overlap
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    HMaster master = cluster.getMaster();
    HRegionInfo hriOverlap1 = createRegion(conf, tbl.getTableDescriptor(),
      Bytes.toBytes("A"), Bytes.toBytes("AB"));
    master.assignRegion(hriOverlap1);
    master.getAssignmentManager().waitForAssignment(hriOverlap1);
    HRegionInfo hriOverlap2 = createRegion(conf, tbl.getTableDescriptor(),
      Bytes.toBytes("AB"), Bytes.toBytes("B"));
    master.assignRegion(hriOverlap2);
    master.getAssignmentManager().waitForAssignment(hriOverlap2);

    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
      ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
    assertEquals(3, hbck.getOverlapGroups(table).size());
    assertEquals(ROWKEYS.length, countRows());

    // mess around the overlapped regions, to trigger NotServingRegionException
    Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
    ServerName serverName = null;
    byte[] regionName = null;
    for (HbckInfo hbi: overlapGroups.values()) {
      if ("A".equals(Bytes.toString(hbi.getStartKey()))
          && "B".equals(Bytes.toString(hbi.getEndKey()))) {
        regionName = hbi.getRegionName();

        // get an RS not serving the region to force bad assignment info in to META.
        int k = cluster.getServerWith(regionName);
        // Fail with a readable message instead of an index error further
        // down if no region server index was found for the region.
        assertTrue("Region " + hbi + " is not deployed on any region server", k >= 0);
        // NOTE(review): the loop bound assumes the mini cluster was started
        // with 3 region servers -- confirm against the test class setup.
        for (int i = 0; i < 3; i++) {
          if (i != k) {
            HRegionServer rs = cluster.getRegionServer(i);
            serverName = rs.getServerName();
            break;
          }
        }

        // Close and offline the region on the server that really hosts it.
        HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
        HBaseFsckRepair.closeRegionSilentlyAndWait(admin,
          cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
        admin.offline(regionName);
        break;
      }
    }

    assertNotNull(regionName);
    assertNotNull(serverName);

    // Point the region's server column in META at the server picked above,
    // which is not the one that was hosting the region.
    HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
    try {
      Put put = new Put(regionName);
      put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
        Bytes.toBytes(serverName.getHostAndPort()));
      meta.put(put);
    } finally {
      // Release the table handle even if the put fails.
      meta.close();
    }

    // fix the problem.
    HBaseFsck fsck = new HBaseFsck(conf);
    fsck.connect();
    fsck.setDisplayFullReport(); // i.e. -details
    fsck.setTimeLag(0);
    fsck.setFixAssignments(true);
    fsck.setFixMeta(true);
    fsck.setFixHdfsHoles(true);
    fsck.setFixHdfsOverlaps(true);
    fsck.setFixHdfsOrphans(true);
    fsck.setFixVersionFile(true);
    fsck.setSidelineBigOverlaps(true);
    fsck.setMaxMerge(2);
    fsck.onlineHbck();

    // verify that overlaps are fixed, and there are less rows
    // since one region is sidelined.
    HBaseFsck hbck2 = doFsck(conf,false);
    assertNoErrors(hbck2);
    assertEquals(0, hbck2.getOverlapGroups(table).size());
    assertTrue(ROWKEYS.length > countRows());
  } finally {
    deleteTable(table);
  }
}
/**
* This creates and fixes a bad table where a region is completely contained
* by another region, and there is a hole (sort of like a bad split)