HBASE-3695 Some improvements to Hbck to test the entire region chain in Meta and provide better error reporting

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1099566 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2011-05-04 19:28:45 +00:00
parent 4e4ded4cde
commit 3d6b1485bf
4 changed files with 340 additions and 120 deletions

View File

@ -285,6 +285,8 @@ Release 0.90.3 - Unreleased
HBASE-3580 Remove RS from DeadServer when new instance checks in HBASE-3580 Remove RS from DeadServer when new instance checks in
HBASE-2470 Add Scan.setTimeRange() support in Shell (Harsh J Chouraria) HBASE-2470 Add Scan.setTimeRange() support in Shell (Harsh J Chouraria)
HBASE-3805 Log RegionState that are processed too late in the master HBASE-3805 Log RegionState that are processed too late in the master
HBASE-3695 Some improvements to Hbck to test the entire region chain in
Meta and provide better error reporting (Marc Limotte)
TASKS TASKS
HBASE-3748 Add rolling of thrift/rest daemons to graceful_stop.sh script HBASE-3748 Add rolling of thrift/rest daemons to graceful_stop.sh script

View File

@ -20,14 +20,7 @@
package org.apache.hadoop.hbase.util; package org.apache.hadoop.hbase.util;
import java.io.IOException; import java.io.IOException;
import java.util.Collection; import java.util.*;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@ -69,6 +62,8 @@ import org.apache.zookeeper.KeeperException;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import static org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
/** /**
* Check consistency among the in-memory states of the master and the * Check consistency among the in-memory states of the master and the
* region server(s) and the state of data in HDFS. * region server(s) and the state of data in HDFS.
@ -140,20 +135,21 @@ public class HBaseFsck {
tablesInfo.clear(); tablesInfo.clear();
emptyRegionInfoQualifiers.clear(); emptyRegionInfoQualifiers.clear();
disabledTables.clear(); disabledTables.clear();
errors.clear();
// get a list of all regions from the master. This involves // get a list of all regions from the master. This involves
// scanning the META table // scanning the META table
if (!recordRootRegion()) { if (!recordRootRegion()) {
// Will remove later if we can fix it // Will remove later if we can fix it
errors.reportError("Encountered fatal error. Exitting..."); errors.reportError("Encountered fatal error. Exiting...");
return -1; return -1;
} }
getMetaEntries(); getMetaEntries();
// Check if .META. is found only once and on the right place // Check if .META. is found only once and in the right place
if (!checkMetaEntries()) { if (!checkMetaEntries()) {
// Will remove later if we can fix it // Will remove later if we can fix it
errors.reportError("Encountered fatal error. Exitting..."); errors.reportError("Encountered fatal error. Exiting...");
return -1; return -1;
} }
@ -225,6 +221,10 @@ public class HBaseFsck {
return errors.summarize(); return errors.summarize();
} }
/**
 * Gives access to the reporter this checker sends its findings to.
 *
 * @return the {@code ErrorReporter} held by this instance
 */
public ErrorReporter getErrors() {
  return this.errors;
}
/** /**
* Load the list of disabled tables in ZK into local set. * Load the list of disabled tables in ZK into local set.
* @throws ZooKeeperConnectionException * @throws ZooKeeperConnectionException
@ -281,7 +281,8 @@ public class HBaseFsck {
// verify that version file exists // verify that version file exists
if (!foundVersionFile) { if (!foundVersionFile) {
errors.reportError("Version file does not exist in root dir " + rootDir); errors.reportError(ERROR_CODE.NO_VERSION_FILE,
"Version file does not exist in root dir " + rootDir);
} }
// level 1: <HBASE_DIR>/* // level 1: <HBASE_DIR>/*
@ -315,7 +316,8 @@ public class HBaseFsck {
// Check if Root region is valid and existing // Check if Root region is valid and existing
if (rootLocation == null || rootLocation.getRegionInfo() == null || if (rootLocation == null || rootLocation.getRegionInfo() == null ||
rootLocation.getHostname() == null) { rootLocation.getHostname() == null) {
errors.reportError("Root Region or some of its attributes is null."); errors.reportError(ERROR_CODE.NULL_ROOT_REGION,
"Root Region or some of its attributes are null.");
return false; return false;
} }
ServerName sn; ServerName sn;
@ -428,7 +430,7 @@ public class HBaseFsck {
LOG.debug("Region " + descriptiveName + " offline, ignoring."); LOG.debug("Region " + descriptiveName + " offline, ignoring.");
return; return;
} else if (recentlyModified) { } else if (recentlyModified) {
LOG.info("Region " + descriptiveName + " was recently modified -- skipping"); LOG.warn("Region " + descriptiveName + " was recently modified -- skipping");
return; return;
} }
// ========== Cases where the region is not in META ============= // ========== Cases where the region is not in META =============
@ -436,24 +438,29 @@ public class HBaseFsck {
// We shouldn't have record of this region at all then! // We shouldn't have record of this region at all then!
assert false : "Entry for region with no data"; assert false : "Entry for region with no data";
} else if (!inMeta && !inHdfs && isDeployed) { } else if (!inMeta && !inHdfs && isDeployed) {
errors.reportError("Region " + descriptiveName + ", key=" + key + ", not on HDFS or in META but " + errors.reportError(ERROR_CODE.NOT_IN_META_HDFS, "Region "
+ descriptiveName + ", key=" + key + ", not on HDFS or in META but " +
"deployed on " + Joiner.on(", ").join(hbi.deployedOn)); "deployed on " + Joiner.on(", ").join(hbi.deployedOn));
} else if (!inMeta && inHdfs && !isDeployed) { } else if (!inMeta && inHdfs && !isDeployed) {
errors.reportError("Region " + descriptiveName + " on HDFS, but not listed in META " + errors.reportError(ERROR_CODE.NOT_IN_META_OR_DEPLOYED, "Region "
+ descriptiveName + " on HDFS, but not listed in META " +
"or deployed on any region server."); "or deployed on any region server.");
} else if (!inMeta && inHdfs && isDeployed) { } else if (!inMeta && inHdfs && isDeployed) {
errors.reportError("Region " + descriptiveName + " not in META, but deployed on " + errors.reportError(ERROR_CODE.NOT_IN_META, "Region " + descriptiveName
Joiner.on(", ").join(hbi.deployedOn)); + " not in META, but deployed on " + Joiner.on(", ").join(hbi.deployedOn));
// ========== Cases where the region is in META ============= // ========== Cases where the region is in META =============
} else if (inMeta && !inHdfs && !isDeployed) { } else if (inMeta && !inHdfs && !isDeployed) {
errors.reportError("Region " + descriptiveName + " found in META, but not in HDFS " + errors.reportError(ERROR_CODE.NOT_IN_HDFS_OR_DEPLOYED, "Region "
"or deployed on any region server."); + descriptiveName + " found in META, but not in HDFS "
+ "or deployed on any region server.");
} else if (inMeta && !inHdfs && isDeployed) { } else if (inMeta && !inHdfs && isDeployed) {
errors.reportError("Region " + descriptiveName + " found in META, but not in HDFS, " + errors.reportError(ERROR_CODE.NOT_IN_HDFS, "Region " + descriptiveName
+ " found in META, but not in HDFS, " +
"and deployed on " + Joiner.on(", ").join(hbi.deployedOn)); "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
} else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) { } else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) {
errors.reportError("Region " + descriptiveName + " not deployed on any region server."); errors.reportError(ERROR_CODE.NOT_DEPLOYED, "Region " + descriptiveName
+ " not deployed on any region server.");
// If we are trying to fix the errors // If we are trying to fix the errors
if (shouldFix()) { if (shouldFix()) {
errors.print("Trying to fix unassigned region..."); errors.print("Trying to fix unassigned region...");
@ -461,11 +468,13 @@ public class HBaseFsck {
HBaseFsckRepair.fixUnassigned(this.conf, hbi.metaEntry); HBaseFsckRepair.fixUnassigned(this.conf, hbi.metaEntry);
} }
} else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) { } else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
errors.reportError("Region " + descriptiveName + " should not be deployed according " + errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED, "Region "
+ descriptiveName + " should not be deployed according " +
"to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn)); "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
} else if (inMeta && inHdfs && isMultiplyDeployed) { } else if (inMeta && inHdfs && isMultiplyDeployed) {
errors.reportError("Region " + descriptiveName + " is listed in META on region server " + errors.reportError(ERROR_CODE.MULTI_DEPLOYED, "Region " + descriptiveName
hbi.metaEntry.regionServer + " but is multiply assigned to region servers " + + " is listed in META on region server " + hbi.metaEntry.regionServer
+ " but is multiply assigned to region servers " +
Joiner.on(", ").join(hbi.deployedOn)); Joiner.on(", ").join(hbi.deployedOn));
// If we are trying to fix the errors // If we are trying to fix the errors
if (shouldFix()) { if (shouldFix()) {
@ -474,7 +483,8 @@ public class HBaseFsck {
HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn); HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn);
} }
} else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) { } else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
errors.reportError("Region " + descriptiveName + " listed in META on region server " + errors.reportError(ERROR_CODE.SERVER_DOES_NOT_MATCH_META, "Region "
+ descriptiveName + " listed in META on region server " +
hbi.metaEntry.regionServer + " but found on region server " + hbi.metaEntry.regionServer + " but found on region server " +
hbi.deployedOn.get(0)); hbi.deployedOn.get(0));
// If we are trying to fix the errors // If we are trying to fix the errors
@ -484,7 +494,8 @@ public class HBaseFsck {
HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn); HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn);
} }
} else { } else {
errors.reportError("Region " + descriptiveName + " is in an unforeseen state:" + errors.reportError(ERROR_CODE.UNKNOWN, "Region " + descriptiveName +
" is in an unforeseen state:" +
" inMeta=" + inMeta + " inMeta=" + inMeta +
" inHdfs=" + inHdfs + " inHdfs=" + inHdfs +
" isDeployed=" + isDeployed + " isDeployed=" + isDeployed +
@ -504,10 +515,15 @@ public class HBaseFsck {
// Check only valid, working regions // Check only valid, working regions
if (hbi.metaEntry == null) continue; if (hbi.metaEntry == null) continue;
if (hbi.metaEntry.regionServer == null) continue; if (hbi.metaEntry.regionServer == null) continue;
if (hbi.foundRegionDir == null) continue;
if (hbi.deployedOn.size() != 1) continue;
if (hbi.onlyEdits) continue; if (hbi.onlyEdits) continue;
// Missing regionDir or over-deployment is checked elsewhere. Include
// these cases in modTInfo, so we can evaluate those regions as part of
// the region chain in META
//if (hbi.foundRegionDir == null) continue;
//if (hbi.deployedOn.size() != 1) continue;
if (hbi.deployedOn.size() == 0) continue;
// We should be safe here // We should be safe here
String tableName = hbi.metaEntry.getTableDesc().getNameAsString(); String tableName = hbi.metaEntry.getTableDesc().getNameAsString();
TInfo modTInfo = tablesInfo.get(tableName); TInfo modTInfo = tablesInfo.get(tableName);
@ -517,13 +533,16 @@ public class HBaseFsck {
for (ServerName server : hbi.deployedOn) { for (ServerName server : hbi.deployedOn) {
modTInfo.addServer(server); modTInfo.addServer(server);
} }
modTInfo.addEdge(hbi.metaEntry.getStartKey(), hbi.metaEntry.getEndKey());
//modTInfo.addEdge(hbi.metaEntry.getStartKey(), hbi.metaEntry.getEndKey());
modTInfo.addRegionInfo(hbi);
tablesInfo.put(tableName, modTInfo); tablesInfo.put(tableName, modTInfo);
} }
for (TInfo tInfo : tablesInfo.values()) { for (TInfo tInfo : tablesInfo.values()) {
if (!tInfo.check()) { if (!tInfo.checkRegionChain()) {
errors.reportError("Found inconsistency in table " + tInfo.getName()); errors.report("Found inconsistency in table " + tInfo.getName());
} }
} }
} }
@ -533,17 +552,16 @@ public class HBaseFsck {
*/ */
private class TInfo { private class TInfo {
String tableName; String tableName;
TreeMap <byte[], byte[]> edges;
TreeSet <ServerName> deployedOn; TreeSet <ServerName> deployedOn;
List<HbckInfo> regions = new ArrayList<HbckInfo>();
TInfo(String name) { TInfo(String name) {
this.tableName = name; this.tableName = name;
edges = new TreeMap <byte[], byte[]> (Bytes.BYTES_COMPARATOR);
deployedOn = new TreeSet <ServerName>(); deployedOn = new TreeSet <ServerName>();
} }
public void addEdge(byte[] fromNode, byte[] toNode) { public void addRegionInfo (HbckInfo r) {
this.edges.put(fromNode, toNode); regions.add(r);
} }
public void addServer(ServerName server) { public void addServer(ServerName server) {
@ -555,46 +573,77 @@ public class HBaseFsck {
} }
public int getNumRegions() { public int getNumRegions() {
return edges.size(); return regions.size();
} }
public boolean check() { /**
byte[] last = new byte[0]; * Check the region chain (from META) of this table. We are looking for
byte[] next = new byte[0]; * holes, overlaps, and cycles.
TreeSet <byte[]> visited = new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR); * @return false if there are errors
// Each table should start with a zero-length byte[] and end at a */
// zero-length byte[]. Just follow the edges to see if this is true public boolean checkRegionChain() {
while (true) { Collections.sort(regions);
// Check if chain is broken HbckInfo last = null;
if (!edges.containsKey(last)) {
errors.detail("Chain of regions in table " + tableName + for (HbckInfo r : regions) {
" is broken; edges does not contain " + Bytes.toString(last)); if (last == null) {
return false; // This is the first region, check that the start key is empty
if (! Bytes.equals(r.metaEntry.getStartKey(), HConstants.EMPTY_BYTE_ARRAY)) {
errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
"First region should start with an empty key.",
this, r);
} }
next = edges.get(last); } else {
// Found a cycle
if (visited.contains(next)) { // Check if endKey < startKey
errors.detail("Chain of regions in table " + tableName + // Previous implementation of this code checked for a cycle in the
" has a cycle around " + Bytes.toString(next)); // region chain. A cycle would imply that the endKey comes before
return false; // the startKey (i.e. endKey < startKey).
if (! Bytes.equals(r.metaEntry.getEndKey(), HConstants.EMPTY_BYTE_ARRAY)) {
// continue with this check if this is not the last region
int cmpRegionKeys = Bytes.compareTo(r.metaEntry.getStartKey(),
r.metaEntry.getEndKey());
if (cmpRegionKeys > 0) {
errors.reportError(ERROR_CODE.REGION_CYCLE,
String.format("The endkey for this region comes before the "
+ "startkey, startkey=%s, endkey=%s",
Bytes.toString(r.metaEntry.getStartKey()),
Bytes.toString(r.metaEntry.getEndKey())),
this, r, last);
} }
// Mark next node as visited
visited.add(next);
// If next is zero-length byte[] we are possibly at the end of the chain
if (next.length == 0) {
// If we have visited all elements we are fine
if (edges.size() != visited.size()) {
errors.detail("Chain of regions in table " + tableName +
" contains less elements than are listed in META; visited=" + visited.size() +
", edges=" + edges.size());
return false;
} }
return true;
// Check if the startkeys are different
if (Bytes.equals(r.metaEntry.getStartKey(), last.metaEntry.getStartKey())) {
errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
"Two regions have the same startkey: "
+ Bytes.toString(r.metaEntry.getStartKey()),
this, r, last);
} else {
// Check that the startkey is the same as the previous end key
int cmp = Bytes.compareTo(r.metaEntry.getStartKey(),
last.metaEntry.getEndKey());
if (cmp > 0) {
// hole
errors.reportError(ERROR_CODE.HOLE_IN_REGION_CHAIN,
"There is a hole in the region chain.",
this, r, last);
} else if (cmp < 0) {
// overlap
errors.reportError(ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
"There is an overlap in the region chain.",
this, r, last);
} }
last = next;
} }
// How did we get here?
} }
last = r;
}
return errors.getErrorList().size() == 0;
}
} }
/** /**
@ -603,7 +652,6 @@ public class HBaseFsck {
* if any of the REGIONINFO_QUALIFIER, SERVER_QUALIFIER, STARTCODE_QUALIFIER, * if any of the REGIONINFO_QUALIFIER, SERVER_QUALIFIER, STARTCODE_QUALIFIER,
* SPLITA_QUALIFIER, SPLITB_QUALIFIER have not changed in the last * SPLITA_QUALIFIER, SPLITB_QUALIFIER have not changed in the last
* milliseconds specified by timelag, then the table is a candidate to be returned. * milliseconds specified by timelag, then the table is a candidate to be returned.
* @param regionList - all entries found in .META
* @return tables that have not been modified recently * @return tables that have not been modified recently
* @throws IOException if an error is encountered * @throws IOException if an error is encountered
*/ */
@ -668,7 +716,7 @@ public class HBaseFsck {
// If there is no region holding .META. // If there is no region holding .META.
if (metaRegions.size() == 0) { if (metaRegions.size() == 0) {
errors.reportError(".META. is not found on any region."); errors.reportError(ERROR_CODE.NO_META_REGION, ".META. is not found on any region.");
if (shouldFix()) { if (shouldFix()) {
errors.print("Trying to fix a problem with .META..."); errors.print("Trying to fix a problem with .META...");
setShouldRerun(); setShouldRerun();
@ -678,7 +726,7 @@ public class HBaseFsck {
} }
// If there are more than one regions pretending to hold the .META. // If there are more than one regions pretending to hold the .META.
else if (metaRegions.size() > 1) { else if (metaRegions.size() > 1) {
errors.reportError(".META. is found on more than one region."); errors.reportError(ERROR_CODE.MULTI_META_REGION, ".META. is found on more than one region.");
if (shouldFix()) { if (shouldFix()) {
errors.print("Trying to fix a problem with .META..."); errors.print("Trying to fix a problem with .META...");
setShouldRerun(); setShouldRerun();
@ -773,7 +821,7 @@ public class HBaseFsck {
/** /**
* Maintain information about a particular region. * Maintain information about a particular region.
*/ */
static class HbckInfo { static class HbckInfo implements Comparable {
boolean onlyEdits = false; boolean onlyEdits = false;
MetaEntry metaEntry = null; MetaEntry metaEntry = null;
FileStatus foundRegionDir = null; FileStatus foundRegionDir = null;
@ -796,6 +844,16 @@ public class HBaseFsck {
return "UNKNOWN_REGION on " + Joiner.on(", ").join(deployedOn); return "UNKNOWN_REGION on " + Joiner.on(", ").join(deployedOn);
} }
} }
/**
 * Orders two region records by the start key of their META entry and, when
 * the start keys are equal, by the end key.
 * Both records must have a non-null {@code metaEntry}; it is dereferenced
 * without a check.
 *
 * @param o the other {@code HbckInfo}; any other type fails the cast
 * @return the result of the byte-wise key comparison
 */
@Override
public int compareTo(Object o) {
  final HbckInfo that = (HbckInfo) o;
  final int byStartKey =
      Bytes.compareTo(this.metaEntry.getStartKey(), that.metaEntry.getStartKey());
  // Only look at the end keys when the start keys tie.
  return byStartKey != 0
      ? byStartKey
      : Bytes.compareTo(this.metaEntry.getEndKey(), that.metaEntry.getEndKey());
}
} }
/** /**
@ -804,10 +862,10 @@ public class HBaseFsck {
private void printTableSummary() { private void printTableSummary() {
System.out.println("Summary:"); System.out.println("Summary:");
for (TInfo tInfo : tablesInfo.values()) { for (TInfo tInfo : tablesInfo.values()) {
if (tInfo.check()) { if (errors.tableHasErrors(tInfo)) {
System.out.println(" " + tInfo.getName() + " is okay.");
} else {
System.out.println("Table " + tInfo.getName() + " is inconsistent."); System.out.println("Table " + tInfo.getName() + " is inconsistent.");
} else {
System.out.println(" " + tInfo.getName() + " is okay.");
} }
System.out.println(" Number of regions: " + tInfo.getNumRegions()); System.out.println(" Number of regions: " + tInfo.getNumRegions());
System.out.print(" Deployed on: "); System.out.print(" Deployed on: ");
@ -819,19 +877,45 @@ public class HBaseFsck {
} }
interface ErrorReporter { interface ErrorReporter {
/**
 * Codes identifying the kind of inconsistency handed to
 * {@code reportError}. UNKNOWN is the code used when an error is reported
 * with a message only.
 */
public static enum ERROR_CODE {
// Root/META/version-file problems and per-region deployment mismatches.
UNKNOWN, NO_META_REGION, NULL_ROOT_REGION, NO_VERSION_FILE, NOT_IN_META_HDFS, NOT_IN_META,
NOT_IN_META_OR_DEPLOYED, NOT_IN_HDFS_OR_DEPLOYED, NOT_IN_HDFS, SERVER_DOES_NOT_MATCH_META, NOT_DEPLOYED,
MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE,
// Codes reported by the check of a table's region chain in META.
FIRST_REGION_STARTKEY_NOT_EMPTY, DUPE_STARTKEYS,
HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE
}
public void clear();
public void report(String message);
public void reportError(String message); public void reportError(String message);
public void reportError(ERROR_CODE errorCode, String message);
public void reportError(ERROR_CODE errorCode, String message, TInfo table, HbckInfo info);
public void reportError(ERROR_CODE errorCode, String message, TInfo table, HbckInfo info1, HbckInfo info2);
public int summarize(); public int summarize();
public void detail(String details); public void detail(String details);
public ArrayList<ERROR_CODE> getErrorList();
public void progress(); public void progress();
public void print(String message); public void print(String message);
public void resetErrors(); public void resetErrors();
public boolean tableHasErrors(TInfo table);
} }
private static class PrintingErrorReporter implements ErrorReporter { private static class PrintingErrorReporter implements ErrorReporter {
public int errorCount = 0; public int errorCount = 0;
private int showProgress; private int showProgress;
public synchronized void reportError(String message) { Set<TInfo> errorTables = new HashSet<TInfo>();
// for use by unit tests to verify which errors were discovered
private ArrayList<ERROR_CODE> errorList = new ArrayList<ERROR_CODE>();
/**
 * Forgets everything recorded so far: the set of tables with errors, the
 * list of reported error codes, and the error count.
 * Synchronized like the {@code reportError} overloads, which update the
 * same state under this object's lock, so a clear cannot interleave with a
 * report coming from another thread.
 */
public synchronized void clear() {
  errorTables.clear();
  errorList.clear();
  errorCount = 0;
}
public synchronized void reportError(ERROR_CODE errorCode, String message) {
errorList.add(errorCode);
if (!summary) { if (!summary) {
System.out.println("ERROR: " + message); System.out.println("ERROR: " + message);
} }
@ -839,6 +923,37 @@ public class HBaseFsck {
showProgress = 0; showProgress = 0;
} }
/**
 * Reports an error that concerns one region of a table. The table is
 * remembered as having errors and the message is prefixed with the
 * region's name before being passed to the code-and-message overload.
 *
 * @param errorCode code describing the problem
 * @param message human-readable description
 * @param table table the region belongs to
 * @param info region the error refers to; its {@code metaEntry} is read
 */
public synchronized void reportError(ERROR_CODE errorCode, String message, TInfo table,
    HbckInfo info) {
  errorTables.add(table);
  reportError(errorCode,
      "(region " + info.metaEntry.getRegionNameAsString() + ") " + message);
}
/**
 * Reports an error that concerns a pair of regions of a table. The table
 * is remembered as having errors and the message is prefixed with both
 * region names before being passed to the code-and-message overload.
 *
 * @param errorCode code describing the problem
 * @param message human-readable description
 * @param table table the regions belong to
 * @param info1 first region; its {@code metaEntry} is read
 * @param info2 second region; its {@code metaEntry} is read
 */
public synchronized void reportError(ERROR_CODE errorCode, String message, TInfo table,
    HbckInfo info1, HbckInfo info2) {
  // Mark the table first, before either region name is looked up.
  errorTables.add(table);
  final String firstRegion = info1.metaEntry.getRegionNameAsString();
  final String secondRegion = info2.metaEntry.getRegionNameAsString();
  reportError(errorCode,
      "(regions " + firstRegion + " and " + secondRegion + ") " + message);
}
/**
 * Reports an error that has no specific code; it is recorded as
 * {@code ERROR_CODE.UNKNOWN}.
 *
 * @param message human-readable description
 */
public synchronized void reportError(String message) {
  this.reportError(ERROR_CODE.UNKNOWN, message);
}
/**
 * Prints an error line without recording an error code or changing the
 * error count. Meant for follow-up messages about a problem that has
 * already been reported. Nothing is printed in summary mode.
 *
 * @param message text printed after the "ERROR: " prefix
 */
public synchronized void report(String message) {
  showProgress = 0;
  if (summary) {
    return;
  }
  System.out.println("ERROR: " + message);
}
public synchronized int summarize() { public synchronized int summarize() {
System.out.println(Integer.toString(errorCount) + System.out.println(Integer.toString(errorCount) +
" inconsistencies detected."); " inconsistencies detected.");
@ -851,12 +966,21 @@ public class HBaseFsck {
} }
} }
/**
 * Returns the error codes recorded so far, in the order they were added.
 * This is the reporter's own list, not a copy.
 *
 * @return the list of reported error codes
 */
public ArrayList<ERROR_CODE> getErrorList() {
  return this.errorList;
}
public synchronized void print(String message) { public synchronized void print(String message) {
if (!summary) { if (!summary) {
System.out.println(message); System.out.println(message);
} }
} }
/**
 * Tells whether an error was reported against the given table through one
 * of the table-aware {@code reportError} overloads.
 *
 * @param table table to look up
 * @return true if the table is in the set of tables with errors
 */
@Override
public boolean tableHasErrors(TInfo table) {
  final boolean recorded = errorTables.contains(table);
  return recorded;
}
@Override @Override
public void resetErrors() { public void resetErrors() {
errorCount = 0; errorCount = 0;
@ -923,13 +1047,13 @@ public class HBaseFsck {
} }
} }
// check to see if the existance of this region matches the region in META // check to see if the existence of this region matches the region in META
for (HRegionInfo r:regions) { for (HRegionInfo r:regions) {
HbckInfo hbi = hbck.getOrCreateInfo(r.getEncodedName()); HbckInfo hbi = hbck.getOrCreateInfo(r.getEncodedName());
hbi.addServer(rsinfo); hbi.addServer(rsinfo);
} }
} catch (IOException e) { // unable to connect to the region server. } catch (IOException e) { // unable to connect to the region server.
errors.reportError("RegionServer: " + rsinfo.getServerName() + errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "RegionServer: " + rsinfo.getServerName() +
" Unable to fetch region information. " + e); " Unable to fetch region information. " + e);
} finally { } finally {
done = true; done = true;
@ -1000,7 +1124,7 @@ public class HBaseFsck {
} }
} }
} catch (IOException e) { // unable to connect to the region server. } catch (IOException e) { // unable to connect to the region server.
errors.reportError("Table Directory: " + tableDir.getPath().getName() + errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "Table Directory: " + tableDir.getPath().getName() +
" Unable to fetch region information. " + e); " Unable to fetch region information. " + e);
} finally { } finally {
done = true; done = true;

View File

@ -89,6 +89,7 @@ public class HBaseTestingUtility {
*/ */
private boolean passedZkCluster = false; private boolean passedZkCluster = false;
private MiniDFSCluster dfsCluster = null; private MiniDFSCluster dfsCluster = null;
private MiniHBaseCluster hbaseCluster = null; private MiniHBaseCluster hbaseCluster = null;
private MiniMRCluster mrCluster = null; private MiniMRCluster mrCluster = null;
// If non-null, then already a cluster running. // If non-null, then already a cluster running.
@ -113,6 +114,10 @@ public class HBaseTestingUtility {
this.conf = conf; this.conf = conf;
} }
/**
 * Returns the mini HBase cluster held by this utility.
 *
 * @return the {@code hbaseCluster} field; {@code null} until one has been
 *         assigned
 */
public MiniHBaseCluster getHbaseCluster() {
  return this.hbaseCluster;
}
/** /**
* Returns this classes's instance of {@link Configuration}. Be careful how * Returns this classes's instance of {@link Configuration}. Be careful how
* you use the returned Configuration since {@link HConnection} instances * you use the returned Configuration since {@link HConnection} instances

View File

@ -19,23 +19,28 @@
*/ */
package org.apache.hadoop.hbase.util; package org.apache.hadoop.hbase.util;
import static org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.ipc.HRegionInterface;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.junit.BeforeClass; import org.junit.BeforeClass;
import org.junit.Test; import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
public class TestHBaseFsck { public class TestHBaseFsck {
final Log LOG = LogFactory.getLog(getClass()); final Log LOG = LogFactory.getLog(getClass());
@ -50,32 +55,39 @@ public class TestHBaseFsck {
TEST_UTIL.startMiniCluster(3); TEST_UTIL.startMiniCluster(3);
} }
private int doFsck(boolean fix) throws Exception { private List doFsck(boolean fix) throws Exception {
HBaseFsck fsck = new HBaseFsck(conf); HBaseFsck fsck = new HBaseFsck(conf);
fsck.displayFullReport(); fsck.displayFullReport(); // i.e. -details
fsck.setTimeLag(0); fsck.setTimeLag(0);
fsck.setFixErrors(fix); fsck.setFixErrors(fix);
// Most basic check ever, 0 tables fsck.doWork();
return fsck.doWork(); return fsck.getErrors().getErrorList();
}
/**
 * Asserts that the given list of reported error codes is empty.
 *
 * @param errs error codes returned by an hbck run
 */
private void assertNoErrors(List errs) throws Exception {
  final int reported = errs.size();
  assertEquals(0, reported);
}
private void assertErrors(List errs, ERROR_CODE[] expectedErrors) {
assertEquals(Arrays.asList(expectedErrors), errs);
} }
@Test @Test
public void testHBaseFsck() throws Exception { public void testHBaseFsck() throws Exception {
int result = doFsck(false); assertNoErrors(doFsck(false));
assertEquals(0, result);
TEST_UTIL.createTable(TABLE, FAM); TEST_UTIL.createTable(TABLE, FAM);
// We created 1 table, should be fine // We created 1 table, should be fine
result = doFsck(false); assertNoErrors(doFsck(false));
assertEquals(0, result);
// Now let's mess it up and change the assignment in .META. to // Now let's mess it up and change the assignment in .META. to
// point to a different region server // point to a different region server
HTable meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getName()); HTable meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getName());
ResultScanner scanner = meta.getScanner(new Scan()); ResultScanner scanner = meta.getScanner(new Scan());
resforloop : for (Result res : scanner) { resforloop:
for (Result res : scanner) {
long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY, long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
HConstants.STARTCODE_QUALIFIER)); HConstants.STARTCODE_QUALIFIER));
@ -98,13 +110,90 @@ public class TestHBaseFsck {
} }
// Try to fix the data // Try to fix the data
result = doFsck(true); assertErrors(doFsck(true), new ERROR_CODE[]{
assertEquals(-1, result); ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
Thread.sleep(15000); Thread.sleep(15000);
result = doFsck(false);
// Should have fixed // Should be fixed now
assertEquals(0, result); assertNoErrors(doFsck(false));
// comment needed - what is the purpose of this line
new HTable(conf, TABLE).getScanner(new Scan()); new HTable(conf, TABLE).getScanner(new Scan());
} }
/**
 * Writes a new region entry for the given table straight into the META
 * table. Only the REGIONINFO cell is written.
 *
 * @param conf configuration used to open the META table
 * @param htd descriptor of the table the region belongs to
 * @param startKey start key of the new region
 * @param endKey end key of the new region
 * @return the region info that was written
 * @throws IOException if META cannot be opened, written to, or closed
 */
private HRegionInfo createRegion(Configuration conf, final HTableDescriptor
    htd, byte[] startKey, byte[] endKey)
    throws IOException {
  HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
  try {
    HRegionInfo hri = new HRegionInfo(htd, startKey, endKey);
    Put put = new Put(hri.getRegionName());
    put.add(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER,
        Writables.getBytes(hri));
    meta.put(put);
    return hri;
  } finally {
    // Release the table handle even when the put fails; it used to leak.
    meta.close();
  }
}
/**
 * Tests that hbck reports inconsistencies in a table's region chain in
 * META: a duplicate start key, overlapping regions, and a hole. Each step
 * adds one more bad region, so the expected error list grows from one
 * assertion to the next.
 */
@Test
public void testHBaseFsckMeta() throws Exception {
// Baseline: nothing should be reported before this test changes anything.
assertNoErrors(doFsck(false));

// Create the table and remember the first region it reports, together
// with the server address recorded for that region.
HTable tbl = TEST_UTIL.createTable(Bytes.toBytes("table2"), FAM);

HRegionInfo hriOrig = tbl.getRegionsInfo().keySet().iterator().next();
HServerAddress rsAddressOrig = tbl.getRegionsInfo().get(hriOrig);

// Re-create the table as regions starting at "", "A", "B" and "C".
// NOTE(review): presumably createMultiRegions rewrites the META rows for
// the table -- confirm against HBaseTestingUtility.
byte[][] startKeys = new byte[][]{
HConstants.EMPTY_BYTE_ARRAY,
Bytes.toBytes("A"),
Bytes.toBytes("B"),
Bytes.toBytes("C")
};
TEST_UTIL.createMultiRegions(conf, tbl, FAM, startKeys);
// Remove the original region's directory from under the HBase root dir.
Path rootDir = new Path(conf.get(HConstants.HBASE_DIR));
FileSystem fs = rootDir.getFileSystem(conf);
Path p = new Path(rootDir + "/table2", hriOrig.getEncodedName());
fs.delete(p, true);

Thread.sleep(1 * 1000);
// NOTE(review): raw list holding an HServerAddress; HBaseFsck passes
// hbi.deployedOn (iterated as ServerName) to the same method -- confirm
// the element type fixDupeAssignment expects.
ArrayList servers = new ArrayList();
servers.add(rsAddressOrig);
HBaseFsckRepair.fixDupeAssignment(conf, hriOrig, servers);

// After the rebuild and cleanup above, hbck should still report nothing
assertNoErrors(doFsck(false));

// Now let's mess it up, by adding a region with a duplicate startkey
HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
Bytes.toBytes("A"), Bytes.toBytes("A2"));
TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
.waitForAssignment(hriDupe);
assertErrors(doFsck(false), new ERROR_CODE[]{ERROR_CODE.DUPE_STARTKEYS});

// Mess it up by creating an overlap in the metadata
HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
Bytes.toBytes("A2"), Bytes.toBytes("B2"));
TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
.waitForAssignment(hriOverlap);
// The duplicate start key from the previous step is still reported.
assertErrors(doFsck(false), new ERROR_CODE[]{
ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
ERROR_CODE.OVERLAP_IN_REGION_CHAIN});

// Mess it up by leaving a hole in the meta data
HRegionInfo hriHole = createRegion(conf, tbl.getTableDescriptor(),
Bytes.toBytes("D"), Bytes.toBytes("E"));
TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriHole);
TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
.waitForAssignment(hriHole);
// All earlier errors are expected again, followed by the new hole.
assertErrors(doFsck(false), new ERROR_CODE[]{ ERROR_CODE.DUPE_STARTKEYS,
ERROR_CODE.OVERLAP_IN_REGION_CHAIN, ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
ERROR_CODE.HOLE_IN_REGION_CHAIN });

}
} }