HBASE-3695 Some improvements to Hbck to test the entire region chain in Meta and provide better error reporting
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1099566 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4e4ded4cde
commit
3d6b1485bf
|
@ -285,6 +285,8 @@ Release 0.90.3 - Unreleased
|
|||
HBASE-3580 Remove RS from DeadServer when new instance checks in
|
||||
HBASE-2470 Add Scan.setTimeRange() support in Shell (Harsh J Chouraria)
|
||||
HBASE-3805 Log RegionState that are processed too late in the master
|
||||
HBASE-3695 Some improvements to Hbck to test the entire region chain in
|
||||
Meta and provide better error reporting (Marc Limotte)
|
||||
|
||||
TASKS
|
||||
HBASE-3748 Add rolling of thrift/rest daemons to graceful_stop.sh script
|
||||
|
|
|
@ -20,14 +20,7 @@
|
|||
package org.apache.hadoop.hbase.util;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.ThreadPoolExecutor;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
@ -69,6 +62,8 @@ import org.apache.zookeeper.KeeperException;
|
|||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import static org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
|
||||
|
||||
/**
|
||||
* Check consistency among the in-memory states of the master and the
|
||||
* region server(s) and the state of data in HDFS.
|
||||
|
@ -140,20 +135,21 @@ public class HBaseFsck {
|
|||
tablesInfo.clear();
|
||||
emptyRegionInfoQualifiers.clear();
|
||||
disabledTables.clear();
|
||||
errors.clear();
|
||||
|
||||
// get a list of all regions from the master. This involves
|
||||
// scanning the META table
|
||||
if (!recordRootRegion()) {
|
||||
// Will remove later if we can fix it
|
||||
errors.reportError("Encountered fatal error. Exitting...");
|
||||
errors.reportError("Encountered fatal error. Exiting...");
|
||||
return -1;
|
||||
}
|
||||
getMetaEntries();
|
||||
|
||||
// Check if .META. is found only once and on the right place
|
||||
// Check if .META. is found only once and in the right place
|
||||
if (!checkMetaEntries()) {
|
||||
// Will remove later if we can fix it
|
||||
errors.reportError("Encountered fatal error. Exitting...");
|
||||
errors.reportError("Encountered fatal error. Exiting...");
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -225,6 +221,10 @@ public class HBaseFsck {
|
|||
return errors.summarize();
|
||||
}
|
||||
|
||||
public ErrorReporter getErrors() {
|
||||
return errors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the list of disabled tables in ZK into local set.
|
||||
* @throws ZooKeeperConnectionException
|
||||
|
@ -281,7 +281,8 @@ public class HBaseFsck {
|
|||
|
||||
// verify that version file exists
|
||||
if (!foundVersionFile) {
|
||||
errors.reportError("Version file does not exist in root dir " + rootDir);
|
||||
errors.reportError(ERROR_CODE.NO_VERSION_FILE,
|
||||
"Version file does not exist in root dir " + rootDir);
|
||||
}
|
||||
|
||||
// level 1: <HBASE_DIR>/*
|
||||
|
@ -315,7 +316,8 @@ public class HBaseFsck {
|
|||
// Check if Root region is valid and existing
|
||||
if (rootLocation == null || rootLocation.getRegionInfo() == null ||
|
||||
rootLocation.getHostname() == null) {
|
||||
errors.reportError("Root Region or some of its attributes is null.");
|
||||
errors.reportError(ERROR_CODE.NULL_ROOT_REGION,
|
||||
"Root Region or some of its attributes are null.");
|
||||
return false;
|
||||
}
|
||||
ServerName sn;
|
||||
|
@ -428,7 +430,7 @@ public class HBaseFsck {
|
|||
LOG.debug("Region " + descriptiveName + " offline, ignoring.");
|
||||
return;
|
||||
} else if (recentlyModified) {
|
||||
LOG.info("Region " + descriptiveName + " was recently modified -- skipping");
|
||||
LOG.warn("Region " + descriptiveName + " was recently modified -- skipping");
|
||||
return;
|
||||
}
|
||||
// ========== Cases where the region is not in META =============
|
||||
|
@ -436,24 +438,29 @@ public class HBaseFsck {
|
|||
// We shouldn't have record of this region at all then!
|
||||
assert false : "Entry for region with no data";
|
||||
} else if (!inMeta && !inHdfs && isDeployed) {
|
||||
errors.reportError("Region " + descriptiveName + ", key=" + key + ", not on HDFS or in META but " +
|
||||
errors.reportError(ERROR_CODE.NOT_IN_META_HDFS, "Region "
|
||||
+ descriptiveName + ", key=" + key + ", not on HDFS or in META but " +
|
||||
"deployed on " + Joiner.on(", ").join(hbi.deployedOn));
|
||||
} else if (!inMeta && inHdfs && !isDeployed) {
|
||||
errors.reportError("Region " + descriptiveName + " on HDFS, but not listed in META " +
|
||||
errors.reportError(ERROR_CODE.NOT_IN_META_OR_DEPLOYED, "Region "
|
||||
+ descriptiveName + " on HDFS, but not listed in META " +
|
||||
"or deployed on any region server.");
|
||||
} else if (!inMeta && inHdfs && isDeployed) {
|
||||
errors.reportError("Region " + descriptiveName + " not in META, but deployed on " +
|
||||
Joiner.on(", ").join(hbi.deployedOn));
|
||||
errors.reportError(ERROR_CODE.NOT_IN_META, "Region " + descriptiveName
|
||||
+ " not in META, but deployed on " + Joiner.on(", ").join(hbi.deployedOn));
|
||||
|
||||
// ========== Cases where the region is in META =============
|
||||
} else if (inMeta && !inHdfs && !isDeployed) {
|
||||
errors.reportError("Region " + descriptiveName + " found in META, but not in HDFS " +
|
||||
"or deployed on any region server.");
|
||||
errors.reportError(ERROR_CODE.NOT_IN_HDFS_OR_DEPLOYED, "Region "
|
||||
+ descriptiveName + " found in META, but not in HDFS "
|
||||
+ "or deployed on any region server.");
|
||||
} else if (inMeta && !inHdfs && isDeployed) {
|
||||
errors.reportError("Region " + descriptiveName + " found in META, but not in HDFS, " +
|
||||
errors.reportError(ERROR_CODE.NOT_IN_HDFS, "Region " + descriptiveName
|
||||
+ " found in META, but not in HDFS, " +
|
||||
"and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
|
||||
} else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) {
|
||||
errors.reportError("Region " + descriptiveName + " not deployed on any region server.");
|
||||
errors.reportError(ERROR_CODE.NOT_DEPLOYED, "Region " + descriptiveName
|
||||
+ " not deployed on any region server.");
|
||||
// If we are trying to fix the errors
|
||||
if (shouldFix()) {
|
||||
errors.print("Trying to fix unassigned region...");
|
||||
|
@ -461,11 +468,13 @@ public class HBaseFsck {
|
|||
HBaseFsckRepair.fixUnassigned(this.conf, hbi.metaEntry);
|
||||
}
|
||||
} else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
|
||||
errors.reportError("Region " + descriptiveName + " should not be deployed according " +
|
||||
errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED, "Region "
|
||||
+ descriptiveName + " should not be deployed according " +
|
||||
"to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
|
||||
} else if (inMeta && inHdfs && isMultiplyDeployed) {
|
||||
errors.reportError("Region " + descriptiveName + " is listed in META on region server " +
|
||||
hbi.metaEntry.regionServer + " but is multiply assigned to region servers " +
|
||||
errors.reportError(ERROR_CODE.MULTI_DEPLOYED, "Region " + descriptiveName
|
||||
+ " is listed in META on region server " + hbi.metaEntry.regionServer
|
||||
+ " but is multiply assigned to region servers " +
|
||||
Joiner.on(", ").join(hbi.deployedOn));
|
||||
// If we are trying to fix the errors
|
||||
if (shouldFix()) {
|
||||
|
@ -474,7 +483,8 @@ public class HBaseFsck {
|
|||
HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn);
|
||||
}
|
||||
} else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
|
||||
errors.reportError("Region " + descriptiveName + " listed in META on region server " +
|
||||
errors.reportError(ERROR_CODE.SERVER_DOES_NOT_MATCH_META, "Region "
|
||||
+ descriptiveName + " listed in META on region server " +
|
||||
hbi.metaEntry.regionServer + " but found on region server " +
|
||||
hbi.deployedOn.get(0));
|
||||
// If we are trying to fix the errors
|
||||
|
@ -484,7 +494,8 @@ public class HBaseFsck {
|
|||
HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn);
|
||||
}
|
||||
} else {
|
||||
errors.reportError("Region " + descriptiveName + " is in an unforeseen state:" +
|
||||
errors.reportError(ERROR_CODE.UNKNOWN, "Region " + descriptiveName +
|
||||
" is in an unforeseen state:" +
|
||||
" inMeta=" + inMeta +
|
||||
" inHdfs=" + inHdfs +
|
||||
" isDeployed=" + isDeployed +
|
||||
|
@ -504,10 +515,15 @@ public class HBaseFsck {
|
|||
// Check only valid, working regions
|
||||
if (hbi.metaEntry == null) continue;
|
||||
if (hbi.metaEntry.regionServer == null) continue;
|
||||
if (hbi.foundRegionDir == null) continue;
|
||||
if (hbi.deployedOn.size() != 1) continue;
|
||||
if (hbi.onlyEdits) continue;
|
||||
|
||||
// Missing regionDir or over-deployment is checked elsewhere. Include
|
||||
// these cases in modTInfo, so we can evaluate those regions as part of
|
||||
// the region chain in META
|
||||
//if (hbi.foundRegionDir == null) continue;
|
||||
//if (hbi.deployedOn.size() != 1) continue;
|
||||
if (hbi.deployedOn.size() == 0) continue;
|
||||
|
||||
// We should be safe here
|
||||
String tableName = hbi.metaEntry.getTableDesc().getNameAsString();
|
||||
TInfo modTInfo = tablesInfo.get(tableName);
|
||||
|
@ -517,13 +533,16 @@ public class HBaseFsck {
|
|||
for (ServerName server : hbi.deployedOn) {
|
||||
modTInfo.addServer(server);
|
||||
}
|
||||
modTInfo.addEdge(hbi.metaEntry.getStartKey(), hbi.metaEntry.getEndKey());
|
||||
|
||||
//modTInfo.addEdge(hbi.metaEntry.getStartKey(), hbi.metaEntry.getEndKey());
|
||||
modTInfo.addRegionInfo(hbi);
|
||||
|
||||
tablesInfo.put(tableName, modTInfo);
|
||||
}
|
||||
|
||||
for (TInfo tInfo : tablesInfo.values()) {
|
||||
if (!tInfo.check()) {
|
||||
errors.reportError("Found inconsistency in table " + tInfo.getName());
|
||||
if (!tInfo.checkRegionChain()) {
|
||||
errors.report("Found inconsistency in table " + tInfo.getName());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -533,17 +552,16 @@ public class HBaseFsck {
|
|||
*/
|
||||
private class TInfo {
|
||||
String tableName;
|
||||
TreeMap <byte[], byte[]> edges;
|
||||
TreeSet <ServerName> deployedOn;
|
||||
List<HbckInfo> regions = new ArrayList<HbckInfo>();
|
||||
|
||||
TInfo(String name) {
|
||||
this.tableName = name;
|
||||
edges = new TreeMap <byte[], byte[]> (Bytes.BYTES_COMPARATOR);
|
||||
deployedOn = new TreeSet <ServerName>();
|
||||
}
|
||||
|
||||
public void addEdge(byte[] fromNode, byte[] toNode) {
|
||||
this.edges.put(fromNode, toNode);
|
||||
public void addRegionInfo (HbckInfo r) {
|
||||
regions.add(r);
|
||||
}
|
||||
|
||||
public void addServer(ServerName server) {
|
||||
|
@ -555,46 +573,77 @@ public class HBaseFsck {
|
|||
}
|
||||
|
||||
public int getNumRegions() {
|
||||
return edges.size();
|
||||
return regions.size();
|
||||
}
|
||||
|
||||
public boolean check() {
|
||||
byte[] last = new byte[0];
|
||||
byte[] next = new byte[0];
|
||||
TreeSet <byte[]> visited = new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
|
||||
// Each table should start with a zero-length byte[] and end at a
|
||||
// zero-length byte[]. Just follow the edges to see if this is true
|
||||
while (true) {
|
||||
// Check if chain is broken
|
||||
if (!edges.containsKey(last)) {
|
||||
errors.detail("Chain of regions in table " + tableName +
|
||||
" is broken; edges does not contain " + Bytes.toString(last));
|
||||
return false;
|
||||
/**
|
||||
* Check the region chain (from META) of this table. We are looking for
|
||||
* holes, overlaps, and cycles.
|
||||
* @return false if there are errors
|
||||
*/
|
||||
public boolean checkRegionChain() {
|
||||
Collections.sort(regions);
|
||||
HbckInfo last = null;
|
||||
|
||||
for (HbckInfo r : regions) {
|
||||
if (last == null) {
|
||||
// This is the first region, check that the start key is empty
|
||||
if (! Bytes.equals(r.metaEntry.getStartKey(), HConstants.EMPTY_BYTE_ARRAY)) {
|
||||
errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
|
||||
"First region should start with an empty key.",
|
||||
this, r);
|
||||
}
|
||||
next = edges.get(last);
|
||||
// Found a cycle
|
||||
if (visited.contains(next)) {
|
||||
errors.detail("Chain of regions in table " + tableName +
|
||||
" has a cycle around " + Bytes.toString(next));
|
||||
return false;
|
||||
} else {
|
||||
|
||||
// Check if endKey < startKey
|
||||
// Previous implementation of this code checked for a cycle in the
|
||||
// region chain. A cycle would imply that the endKey comes before
|
||||
// the startKey (i.e. endKey < startKey).
|
||||
if (! Bytes.equals(r.metaEntry.getEndKey(), HConstants.EMPTY_BYTE_ARRAY)) {
|
||||
// continue with this check if this is not the last region
|
||||
int cmpRegionKeys = Bytes.compareTo(r.metaEntry.getStartKey(),
|
||||
r.metaEntry.getEndKey());
|
||||
if (cmpRegionKeys > 0) {
|
||||
errors.reportError(ERROR_CODE.REGION_CYCLE,
|
||||
String.format("The endkey for this region comes before the "
|
||||
+ "startkey, startkey=%s, endkey=%s",
|
||||
Bytes.toString(r.metaEntry.getStartKey()),
|
||||
Bytes.toString(r.metaEntry.getEndKey())),
|
||||
this, r, last);
|
||||
}
|
||||
// Mark next node as visited
|
||||
visited.add(next);
|
||||
// If next is zero-length byte[] we are possibly at the end of the chain
|
||||
if (next.length == 0) {
|
||||
// If we have visited all elements we are fine
|
||||
if (edges.size() != visited.size()) {
|
||||
errors.detail("Chain of regions in table " + tableName +
|
||||
" contains less elements than are listed in META; visited=" + visited.size() +
|
||||
", edges=" + edges.size());
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
||||
// Check if the startkeys are different
|
||||
if (Bytes.equals(r.metaEntry.getStartKey(), last.metaEntry.getStartKey())) {
|
||||
errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
|
||||
"Two regions have the same startkey: "
|
||||
+ Bytes.toString(r.metaEntry.getStartKey()),
|
||||
this, r, last);
|
||||
} else {
|
||||
// Check that the startkey is the same as the previous end key
|
||||
int cmp = Bytes.compareTo(r.metaEntry.getStartKey(),
|
||||
last.metaEntry.getEndKey());
|
||||
if (cmp > 0) {
|
||||
// hole
|
||||
errors.reportError(ERROR_CODE.HOLE_IN_REGION_CHAIN,
|
||||
"There is a hole in the region chain.",
|
||||
this, r, last);
|
||||
} else if (cmp < 0) {
|
||||
// overlap
|
||||
errors.reportError(ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
|
||||
"There is an overlap in the region chain.",
|
||||
this, r, last);
|
||||
}
|
||||
last = next;
|
||||
}
|
||||
// How did we get here?
|
||||
|
||||
}
|
||||
|
||||
last = r;
|
||||
}
|
||||
|
||||
return errors.getErrorList().size() == 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -603,7 +652,6 @@ public class HBaseFsck {
|
|||
* if any of the REGIONINFO_QUALIFIER, SERVER_QUALIFIER, STARTCODE_QUALIFIER,
|
||||
* SPLITA_QUALIFIER, SPLITB_QUALIFIER have not changed in the last
|
||||
* milliseconds specified by timelag, then the table is a candidate to be returned.
|
||||
* @param regionList - all entries found in .META
|
||||
* @return tables that have not been modified recently
|
||||
* @throws IOException if an error is encountered
|
||||
*/
|
||||
|
@ -668,7 +716,7 @@ public class HBaseFsck {
|
|||
|
||||
// If there is no region holding .META.
|
||||
if (metaRegions.size() == 0) {
|
||||
errors.reportError(".META. is not found on any region.");
|
||||
errors.reportError(ERROR_CODE.NO_META_REGION, ".META. is not found on any region.");
|
||||
if (shouldFix()) {
|
||||
errors.print("Trying to fix a problem with .META...");
|
||||
setShouldRerun();
|
||||
|
@ -678,7 +726,7 @@ public class HBaseFsck {
|
|||
}
|
||||
// If there are more than one regions pretending to hold the .META.
|
||||
else if (metaRegions.size() > 1) {
|
||||
errors.reportError(".META. is found on more than one region.");
|
||||
errors.reportError(ERROR_CODE.MULTI_META_REGION, ".META. is found on more than one region.");
|
||||
if (shouldFix()) {
|
||||
errors.print("Trying to fix a problem with .META...");
|
||||
setShouldRerun();
|
||||
|
@ -773,7 +821,7 @@ public class HBaseFsck {
|
|||
/**
|
||||
* Maintain information about a particular region.
|
||||
*/
|
||||
static class HbckInfo {
|
||||
static class HbckInfo implements Comparable {
|
||||
boolean onlyEdits = false;
|
||||
MetaEntry metaEntry = null;
|
||||
FileStatus foundRegionDir = null;
|
||||
|
@ -796,6 +844,16 @@ public class HBaseFsck {
|
|||
return "UNKNOWN_REGION on " + Joiner.on(", ").join(deployedOn);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(Object o) {
|
||||
HbckInfo other = (HbckInfo) o;
|
||||
int startComparison = Bytes.compareTo(this.metaEntry.getStartKey(), other.metaEntry.getStartKey());
|
||||
if (startComparison != 0)
|
||||
return startComparison;
|
||||
else
|
||||
return Bytes.compareTo(this.metaEntry.getEndKey(), other.metaEntry.getEndKey());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -804,10 +862,10 @@ public class HBaseFsck {
|
|||
private void printTableSummary() {
|
||||
System.out.println("Summary:");
|
||||
for (TInfo tInfo : tablesInfo.values()) {
|
||||
if (tInfo.check()) {
|
||||
System.out.println(" " + tInfo.getName() + " is okay.");
|
||||
} else {
|
||||
if (errors.tableHasErrors(tInfo)) {
|
||||
System.out.println("Table " + tInfo.getName() + " is inconsistent.");
|
||||
} else {
|
||||
System.out.println(" " + tInfo.getName() + " is okay.");
|
||||
}
|
||||
System.out.println(" Number of regions: " + tInfo.getNumRegions());
|
||||
System.out.print(" Deployed on: ");
|
||||
|
@ -819,19 +877,45 @@ public class HBaseFsck {
|
|||
}
|
||||
|
||||
interface ErrorReporter {
|
||||
public static enum ERROR_CODE {
|
||||
UNKNOWN, NO_META_REGION, NULL_ROOT_REGION, NO_VERSION_FILE, NOT_IN_META_HDFS, NOT_IN_META,
|
||||
NOT_IN_META_OR_DEPLOYED, NOT_IN_HDFS_OR_DEPLOYED, NOT_IN_HDFS, SERVER_DOES_NOT_MATCH_META, NOT_DEPLOYED,
|
||||
MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE,
|
||||
FIRST_REGION_STARTKEY_NOT_EMPTY, DUPE_STARTKEYS,
|
||||
HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE
|
||||
}
|
||||
public void clear();
|
||||
public void report(String message);
|
||||
public void reportError(String message);
|
||||
public void reportError(ERROR_CODE errorCode, String message);
|
||||
public void reportError(ERROR_CODE errorCode, String message, TInfo table, HbckInfo info);
|
||||
public void reportError(ERROR_CODE errorCode, String message, TInfo table, HbckInfo info1, HbckInfo info2);
|
||||
public int summarize();
|
||||
public void detail(String details);
|
||||
public ArrayList<ERROR_CODE> getErrorList();
|
||||
public void progress();
|
||||
public void print(String message);
|
||||
public void resetErrors();
|
||||
public boolean tableHasErrors(TInfo table);
|
||||
}
|
||||
|
||||
private static class PrintingErrorReporter implements ErrorReporter {
|
||||
public int errorCount = 0;
|
||||
private int showProgress;
|
||||
|
||||
public synchronized void reportError(String message) {
|
||||
Set<TInfo> errorTables = new HashSet<TInfo>();
|
||||
|
||||
// for use by unit tests to verify which errors were discovered
|
||||
private ArrayList<ERROR_CODE> errorList = new ArrayList<ERROR_CODE>();
|
||||
|
||||
public void clear() {
|
||||
errorTables.clear();
|
||||
errorList.clear();
|
||||
errorCount = 0;
|
||||
}
|
||||
|
||||
public synchronized void reportError(ERROR_CODE errorCode, String message) {
|
||||
errorList.add(errorCode);
|
||||
if (!summary) {
|
||||
System.out.println("ERROR: " + message);
|
||||
}
|
||||
|
@ -839,6 +923,37 @@ public class HBaseFsck {
|
|||
showProgress = 0;
|
||||
}
|
||||
|
||||
public synchronized void reportError(ERROR_CODE errorCode, String message, TInfo table,
|
||||
HbckInfo info) {
|
||||
errorTables.add(table);
|
||||
String reference = "(region " + info.metaEntry.getRegionNameAsString() + ")";
|
||||
reportError(errorCode, reference + " " + message);
|
||||
}
|
||||
|
||||
public synchronized void reportError(ERROR_CODE errorCode, String message, TInfo table,
|
||||
HbckInfo info1, HbckInfo info2) {
|
||||
errorTables.add(table);
|
||||
String reference = "(regions " + info1.metaEntry.getRegionNameAsString()
|
||||
+ " and " + info2.metaEntry.getRegionNameAsString() + ")";
|
||||
reportError(errorCode, reference + " " + message);
|
||||
}
|
||||
|
||||
public synchronized void reportError(String message) {
|
||||
reportError(ERROR_CODE.UNKNOWN, message);
|
||||
}
|
||||
|
||||
/**
|
||||
* Report error information, but do not increment the error count. Intended for cases
|
||||
* where the actual error would have been reported previously.
|
||||
* @param message
|
||||
*/
|
||||
public synchronized void report(String message) {
|
||||
if (! summary) {
|
||||
System.out.println("ERROR: " + message);
|
||||
}
|
||||
showProgress = 0;
|
||||
}
|
||||
|
||||
public synchronized int summarize() {
|
||||
System.out.println(Integer.toString(errorCount) +
|
||||
" inconsistencies detected.");
|
||||
|
@ -851,12 +966,21 @@ public class HBaseFsck {
|
|||
}
|
||||
}
|
||||
|
||||
public ArrayList<ERROR_CODE> getErrorList() {
|
||||
return errorList;
|
||||
}
|
||||
|
||||
public synchronized void print(String message) {
|
||||
if (!summary) {
|
||||
System.out.println(message);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean tableHasErrors(TInfo table) {
|
||||
return errorTables.contains(table);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void resetErrors() {
|
||||
errorCount = 0;
|
||||
|
@ -923,13 +1047,13 @@ public class HBaseFsck {
|
|||
}
|
||||
}
|
||||
|
||||
// check to see if the existance of this region matches the region in META
|
||||
// check to see if the existence of this region matches the region in META
|
||||
for (HRegionInfo r:regions) {
|
||||
HbckInfo hbi = hbck.getOrCreateInfo(r.getEncodedName());
|
||||
hbi.addServer(rsinfo);
|
||||
}
|
||||
} catch (IOException e) { // unable to connect to the region server.
|
||||
errors.reportError("RegionServer: " + rsinfo.getServerName() +
|
||||
errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "RegionServer: " + rsinfo.getServerName() +
|
||||
" Unable to fetch region information. " + e);
|
||||
} finally {
|
||||
done = true;
|
||||
|
@ -1000,7 +1124,7 @@ public class HBaseFsck {
|
|||
}
|
||||
}
|
||||
} catch (IOException e) { // unable to connect to the region server.
|
||||
errors.reportError("Table Directory: " + tableDir.getPath().getName() +
|
||||
errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "Table Directory: " + tableDir.getPath().getName() +
|
||||
" Unable to fetch region information. " + e);
|
||||
} finally {
|
||||
done = true;
|
||||
|
|
|
@ -89,6 +89,7 @@ public class HBaseTestingUtility {
|
|||
*/
|
||||
private boolean passedZkCluster = false;
|
||||
private MiniDFSCluster dfsCluster = null;
|
||||
|
||||
private MiniHBaseCluster hbaseCluster = null;
|
||||
private MiniMRCluster mrCluster = null;
|
||||
// If non-null, then already a cluster running.
|
||||
|
@ -113,6 +114,10 @@ public class HBaseTestingUtility {
|
|||
this.conf = conf;
|
||||
}
|
||||
|
||||
public MiniHBaseCluster getHbaseCluster() {
|
||||
return hbaseCluster;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns this classes's instance of {@link Configuration}. Be careful how
|
||||
* you use the returned Configuration since {@link HConnection} instances
|
||||
|
|
|
@ -19,23 +19,28 @@
|
|||
*/
|
||||
package org.apache.hadoop.hbase.util;
|
||||
|
||||
import static org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
|
||||
import static org.junit.Assert.assertArrayEquals;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.client.HTable;
|
||||
import org.apache.hadoop.hbase.client.Put;
|
||||
import org.apache.hadoop.hbase.client.Result;
|
||||
import org.apache.hadoop.hbase.client.ResultScanner;
|
||||
import org.apache.hadoop.hbase.client.Scan;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.*;
|
||||
import org.apache.hadoop.hbase.client.*;
|
||||
import org.apache.hadoop.hbase.ipc.HRegionInterface;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
public class TestHBaseFsck {
|
||||
|
||||
final Log LOG = LogFactory.getLog(getClass());
|
||||
|
@ -50,32 +55,39 @@ public class TestHBaseFsck {
|
|||
TEST_UTIL.startMiniCluster(3);
|
||||
}
|
||||
|
||||
private int doFsck(boolean fix) throws Exception {
|
||||
private List doFsck(boolean fix) throws Exception {
|
||||
HBaseFsck fsck = new HBaseFsck(conf);
|
||||
fsck.displayFullReport();
|
||||
fsck.displayFullReport(); // i.e. -details
|
||||
fsck.setTimeLag(0);
|
||||
fsck.setFixErrors(fix);
|
||||
// Most basic check ever, 0 tables
|
||||
return fsck.doWork();
|
||||
fsck.doWork();
|
||||
return fsck.getErrors().getErrorList();
|
||||
}
|
||||
|
||||
private void assertNoErrors(List errs) throws Exception {
|
||||
assertEquals(0, errs.size());
|
||||
}
|
||||
|
||||
private void assertErrors(List errs, ERROR_CODE[] expectedErrors) {
|
||||
assertEquals(Arrays.asList(expectedErrors), errs);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHBaseFsck() throws Exception {
|
||||
int result = doFsck(false);
|
||||
assertEquals(0, result);
|
||||
assertNoErrors(doFsck(false));
|
||||
|
||||
TEST_UTIL.createTable(TABLE, FAM);
|
||||
|
||||
// We created 1 table, should be fine
|
||||
result = doFsck(false);
|
||||
assertEquals(0, result);
|
||||
assertNoErrors(doFsck(false));
|
||||
|
||||
// Now let's mess it up and change the assignment in .META. to
|
||||
// point to a different region server
|
||||
HTable meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getName());
|
||||
ResultScanner scanner = meta.getScanner(new Scan());
|
||||
|
||||
resforloop : for (Result res : scanner) {
|
||||
resforloop:
|
||||
for (Result res : scanner) {
|
||||
long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
|
||||
HConstants.STARTCODE_QUALIFIER));
|
||||
|
||||
|
@ -98,13 +110,90 @@ public class TestHBaseFsck {
|
|||
}
|
||||
|
||||
// Try to fix the data
|
||||
result = doFsck(true);
|
||||
assertEquals(-1, result);
|
||||
|
||||
assertErrors(doFsck(true), new ERROR_CODE[]{
|
||||
ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
|
||||
Thread.sleep(15000);
|
||||
result = doFsck(false);
|
||||
// Should have fixed
|
||||
assertEquals(0, result);
|
||||
|
||||
// Should be fixed now
|
||||
assertNoErrors(doFsck(false));
|
||||
|
||||
// comment needed - what is the purpose of this line
|
||||
new HTable(conf, TABLE).getScanner(new Scan());
|
||||
}
|
||||
|
||||
private HRegionInfo createRegion(Configuration conf, final HTableDescriptor
|
||||
htd, byte[] startKey, byte[] endKey)
|
||||
throws IOException {
|
||||
HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
|
||||
HRegionInfo hri = new HRegionInfo(htd, startKey, endKey);
|
||||
Put put = new Put(hri.getRegionName());
|
||||
put.add(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER,
|
||||
Writables.getBytes(hri));
|
||||
meta.put(put);
|
||||
return hri;
|
||||
}
|
||||
|
||||
@Test
|
||||
/**
|
||||
* Tests for inconsistencies in the META data (duplicate start keys, or holes)
|
||||
*/
|
||||
public void testHBaseFsckMeta() throws Exception {
|
||||
assertNoErrors(doFsck(false));
|
||||
|
||||
HTable tbl = TEST_UTIL.createTable(Bytes.toBytes("table2"), FAM);
|
||||
|
||||
HRegionInfo hriOrig = tbl.getRegionsInfo().keySet().iterator().next();
|
||||
HServerAddress rsAddressOrig = tbl.getRegionsInfo().get(hriOrig);
|
||||
|
||||
byte[][] startKeys = new byte[][]{
|
||||
HConstants.EMPTY_BYTE_ARRAY,
|
||||
Bytes.toBytes("A"),
|
||||
Bytes.toBytes("B"),
|
||||
Bytes.toBytes("C")
|
||||
};
|
||||
TEST_UTIL.createMultiRegions(conf, tbl, FAM, startKeys);
|
||||
Path rootDir = new Path(conf.get(HConstants.HBASE_DIR));
|
||||
FileSystem fs = rootDir.getFileSystem(conf);
|
||||
Path p = new Path(rootDir + "/table2", hriOrig.getEncodedName());
|
||||
fs.delete(p, true);
|
||||
|
||||
Thread.sleep(1 * 1000);
|
||||
ArrayList servers = new ArrayList();
|
||||
servers.add(rsAddressOrig);
|
||||
HBaseFsckRepair.fixDupeAssignment(conf, hriOrig, servers);
|
||||
|
||||
// We created 1 table, should be fine
|
||||
assertNoErrors(doFsck(false));
|
||||
|
||||
// Now let's mess it up, by adding a region with a duplicate startkey
|
||||
HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
|
||||
Bytes.toBytes("A"), Bytes.toBytes("A2"));
|
||||
TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
|
||||
TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
|
||||
.waitForAssignment(hriDupe);
|
||||
assertErrors(doFsck(false), new ERROR_CODE[]{ERROR_CODE.DUPE_STARTKEYS});
|
||||
|
||||
// Mess it up by creating an overlap in the metadata
|
||||
HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
|
||||
Bytes.toBytes("A2"), Bytes.toBytes("B2"));
|
||||
TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
|
||||
TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
|
||||
.waitForAssignment(hriOverlap);
|
||||
assertErrors(doFsck(false), new ERROR_CODE[]{
|
||||
ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
|
||||
ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
|
||||
|
||||
// Mess it up by leaving a hole in the meta data
|
||||
HRegionInfo hriHole = createRegion(conf, tbl.getTableDescriptor(),
|
||||
Bytes.toBytes("D"), Bytes.toBytes("E"));
|
||||
TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriHole);
|
||||
TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
|
||||
.waitForAssignment(hriHole);
|
||||
// assertError(doFsck(false), ERROR_CODE.OVERLAP_IN_REGION_CHAIN);
|
||||
assertErrors(doFsck(false), new ERROR_CODE[]{ ERROR_CODE.DUPE_STARTKEYS,
|
||||
ERROR_CODE.OVERLAP_IN_REGION_CHAIN, ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
|
||||
ERROR_CODE.HOLE_IN_REGION_CHAIN });
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue