HBASE-4797 [availability] Skip recovered.edits files with edits we know older than what region currently has (Jimmy Jiang)

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1205290 13f79535-47bb-0310-9956-ffa450edef68
Zhihong Yu 2011-11-23 05:23:45 +00:00
parent ef52263373
commit 351e75629f
2 changed files with 328 additions and 181 deletions

HRegion.java

@@ -168,7 +168,7 @@ public class HRegion implements HeapSize { // , Writable{
  // Members
  //////////////////////////////////////////////////////////////////////////////

  private final ConcurrentHashMap<HashedBytes, CountDownLatch> lockedRows =
      new ConcurrentHashMap<HashedBytes, CountDownLatch>();
  private final ConcurrentHashMap<Integer, HashedBytes> lockIds =
      new ConcurrentHashMap<Integer, HashedBytes>();
@@ -487,10 +487,10 @@ public class HRegion implements HeapSize { // , Writable{
   */
  public long initialize(final CancelableProgressable reporter)
      throws IOException {
    MonitoredTask status = TaskMonitor.get().createStatus(
        "Initializing region " + this);

    if (coprocessorHost != null) {
      status.setStatus("Running coprocessor pre-open hook");
      coprocessorHost.preOpen();
@@ -558,17 +558,17 @@ public class HRegion implements HeapSize { // , Writable{
    this.writestate.setReadOnly(this.htableDescriptor.isReadOnly());
    this.writestate.compacting = 0;

    // Initialize split policy
    this.splitPolicy = RegionSplitPolicy.create(this, conf);

    this.lastFlushTime = EnvironmentEdgeManager.currentTimeMillis();
    // Use maximum of log sequenceid or that which was found in stores
    // (particularly if no recovered edits, seqid will be -1).
    long nextSeqid = maxSeqId + 1;
    LOG.info("Onlined " + this.toString() + "; next sequenceid=" + nextSeqid);

    if (coprocessorHost != null) {
      status.setStatus("Running coprocessor post-open hooks");
      coprocessorHost.postOpen();
@@ -605,7 +605,7 @@ public class HRegion implements HeapSize { // , Writable{
      }
      return false;
    }

  /**
   * This function will return the HDFS blocks distribution based on the data
   * captured when HFile is created
@@ -642,7 +642,7 @@ public class HRegion implements HeapSize { // , Writable{
    Path tablePath = FSUtils.getTablePath(FSUtils.getRootDir(conf),
        tableDescriptor.getName());
    FileSystem fs = tablePath.getFileSystem(conf);

    for (HColumnDescriptor family: tableDescriptor.getFamilies()) {
      Path storeHomeDir = Store.getStoreHomedir(tablePath, regionEncodedName,
          family.getName());
@@ -660,27 +660,27 @@ public class HRegion implements HeapSize { // , Writable{
    }
    return hdfsBlocksDistribution;
  }

  public AtomicLong getMemstoreSize() {
    return memstoreSize;
  }

  /**
   * Increase the size of mem store in this region and the size of global mem
   * store
   * @param memStoreSize
   * @return the size of memstore in this region
   */
  public long addAndGetGlobalMemstoreSize(long memStoreSize) {
    if (this.rsServices != null) {
      RegionServerAccounting rsAccounting =
          this.rsServices.getRegionServerAccounting();

      if (rsAccounting != null) {
        rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
      }
    }
    return this.memstoreSize.getAndAdd(memStoreSize);
  }

  /*
@@ -795,7 +795,7 @@ public class HRegion implements HeapSize { // , Writable{
    MonitoredTask status = TaskMonitor.get().createStatus(
        "Closing region " + this +
        (abort ? " due to abort" : ""));

    status.setStatus("Waiting for close lock");
    try {
      synchronized (closeLock) {
@@ -1180,7 +1180,7 @@ public class HRegion implements HeapSize { // , Writable{
        }
      }
      boolean result = internalFlushcache(status);

      if (coprocessorHost != null) {
        status.setStatus("Running post-flush coprocessor hooks");
        coprocessorHost.postFlush();
@@ -1228,7 +1228,7 @@ public class HRegion implements HeapSize { // , Writable{
   * routes.
   *
   * <p> This method may block for some time.
   * @param status
   *
   * @return true if the region needs compacting
   *
@@ -1245,7 +1245,7 @@ public class HRegion implements HeapSize { // , Writable{
   * @param wal Null if we're NOT to go via hlog/wal.
   * @param myseqid The seqid to use if <code>wal</code> is null writing out
   * flush file.
   * @param status
   * @return true if the region needs compacting
   * @throws IOException
   * @see #internalFlushcache(MonitoredTask)
@@ -1791,7 +1791,7 @@ public class HRegion implements HeapSize { // , Writable{
  /**
   * Perform a batch of puts.
   *
   * @param putsAndLocks
   *          the list of puts paired with their requested lock IDs.
   * @return an array of OperationStatus which internally contains the
@@ -2015,7 +2015,7 @@ public class HRegion implements HeapSize { // , Writable{
      // STEP 7. Sync wal.
      // -------------------------
      if (walEdit.size() > 0 &&
          (this.regionInfo.isMetaRegion() ||
           !this.htableDescriptor.isDeferredLogFlush())) {
        this.log.sync(txid);
      }
@@ -2378,7 +2378,7 @@ public class HRegion implements HeapSize { // , Writable{
  /**
   * Remove all the keys listed in the map from the memstore. This method is
   * called when a Put has updated memstore but subequently fails to update
   * the wal. This method is then invoked to rollback the memstore.
   */
  private void rollbackMemstore(BatchOperationInProgress<Pair<Put, Integer>> batchOp,
@@ -2392,13 +2392,13 @@ public class HRegion implements HeapSize { // , Writable{
        continue;
      }

      // Rollback all the kvs for this row.
      Map<byte[], List<KeyValue>> familyMap = familyMaps[i];
      for (Map.Entry<byte[], List<KeyValue>> e : familyMap.entrySet()) {
        byte[] family = e.getKey();
        List<KeyValue> edits = e.getValue();

        // Remove those keys from the memstore that matches our
        // key's (row, cf, cq, timestamp, memstoreTS). The interesting part is
        // that even the memstoreTS has to match for keys that will be rolleded-back.
        Store store = getStore(family);
@@ -2502,16 +2502,36 @@ public class HRegion implements HeapSize { // , Writable{
  protected long replayRecoveredEditsIfAny(final Path regiondir,
      final long minSeqId, final CancelableProgressable reporter,
      final MonitoredTask status)
      throws UnsupportedEncodingException, IOException {
    long seqid = minSeqId;
    NavigableSet<Path> files = HLog.getSplitEditFilesSorted(this.fs, regiondir);
    if (files == null || files.isEmpty()) return seqid;
    boolean checkSafeToSkip = true;
    for (Path edits: files) {
      if (edits == null || !this.fs.exists(edits)) {
        LOG.warn("Null or non-existent edits file: " + edits);
        continue;
      }
      if (isZeroLengthThenDelete(this.fs, edits)) continue;

      if (checkSafeToSkip) {
        Path higher = files.higher(edits);
        long maxSeqId = Long.MAX_VALUE;
        if (higher != null) {
          // Edit file name pattern, HLog.EDITFILES_NAME_PATTERN: "-?[0-9]+"
          String fileName = higher.getName();
          maxSeqId = Math.abs(Long.parseLong(fileName));
        }

        if (maxSeqId <= minSeqId) {
          String msg = "Maximum possible sequenceid for this log is " + maxSeqId
              + ", skipped the whole file, path=" + edits;
          LOG.debug(msg);
          continue;
        } else {
          checkSafeToSkip = false;
        }
      }

      try {
        seqid = replayRecoveredEdits(edits, seqid, reporter);
      } catch (IOException e) {
@@ -2556,139 +2576,139 @@ public class HRegion implements HeapSize { // , Writable{
      minSeqId + "; path=" + edits;
    LOG.info(msg);
    MonitoredTask status = TaskMonitor.get().createStatus(msg);

    status.setStatus("Opening logs");
    HLog.Reader reader = HLog.getReader(this.fs, edits, conf);
    try {
      long currentEditSeqId = minSeqId;
      long firstSeqIdInLog = -1;
      long skippedEdits = 0;
      long editsCount = 0;
      long intervalEdits = 0;
      HLog.Entry entry;
      Store store = null;
      boolean reported_once = false;

      try {
        // How many edits seen before we check elapsed time
        int interval = this.conf.getInt("hbase.hstore.report.interval.edits",
            2000);
        // How often to send a progress report (default 1/2 master timeout)
        int period = this.conf.getInt("hbase.hstore.report.period",
            this.conf.getInt("hbase.master.assignment.timeoutmonitor.timeout",
                180000) / 2);
        long lastReport = EnvironmentEdgeManager.currentTimeMillis();

        while ((entry = reader.next()) != null) {
          HLogKey key = entry.getKey();
          WALEdit val = entry.getEdit();

          if (reporter != null) {
            intervalEdits += val.size();
            if (intervalEdits >= interval) {
              // Number of edits interval reached
              intervalEdits = 0;
              long cur = EnvironmentEdgeManager.currentTimeMillis();
              if (lastReport + period <= cur) {
                status.setStatus("Replaying edits..." +
                    " skipped=" + skippedEdits +
                    " edits=" + editsCount);
                // Timeout reached
                if(!reporter.progress()) {
                  msg = "Progressable reporter failed, stopping replay";
                  LOG.warn(msg);
                  status.abort(msg);
                  throw new IOException(msg);
                }
                reported_once = true;
                lastReport = cur;
              }
            }
          }

          // Start coprocessor replay here. The coprocessor is for each WALEdit
          // instead of a KeyValue.
          if (coprocessorHost != null) {
            status.setStatus("Running pre-WAL-restore hook in coprocessors");
            if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
              // if bypass this log entry, ignore it ...
              continue;
            }
          }

          if (firstSeqIdInLog == -1) {
            firstSeqIdInLog = key.getLogSeqNum();
          }
          // Now, figure if we should skip this edit.
          if (key.getLogSeqNum() <= currentEditSeqId) {
            skippedEdits++;
            continue;
          }
          currentEditSeqId = key.getLogSeqNum();
          boolean flush = false;
          for (KeyValue kv: val.getKeyValues()) {
            // Check this edit is for me. Also, guard against writing the special
            // METACOLUMN info such as HBASE::CACHEFLUSH entries
            if (kv.matchingFamily(HLog.METAFAMILY) ||
                !Bytes.equals(key.getEncodedRegionName(), this.regionInfo.getEncodedNameAsBytes())) {
              skippedEdits++;
              continue;
            }
            // Figure which store the edit is meant for.
            if (store == null || !kv.matchingFamily(store.getFamily().getName())) {
              store = this.stores.get(kv.getFamily());
            }
            if (store == null) {
              // This should never happen. Perhaps schema was changed between
              // crash and redeploy?
              LOG.warn("No family for " + kv);
              skippedEdits++;
              continue;
            }
            // Once we are over the limit, restoreEdit will keep returning true to
            // flush -- but don't flush until we've played all the kvs that make up
            // the WALEdit.
            flush = restoreEdit(store, kv);
            editsCount++;
          }
          if (flush) internalFlushcache(null, currentEditSeqId, status);

          if (coprocessorHost != null) {
            coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
          }
        }
      } catch (EOFException eof) {
        Path p = HLog.moveAsideBadEditsFile(fs, edits);
        msg = "Encountered EOF. Most likely due to Master failure during " +
            "log spliting, so we have this data in another edit. " +
            "Continuing, but renaming " + edits + " as " + p;
        LOG.warn(msg, eof);
        status.abort(msg);
      } catch (IOException ioe) {
        // If the IOE resulted from bad file format,
        // then this problem is idempotent and retrying won't help
        if (ioe.getCause() instanceof ParseException) {
          Path p = HLog.moveAsideBadEditsFile(fs, edits);
          msg = "File corruption encountered! " +
              "Continuing, but renaming " + edits + " as " + p;
          LOG.warn(msg, ioe);
          status.setStatus(msg);
        } else {
          status.abort(StringUtils.stringifyException(ioe));
          // other IO errors may be transient (bad network connection,
          // checksum exception on one datanode, etc). throw & retry
          throw ioe;
        }
      }
      if (reporter != null && !reported_once) {
        reporter.progress();
      }
      msg = "Applied " + editsCount + ", skipped " + skippedEdits +
          ", firstSequenceidInLog=" + firstSeqIdInLog +
          ", maxSequenceidInLog=" + currentEditSeqId + ", path=" + edits;
      status.markComplete(msg);
      LOG.debug(msg);
      return currentEditSeqId;
    } finally {
      reader.close();
      status.cleanup();
@@ -2712,7 +2732,7 @@ public class HRegion implements HeapSize { // , Writable{
   * @throws IOException
   */
  private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
      throws IOException {
    FileStatus stat = fs.getFileStatus(p);
    if (stat.getLen() > 0) return false;
    LOG.warn("File " + p + " is zero-length, deleting.");
@@ -2721,7 +2741,7 @@ public class HRegion implements HeapSize { // , Writable{
  }

  protected Store instantiateHStore(Path tableDir, HColumnDescriptor c)
      throws IOException {
    return new Store(tableDir, this, c, this.fs, this.conf);
  }
@@ -2801,7 +2821,7 @@ public class HRegion implements HeapSize { // , Writable{
    try {
      HashedBytes rowKey = new HashedBytes(row);
      CountDownLatch rowLatch = new CountDownLatch(1);

      // loop until we acquire the row lock (unless !waitForLock)
      while (true) {
        CountDownLatch existingLatch = lockedRows.putIfAbsent(rowKey, rowLatch);
@@ -2822,7 +2842,7 @@ public class HRegion implements HeapSize { // , Writable{
          }
        }
      }

      // loop until we generate an unused lock id
      while (true) {
        Integer lockId = lockIdGenerator.incrementAndGet();
@@ -2848,7 +2868,7 @@ public class HRegion implements HeapSize { // , Writable{
    HashedBytes rowKey = lockIds.get(lockid);
    return rowKey == null ? null : rowKey.getBytes();
  }

  /**
   * Release the row lock!
   * @param lockId The lock ID to release.
@@ -2899,7 +2919,7 @@ public class HRegion implements HeapSize { // , Writable{
    }
    return lid;
  }

  /**
   * Determines whether multiple column families are present
   * Precondition: familyPaths is not null
@@ -2994,7 +3014,7 @@ public class HRegion implements HeapSize { // , Writable{
      try {
        store.bulkLoadHFile(path);
      } catch (IOException ioe) {
        // a failure here causes an atomicity violation that we currently
        // cannot recover from since it is likely a failed hdfs operation.

        // TODO Need a better story for reverting partial failures due to HDFS.
@@ -3303,9 +3323,9 @@ public class HRegion implements HeapSize { // , Writable{
  /**
   * Convenience method creating new HRegions. Used by createTable.
   * The {@link HLog} for the created region needs to be closed explicitly.
   * Use {@link HRegion#getLog()} to get access.
   *
   * @param info Info for region to create.
   * @param rootDir Root directory for HBase instance
   * @param conf
@@ -3332,14 +3352,14 @@ public class HRegion implements HeapSize { // , Writable{
    HLog effectiveHLog = hlog;
    if (hlog == null) {
      effectiveHLog = new HLog(fs, new Path(regionDir, HConstants.HREGION_LOGDIR_NAME),
          new Path(regionDir, HConstants.HREGION_OLDLOGDIR_NAME), conf);
    }
    HRegion region = HRegion.newHRegion(tableDir,
        effectiveHLog, fs, conf, info, hTableDescriptor, null);
    region.initialize();
    return region;
  }

  /**
   * Open a Region.
   * @param info Info for region to be opened.
@@ -3932,12 +3952,12 @@ public class HRegion implements HeapSize { // , Writable{
  // TODO: There's a lot of boiler plate code identical
  // to increment... See how to better unify that.
  /**
   *
   * Perform one or more append operations on a row.
   * <p>
   * Appends performed are done under row lock but reads do not take locks out
   * so this can be seen partially complete by gets and scans.
   *
   * @param append
   * @param lockid
   * @param writeToWAL
@@ -4308,7 +4328,7 @@ public class HRegion implements HeapSize { // , Writable{
  public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
      ClassSize.OBJECT + // closeLock
      (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
      ClassSize.ATOMIC_LONG + // memStoreSize
      ClassSize.ATOMIC_INTEGER + // lockIdGenerator
      (3 * ClassSize.CONCURRENT_HASHMAP) + // lockedRows, lockIds, scannerReadPoints
      WriteState.HEAP_SIZE + // writestate
@@ -4533,13 +4553,13 @@ public class HRegion implements HeapSize { // , Writable{
    if (this.explicitSplitPoint != null) {
      return this.explicitSplitPoint;
    }
    if (!splitPolicy.shouldSplit()) {
      return null;
    }

    byte[] ret = splitPolicy.getSplitPoint();
    if (ret != null) {
      try {
        checkRow(ret, "calculated split");
@@ -4547,7 +4567,7 @@ public class HRegion implements HeapSize { // , Writable{
        LOG.error("Ignoring invalid split", e);
        return null;
      }
    }
    return ret;
  }
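
Side note (not part of the commit): the skip decision added above relies only on how recovered.edits files are named. Each file's name is the first sequence id it contains, so the name of the next file in the sorted set is an upper bound on the ids inside the current file; if that bound is at most the sequence id the region's stores already cover, the whole file can be skipped without ever being opened. Below is a minimal standalone sketch of that decision in plain Java; the class name and the countSkippable helper are hypothetical, and zero-padded file names (as used by the tests further down) are assumed so that lexicographic order matches numeric order.

import java.util.NavigableSet;
import java.util.TreeSet;

public class SkipRecoveredEditsSketch {

  // How many leading files in the sorted recovered.edits set can be skipped
  // without opening them: a file named "<seqid>" starts at that sequence id,
  // so the NEXT file's name bounds the ids inside the current file; if that
  // bound is <= minSeqId, the region already has every edit in the file.
  static int countSkippable(NavigableSet<String> editFiles, long minSeqId) {
    int skippable = 0;
    for (String name : editFiles) {
      String higher = editFiles.higher(name);
      long maxSeqId = (higher == null) ? Long.MAX_VALUE
          : Math.abs(Long.parseLong(higher));
      if (maxSeqId <= minSeqId) {
        skippable++;   // whole file is older than what the region has
      } else {
        break;         // from here on, files may hold newer edits
      }
    }
    return skippable;
  }

  public static void main(String[] args) {
    // Zero-padded names sort lexicographically in numeric order.
    NavigableSet<String> files = new TreeSet<String>();
    for (long i = 1000; i <= 1050; i += 10) {
      files.add(String.format("%019d", i));
    }
    // If the region already covers sequence id 1029, the files named 1000 and
    // 1010 are skipped outright (their bounds are 1010 and 1020); the 1020
    // file is still opened because its bound, 1030, exceeds 1029.
    System.out.println(countSkippable(files, 1029)); // prints 2
  }
}

The check is deliberately conservative, matching the patched replayRecoveredEditsIfAny: a file is dropped only when the next file's name proves nothing newer can be inside it.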

TestHRegion.java

@@ -34,16 +34,30 @@ import java.util.concurrent.atomic.AtomicReference;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestCase;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
import org.apache.hadoop.hbase.HDFSBlocksDistribution;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MediumTests;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.MultithreadedTestUtil;
import org.apache.hadoop.hbase.MultithreadedTestUtil.TestThread;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.ColumnCountGetFilter;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
@@ -52,9 +66,13 @@ import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.NullComparator;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.regionserver.HRegion.RegionScannerImpl;
import org.apache.hadoop.hbase.regionserver.metrics.SchemaMetrics;
import org.apache.hadoop.hbase.regionserver.wal.HLog;
import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
@@ -64,9 +82,9 @@ import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.PairOfSameType;
import org.apache.hadoop.hbase.util.Threads;
import org.junit.Test;
import org.junit.experimental.categories.Category;

import com.google.common.collect.Lists;

/**
@@ -1296,7 +1314,7 @@ public class TestHRegion extends HBaseTestCase {
    LOG.info("" + addContent(region, fam3));
    region.flushcache();
    region.compactStores();

    byte [] splitRow = region.checkSplit();
    assertNotNull(splitRow);
    LOG.info("SplitRow: " + Bytes.toString(splitRow));
    HRegion [] subregions = splitRegion(region, splitRow);
@@ -2170,7 +2188,7 @@ public class TestHRegion extends HBaseTestCase {
    } catch (Exception exception) {
      // Expected.
    }

    assertICV(row1, fam1, qual1, row1Field1);
    assertICV(row1, fam1, qual2, row1Field2);
@@ -2302,7 +2320,7 @@ public class TestHRegion extends HBaseTestCase {
    LOG.info("" + addContent(region, fam3));
    region.flushcache();
    region.compactStores();

    byte [] splitRow = region.checkSplit();
    assertNotNull(splitRow);
    LOG.info("SplitRow: " + Bytes.toString(splitRow));
    HRegion [] regions = splitRegion(region, splitRow);
@@ -2337,7 +2355,7 @@ public class TestHRegion extends HBaseTestCase {
    // To make regions splitable force compaction.
    for (int i = 0; i < regions.length; i++) {
      regions[i].compactStores();

      midkeys[i] = regions[i].checkSplit();
    }

    TreeMap<String, HRegion> sortedMap = new TreeMap<String, HRegion>();
@@ -2809,6 +2827,115 @@ public class TestHRegion extends HBaseTestCase {
    region.get(g, null);
  }

  public void testSkipRecoveredEditsReplay() throws Exception {
    String method = "testSkipRecoveredEditsReplay";
    byte[] tableName = Bytes.toBytes(method);
    byte[] family = Bytes.toBytes("family");
    Configuration conf = HBaseConfiguration.create();
    initHRegion(tableName, method, conf, family);
    Path regiondir = region.getRegionDir();
    FileSystem fs = region.getFilesystem();
    byte[] regionName = region.getRegionInfo().getEncodedNameAsBytes();

    Path recoveredEditsDir = HLog.getRegionDirRecoveredEditsDir(regiondir);

    long maxSeqId = 1050;
    long minSeqId = 1000;

    for (long i = minSeqId; i <= maxSeqId; i += 10) {
      Path recoveredEdits = new Path(recoveredEditsDir, String.format("%019d", i));
      HLog.Writer writer = HLog.createWriter(fs, recoveredEdits, conf);

      long time = System.nanoTime();
      WALEdit edit = new WALEdit();
      edit.add(new KeyValue(row, family, Bytes.toBytes(i),
          time, KeyValue.Type.Put, Bytes.toBytes(i)));
      writer.append(new HLog.Entry(new HLogKey(regionName, tableName,
          i, time, HConstants.DEFAULT_CLUSTER_ID), edit));

      writer.close();
    }
    MonitoredTask status = TaskMonitor.get().createStatus(method);
    long seqId = region.replayRecoveredEditsIfAny(regiondir, minSeqId-1, null, status);
    assertEquals(maxSeqId, seqId);
    Get get = new Get(row);
    Result result = region.get(get, null);

    for (long i = minSeqId; i <= maxSeqId; i += 10) {
      List<KeyValue> kvs = result.getColumn(family, Bytes.toBytes(i));
      assertEquals(1, kvs.size());
      assertEquals(Bytes.toBytes(i), kvs.get(0).getValue());
    }
  }

  public void testSkipRecoveredEditsReplaySomeIgnored() throws Exception {
    String method = "testSkipRecoveredEditsReplaySomeIgnored";
    byte[] tableName = Bytes.toBytes(method);
    byte[] family = Bytes.toBytes("family");
    initHRegion(tableName, method, HBaseConfiguration.create(), family);
    Path regiondir = region.getRegionDir();
    FileSystem fs = region.getFilesystem();
    byte[] regionName = region.getRegionInfo().getEncodedNameAsBytes();

    Path recoveredEditsDir = HLog.getRegionDirRecoveredEditsDir(regiondir);

    long maxSeqId = 1050;
    long minSeqId = 1000;

    for (long i = minSeqId; i <= maxSeqId; i += 10) {
      Path recoveredEdits = new Path(recoveredEditsDir, String.format("%019d", i));
      HLog.Writer writer = HLog.createWriter(fs, recoveredEdits, conf);

      long time = System.nanoTime();
      WALEdit edit = new WALEdit();
      edit.add(new KeyValue(row, family, Bytes.toBytes(i),
          time, KeyValue.Type.Put, Bytes.toBytes(i)));
      writer.append(new HLog.Entry(new HLogKey(regionName, tableName,
          i, time, HConstants.DEFAULT_CLUSTER_ID), edit));

      writer.close();
    }
    long recoverSeqId = 1030;
    MonitoredTask status = TaskMonitor.get().createStatus(method);
    long seqId = region.replayRecoveredEditsIfAny(regiondir, recoverSeqId-1, null, status);
    assertEquals(maxSeqId, seqId);
    Get get = new Get(row);
    Result result = region.get(get, null);

    for (long i = minSeqId; i <= maxSeqId; i += 10) {
      List<KeyValue> kvs = result.getColumn(family, Bytes.toBytes(i));
      if (i < recoverSeqId) {
        assertEquals(0, kvs.size());
      } else {
        assertEquals(1, kvs.size());
        assertEquals(Bytes.toBytes(i), kvs.get(0).getValue());
      }
    }
  }

  public void testSkipRecoveredEditsReplayAllIgnored() throws Exception {
    String method = "testSkipRecoveredEditsReplayAllIgnored";
    byte[] tableName = Bytes.toBytes(method);
    byte[] family = Bytes.toBytes("family");
    initHRegion(tableName, method, HBaseConfiguration.create(), family);
    Path regiondir = region.getRegionDir();
    FileSystem fs = region.getFilesystem();

    Path recoveredEditsDir = HLog.getRegionDirRecoveredEditsDir(regiondir);
    for (int i = 1000; i < 1050; i += 10) {
      Path recoveredEdits = new Path(
          recoveredEditsDir, String.format("%019d", i));
      FSDataOutputStream dos = fs.create(recoveredEdits);
      dos.writeInt(i);
      dos.close();
    }
    long minSeqId = 2000;
    Path recoveredEdits = new Path(
        recoveredEditsDir, String.format("%019d", minSeqId-1));
    FSDataOutputStream dos = fs.create(recoveredEdits);
    dos.close();
    long seqId = region.replayRecoveredEditsIfAny(regiondir, minSeqId, null, null);
    assertEquals(minSeqId, seqId);
  }

  public void testIndexesScanWithOneDeletedRow() throws IOException {
    byte[] tableName = Bytes.toBytes("testIndexesScanWithOneDeletedRow");
    byte[] family = Bytes.toBytes("family");
@@ -2864,13 +2991,13 @@ public class TestHRegion extends HBaseTestCase {
    HColumnDescriptor hcd = new HColumnDescriptor(fam1, Integer.MAX_VALUE,
        HColumnDescriptor.DEFAULT_COMPRESSION, false, true,
        HColumnDescriptor.DEFAULT_TTL, "rowcol");

    HTableDescriptor htd = new HTableDescriptor(tableName);
    htd.addFamily(hcd);
    HRegionInfo info = new HRegionInfo(htd.getName(), null, null, false);
    Path path = new Path(DIR + "testBloomFilterSize");
    region = HRegion.createHRegion(info, path, conf, htd);

    int num_unique_rows = 10;
    int duplicate_multiplier =2;
    int num_storefiles = 4;
@@ -2887,7 +3014,7 @@ public class TestHRegion extends HBaseTestCase {
      }
      region.flushcache();
    }

    //before compaction
    Store store = region.getStore(fam1);
    List<StoreFile> storeFiles = store.getStorefiles();
    for (StoreFile storefile : storeFiles) {
@@ -2897,10 +3024,10 @@ public class TestHRegion extends HBaseTestCase {
      assertEquals(num_unique_rows*duplicate_multiplier, reader.getEntries());
      assertEquals(num_unique_rows, reader.getFilterEntries());
    }

    region.compactStores(true);

    //after compaction
    storeFiles = store.getStorefiles();
    for (StoreFile storefile : storeFiles) {
      StoreFile.Reader reader = storefile.getReader();
@@ -2909,9 +3036,9 @@ public class TestHRegion extends HBaseTestCase {
      assertEquals(num_unique_rows*duplicate_multiplier*num_storefiles,
          reader.getEntries());
      assertEquals(num_unique_rows, reader.getFilterEntries());
    }
  }

  public void testAllColumnsWithBloomFilter() throws IOException {
    byte [] TABLE = Bytes.toBytes("testAllColumnsWithBloomFilter");
    byte [] FAMILY = Bytes.toBytes("family");
@@ -3002,13 +3129,13 @@ public class TestHRegion extends HBaseTestCase {
    final int DEFAULT_BLOCK_SIZE = 1024;
    htu.getConfiguration().setLong("dfs.block.size", DEFAULT_BLOCK_SIZE);
    htu.getConfiguration().setInt("dfs.replication", 2);

    // set up a cluster with 3 nodes
    MiniHBaseCluster cluster;
    String dataNodeHosts[] = new String[] { "host1", "host2", "host3" };
    int regionServersCount = 3;

    try {
      cluster = htu.startMiniCluster(1, regionServersCount, dataNodeHosts);
      byte [][] families = {fam1, fam2};
@@ -3018,31 +3145,31 @@ public class TestHRegion extends HBaseTestCase {
      byte row[] = Bytes.toBytes("row1");
      byte col[] = Bytes.toBytes("col1");

      Put put = new Put(row);
      put.add(fam1, col, 1, Bytes.toBytes("test1"));
      put.add(fam2, col, 1, Bytes.toBytes("test2"));
      ht.put(put);

      HRegion firstRegion = htu.getHBaseCluster().
          getRegions(Bytes.toBytes(this.getName())).get(0);
      firstRegion.flushcache();
      HDFSBlocksDistribution blocksDistribution1 =
          firstRegion.getHDFSBlocksDistribution();

      // given the default replication factor is 2 and we have 2 HFiles,
      // we will have total of 4 replica of blocks on 3 datanodes; thus there
      // must be at least one host that have replica for 2 HFiles. That host's
      // weight will be equal to the unique block weight.
      long uniqueBlocksWeight1 =
          blocksDistribution1.getUniqueBlocksTotalWeight();

      String topHost = blocksDistribution1.getTopHosts().get(0);
      long topHostWeight = blocksDistribution1.getWeight(topHost);
      assertTrue(uniqueBlocksWeight1 == topHostWeight);

      // use the static method to compute the value, it should be the same.
      // static method is used by load balancer or other components
      HDFSBlocksDistribution blocksDistribution2 =
          HRegion.computeHDFSBlocksDistribution(htu.getConfiguration(),
              firstRegion.getTableDesc(),
              firstRegion.getRegionInfo().getEncodedName());
@@ -3054,7 +3181,7 @@ public class TestHRegion extends HBaseTestCase {
      htu.shutdownMiniCluster();
    }
  }

  private void putData(int startRow, int numRows, byte [] qf,
      byte [] ...families)
      throws IOException {