HBASE-3845 data loss because lastSeqWritten can miss memstore edits

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1151190 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2011-07-26 18:14:39 +00:00
parent c3b24e6c58
commit 4c51d6dc6b
5 changed files with 88 additions and 16 deletions

View File

@ -173,6 +173,8 @@ Release 0.91.0 - Unreleased
Content-Encoding: gzip in parallel
HBASE-4116 [stargate] StringIndexOutOfBoundsException in row spec parse
(Allan Yan)
HBASE-3845 data loss because lastSeqWritten can miss memstore edits
(Prakash Khemani and ramkrishna.s.vasudevan)
IMPROVEMENTS
HBASE-3290 Max Compaction Size (Nicolas Spiegelberg via Stack)

View File

@ -1094,7 +1094,8 @@ public class HRegion implements HeapSize { // , Writable{
final long currentMemStoreSize = this.memstoreSize.get();
List<StoreFlusher> storeFlushers = new ArrayList<StoreFlusher>(stores.size());
try {
sequenceId = (wal == null)? myseqid: wal.startCacheFlush();
sequenceId = (wal == null)? myseqid :
wal.startCacheFlush(this.regionInfo.getEncodedNameAsBytes());
completeSequenceId = this.getCompleteCacheFlushSequenceId(sequenceId);
for (Store s : stores.values()) {
@ -1144,7 +1145,9 @@ public class HRegion implements HeapSize { // , Writable{
// We used to only catch IOEs but its possible that we'd get other
// exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
// all and sundry.
if (wal != null) wal.abortCacheFlush();
if (wal != null) {
wal.abortCacheFlush(this.regionInfo.getEncodedNameAsBytes());
}
DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
Bytes.toStringBinary(getRegionName()));
dse.initCause(t);

View File

@ -1116,6 +1116,18 @@ public class HLog implements Syncable {
return outputfiles.size();
}
/**
 * Builds the key under which a region's in-flight snapshot sequence id is
 * stored: the three bytes {@code 's','n','p'} followed by the encoded region
 * name.
 *
 * @param encodedRegionName encoded name of the region; must not be null
 * @return a new array of length {@code encodedRegionName.length + 3}
 */
private byte[] getSnapshotName(byte[] encodedRegionName) {
  byte[] snp = new byte[encodedRegionName.length + 3];
  // An encoded region name has only hex digits. 's', 'n' and 'p' are not hex
  // digits, so snapshot names will never collide with encoded region names.
  snp[0] = 's';
  snp[1] = 'n';
  snp[2] = 'p';
  // Copy the region name in after the 3-byte prefix.
  System.arraycopy(encodedRegionName, 0, snp, 3, encodedRegionName.length);
  return snp;
}
/**
* By acquiring a log sequence ID, we can allow log messages to continue while
* we flush the cache.
@ -1124,16 +1136,53 @@ public class HLog implements Syncable {
* completion of a cache-flush. Otherwise the log-seq-id for the flush will
* not appear in the correct logfile.
*
* @return sequence ID to pass {@link #completeCacheFlush(byte[], byte[], long, boolean)}
* (byte[], byte[], long)}
* Ensuring that flushes and log-rolls don't happen concurrently also allows
* us to temporarily put a log-seq-number in lastSeqWritten against the region
* being flushed that might not be the earliest in-memory log-seq-number for
* that region. By the time the flush is completed or aborted and before the
* cacheFlushLock is released it is ensured that lastSeqWritten again has the
* oldest in-memory edit's lsn for the region that was being flushed.
*
* In this method, by removing the entry in lastSeqWritten for the region
* being flushed we ensure that the next edit inserted in this region will be
* correctly recorded in {@link #append(HRegionInfo, HLogKey, WALEdit)}. The
* lsn of the earliest in-memory edit - which is now in the memstore snapshot -
* is saved temporarily in the lastSeqWritten map while the flush is active.
*
* @param encodedRegionName encoded name of the region being flushed
* @return sequence ID to pass to
* {@link #completeCacheFlush(byte[], byte[], long, boolean)}
* @see #completeCacheFlush(byte[], byte[], long, boolean)
* @see #abortCacheFlush(byte[])
*/
public long startCacheFlush() {
public long startCacheFlush(final byte[] encodedRegionName) {
  // Starts a cache flush for the given region: takes cacheFlushLock, moves the
  // region's lastSeqWritten entry to its snapshot key, and returns the value of
  // obtainSeqNum() as the sequence id for the flush.
  //
  // The lock is deliberately still held when this method returns; it is
  // released by completeCacheFlush(...) or abortCacheFlush(...).
  this.cacheFlushLock.lock();
  // seq is the lsn of the oldest edit associated with this region. If a
  // snapshot already exists - because the last flush failed - then seq will
  // be the lsn of the oldest edit in the snapshot.
  Long seq = this.lastSeqWritten.remove(encodedRegionName);
  if (seq != null) {
    // Keeping the earliest sequence number of the snapshot in lastSeqWritten
    // maintains the correctness of getOldestOutstandingSeqNum(). It doesn't
    // really matter though, because everything is being done inside of the
    // cacheFlush lock.
    Long oldseq =
      lastSeqWritten.put(getSnapshotName(encodedRegionName), seq);
    if (oldseq != null) {
      // A snapshot entry left over from an earlier flush means the
      // bookkeeping is corrupt; halt the JVM.
      LOG.error("Logic Error Snapshot seq id from earlier flush still" +
        " present! for region " + Bytes.toString(encodedRegionName) +
        " overwritten oldseq=" + oldseq + " with new seq=" + seq);
      Runtime.getRuntime().halt(1);
    }
  } else {
    // No entry for the region means there is nothing recorded to flush;
    // treated as a fatal logic error.
    LOG.error("Logic Error - flushing an empty region??? " +
      Bytes.toString(encodedRegionName));
    Runtime.getRuntime().halt(1);
  }
  return obtainSeqNum();
}
/**
* Complete the cache flush
*
@ -1160,15 +1209,15 @@ public class HLog implements Syncable {
writeTime += System.currentTimeMillis() - now;
writeOps++;
this.numEntries.incrementAndGet();
Long seq = this.lastSeqWritten.get(encodedRegionName);
if (seq != null && logSeqId >= seq.longValue()) {
this.lastSeqWritten.remove(encodedRegionName);
}
}
// sync txn to file system
this.sync();
} finally {
// updateLock not needed for removing snapshot's entry
// Cleaning up of lastSeqWritten is in the finally clause because we
// don't want to confuse getOldestOutstandingSeqNum()
this.lastSeqWritten.remove(getSnapshotName(encodedRegionName));
this.cacheFlushLock.unlock();
}
}
@ -1187,7 +1236,25 @@ public class HLog implements Syncable {
* currently is a restart of the regionserver so the snapshot content dropped
* by the failure gets restored to the memstore.
*/
public void abortCacheFlush() {
public void abortCacheFlush(byte[] encodedRegionName) {
  // Aborts a cache flush for the given region: moves the snapshot's sequence
  // id back under the region's own key in lastSeqWritten, then releases
  // cacheFlushLock.
  try {
    Long snapshotSeq =
      this.lastSeqWritten.remove(getSnapshotName(encodedRegionName));
    if (snapshotSeq != null) {
      // updateLock is not necessary here: we are racing against
      // lastSeqWritten.putIfAbsent() in append() and we will always win.
      // Before releasing cacheFlushLock, make sure that the region's entry in
      // lastSeqWritten points to the earliest edit in the region.
      Long currentMemstoreEarliestSeq =
        this.lastSeqWritten.put(encodedRegionName, snapshotSeq);
      if (currentMemstoreEarliestSeq != null &&
          currentMemstoreEarliestSeq.longValue() <= snapshotSeq.longValue()) {
        // An edit added since the flush started must have a larger sequence
        // id than the snapshot; anything else is a fatal logic error.
        LOG.error("Logic Error region " + Bytes.toString(encodedRegionName) +
          " acquired edits out of order current memstore seq=" +
          currentMemstoreEarliestSeq + " snapshot seq=" + snapshotSeq);
        Runtime.getRuntime().halt(1);
      }
    }
  } finally {
    // Always release the lock, even if the bookkeeping above throws, so a
    // failed abort cannot leave cacheFlushLock held.
    this.cacheFlushLock.unlock();
  }
}

View File

@ -469,7 +469,7 @@ public class TestHLog {
htd.addFamily(new HColumnDescriptor("column"));
log.append(info, tableName, cols, System.currentTimeMillis(), htd);
long logSeqId = log.startCacheFlush();
long logSeqId = log.startCacheFlush(info.getEncodedNameAsBytes());
log.completeCacheFlush(info.getEncodedNameAsBytes(), tableName, logSeqId,
info.isMetaRegion());
log.close();
@ -540,7 +540,7 @@ public class TestHLog {
HTableDescriptor htd = new HTableDescriptor();
htd.addFamily(new HColumnDescriptor("column"));
log.append(hri, tableName, cols, System.currentTimeMillis(), htd);
long logSeqId = log.startCacheFlush();
long logSeqId = log.startCacheFlush(hri.getEncodedNameAsBytes());
log.completeCacheFlush(hri.getEncodedNameAsBytes(), tableName, logSeqId, false);
log.close();
Path filename = log.computeFilename();
@ -651,7 +651,7 @@ public class TestHLog {
// Flush the first region, we expect to see the first two files getting
// archived
long seqId = log.startCacheFlush();
long seqId = log.startCacheFlush(hri.getEncodedNameAsBytes());
log.completeCacheFlush(hri.getEncodedNameAsBytes(), tableName, seqId, false);
log.rollWriter();
assertEquals(2, log.getNumLogFiles());
@ -659,7 +659,7 @@ public class TestHLog {
// Flush the second region, which removes all the remaining output files
// since the oldest was completely flushed and the two others only contain
// flush information
seqId = log.startCacheFlush();
seqId = log.startCacheFlush(hri2.getEncodedNameAsBytes());
log.completeCacheFlush(hri2.getEncodedNameAsBytes(), tableName2, seqId, false);
log.rollWriter();
assertEquals(0, log.getNumLogFiles());

View File

@ -371,7 +371,7 @@ public class TestWALReplay {
}
// Add a cache flush, shouldn't have any effect
long logSeqId = wal.startCacheFlush();
long logSeqId = wal.startCacheFlush(regionName);
wal.completeCacheFlush(regionName, tableName, logSeqId, hri.isMetaRegion());
// Add an edit to another family, should be skipped.