diff --git a/CHANGES.txt b/CHANGES.txt index 781c8fc26af..de075f60932 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -403,6 +403,7 @@ Release 0.92.0 - Unreleased HBASE-4670 Fix javadoc warnings HBASE-4367 Deadlock in MemStore flusher due to JDK internally synchronizing on current thread + HBASE-4645 Edits Log recovery losing data across column families TESTS HBASE-4450 test for number of blocks read: to serve as baseline for expected diff --git a/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java b/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java index 01391d28e1e..5243b58a2cd 100644 --- a/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java +++ b/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java @@ -447,20 +447,39 @@ public class HRegion implements HeapSize { // , Writable{ status.setStatus("Cleaning up temporary data from old regions"); cleanupTmpDir(); - // Load in all the HStores. Get maximum seqid. + // Load in all the HStores. + // Get the minimum of the maxSeqId across all the stores. + // + // Context: During replay we want to ensure that we do not lose any data. So, we + // have to be conservative in how we replay logs. For each store, we calculate + // the maxSeqId up to which the store was flushed. But, since different stores + // could have a different maxSeqId, we choose the + // minimum across all the stores. + // This could potentially result in duplication of data for stores that are ahead + // of others. ColumnTrackers in the ScanQueryMatchers do the de-duplication, so we + // do not have to worry. + // TODO: If there is a store that was never flushed in a long time, we could replay + // a lot of data. Currently, this is not a problem because we flush all the stores at + // the same time. If we move to per-cf flushing, we might want to revisit this and send + // in a vector of maxSeqIds instead of sending in a single number, which has to be the + // min across all the maxes. 
+ long minSeqId = -1; long maxSeqId = -1; for (HColumnDescriptor c : this.htableDescriptor.getFamilies()) { status.setStatus("Instantiating store for column family " + c); Store store = instantiateHStore(this.tableDir, c); this.stores.put(c.getName(), store); long storeSeqId = store.getMaxSequenceId(); - if (storeSeqId > maxSeqId) { + if (minSeqId == -1 || storeSeqId < minSeqId) { + minSeqId = storeSeqId; + } + if (maxSeqId == -1 || storeSeqId > maxSeqId) { maxSeqId = storeSeqId; } } // Recover any edits if available. - maxSeqId = replayRecoveredEditsIfAny( - this.regiondir, maxSeqId, reporter, status); + maxSeqId = Math.max(maxSeqId, replayRecoveredEditsIfAny( + this.regiondir, minSeqId, reporter, status)); status.setStatus("Cleaning up detritus from prior splits"); // Get rid of any splits or merges that were lost in-progress. Clean out diff --git a/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestWALReplay.java b/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestWALReplay.java index 2327d0a2cff..cc996e44a17 100644 --- a/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestWALReplay.java +++ b/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestWALReplay.java @@ -276,24 +276,20 @@ public class TestWALReplay { Result result = region.get(g, null); assertEquals(countPerFamily * htd.getFamilies().size(), result.size()); - // Now close the region, split the log, reopen the region and assert that - // replay of log has no effect, that our seqids are calculated correctly so + // Now close the region (without flush), split the log, reopen the region and assert that + // replay of log has the correct effect, that our seqids are calculated correctly so // all edits in logs are seen as 'stale'/old. 
- region.close(); + region.close(true); wal.close(); runWALSplit(this.conf); HLog wal2 = createWAL(this.conf); - HRegion region2 = new HRegion(basedir, wal2, this.fs, this.conf, hri, htd, null) { - @Override - protected boolean restoreEdit(Store s, KeyValue kv) { - super.restoreEdit(s, kv); - throw new RuntimeException("Called when it should not have been!"); - } - }; + HRegion region2 = new HRegion(basedir, wal2, this.fs, this.conf, hri, htd, null); long seqid2 = region2.initialize(); // HRegionServer usually does this. It knows the largest seqid across all regions. wal2.setSequenceNumber(seqid2); assertTrue(seqid + result.size() < seqid2); + final Result result1b = region2.get(g, null); + assertEquals(result.size(), result1b.size()); // Next test. Add more edits, then 'crash' this region by stealing its wal // out from under it and assert that replay of the log adds the edits back @@ -343,6 +339,88 @@ public class TestWALReplay { }); } + /** + * Test that we recover correctly when there is a failure in between the + * flushes. i.e. Some stores got flushed but others did not. + * + * Unfortunately, there is no easy hook to flush at a store level. The way + * we get around this is by flushing at the region level, and then deleting + * the recently flushed store file for one of the Stores. This would put us + * back in the situation where all but that store got flushed and the region + * died. + * + * We restart Region again, and verify that the edits were replayed. 
+ * + * @throws IOException + * @throws IllegalAccessException + * @throws NoSuchFieldException + * @throws IllegalArgumentException + * @throws SecurityException + */ + @Test + public void testReplayEditsAfterPartialFlush() + throws IOException, SecurityException, IllegalArgumentException, + NoSuchFieldException, IllegalAccessException, InterruptedException { + final String tableNameStr = "testReplayEditsWrittenViaHRegion"; + final HRegionInfo hri = createBasic3FamilyHRegionInfo(tableNameStr); + final Path basedir = new Path(this.hbaseRootDir, tableNameStr); + deleteDir(basedir); + final byte[] rowName = Bytes.toBytes(tableNameStr); + final int countPerFamily = 10; + final HTableDescriptor htd = createBasic3FamilyHTD(tableNameStr); + HRegion region3 = HRegion.createHRegion(hri, + hbaseRootDir, this.conf, htd); + + // Write countPerFamily edits into the three families. Do a flush on one + // of the families during the load of edits so its seqid is not the same as + // the others, to test we do the right thing when there are different seqids. + HLog wal = createWAL(this.conf); + HRegion region = new HRegion(basedir, wal, this.fs, this.conf, hri, htd, null); + long seqid = region.initialize(); + // HRegionServer usually does this. It knows the largest seqid across all regions. + wal.setSequenceNumber(seqid); + for (HColumnDescriptor hcd: htd.getFamilies()) { + addRegionEdits(rowName, hcd.getName(), countPerFamily, this.ee, region, "x"); + } + + // Now assert edits made it in. + final Get g = new Get(rowName); + Result result = region.get(g, null); + assertEquals(countPerFamily * htd.getFamilies().size(), + result.size()); + + // Let us flush the region + region.flushcache(); + region.close(true); + wal.close(); + + // delete the store files in the second column family to simulate a failure + // in between the flushcache(); + // we have 3 families. killing the middle one ensures that taking the maximum + // will make us fail. 
+ int cf_count = 0; + for (HColumnDescriptor hcd: htd.getFamilies()) { + cf_count++; + if (cf_count == 2) { + this.fs.delete(new Path(region.getRegionDir(), Bytes.toString(hcd.getName())) + , true); + } + } + + + // Let us try to split and recover + runWALSplit(this.conf); + HLog wal2 = createWAL(this.conf); + HRegion region2 = new HRegion(basedir, wal2, this.fs, this.conf, hri, htd, null); + long seqid2 = region2.initialize(); + // HRegionServer usually does this. It knows the largest seqid across all regions. + wal2.setSequenceNumber(seqid2); + assertTrue(seqid + result.size() < seqid2); + + final Result result1b = region2.get(g, null); + assertEquals(result.size(), result1b.size()); + } + /** * Create an HRegion with the result of a HLog split and test we only see the * good edits