From fed1075104f33057ee43188b8b38b1999c497a3f Mon Sep 17 00:00:00 2001 From: Jim Kellerman Date: Sun, 13 Jan 2008 23:39:01 +0000 Subject: [PATCH] HADOOP-2500 Unreadable region kills region servers HADOOP-2587 Splits blocked by compactions cause region to be offline for duration of compaction. Fix bug in TestCompaction in which two mini dfs clusters were being started for the same test. git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk/src/contrib/hbase@611681 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 ++ src/java/org/apache/hadoop/hbase/HMaster.java | 15 +++++++--- src/java/org/apache/hadoop/hbase/HRegion.java | 29 ++++++++++++------- .../apache/hadoop/hbase/HRegionServer.java | 25 ++++++++++------ .../apache/hadoop/hbase/TestTimestamp.java | 3 +- 5 files changed, 51 insertions(+), 24 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 3248e151317..1c358a5265f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -118,6 +118,9 @@ Trunk (unreleased changes) HADOOP-2530 Missing type in new hbase custom RPC serializer HADOOP-2490 Failure in nightly #346 (Added debugging of hudson failures). HADOOP-2558 fixes for build up on hudson (part 1, part 2, part 3, part 4) + HADOOP-2500 Unreadable region kills region servers + HADOOP-2587 Splits blocked by compactions cause region to be offline for + duration of compaction. IMPROVEMENTS HADOOP-2401 Add convenience put method that takes writable diff --git a/src/java/org/apache/hadoop/hbase/HMaster.java b/src/java/org/apache/hadoop/hbase/HMaster.java index a7fb194b544..4a0e4892d67 100644 --- a/src/java/org/apache/hadoop/hbase/HMaster.java +++ b/src/java/org/apache/hadoop/hbase/HMaster.java @@ -464,10 +464,10 @@ public class HMaster extends Thread implements HConstants, HMasterInterface, ) ) { - // The current assignment is no good + // The current assignment is invalid if (LOG.isDebugEnabled()) { LOG.debug("Current assignment of " + info.getRegionName() + - " is no good: storedInfo: " + storedInfo + ", startCode: " + + " is not valid: storedInfo: " + storedInfo + ", startCode: " + startCode + ", storedInfo.startCode: " + ((storedInfo != null)? storedInfo.getStartCode(): -1) + ", unassignedRegions: " + unassignedRegions.containsKey(info) + @@ -963,7 +963,9 @@ public class HMaster extends Thread implements HConstants, HMasterInterface, */ void unassignRootRegion() { this.rootRegionLocation.set(null); - this.unassignedRegions.put(HRegionInfo.rootRegionInfo, ZERO_L); + if (!this.shutdownRequested) { + this.unassignedRegions.put(HRegionInfo.rootRegionInfo, ZERO_L); + } } /** @@ -1622,10 +1624,15 @@ public class HMaster extends Thread implements HConstants, HMasterInterface, // Root region + if (region.isOffline()) { + // Can't proceed without root region. Shutdown. + LOG.fatal("root region is marked offline"); + shutdown(); + } unassignRootRegion(); } else { - boolean reassignRegion = true; + boolean reassignRegion = !region.isOffline(); boolean deleteRegion = false; if (killedRegions.remove(region.getRegionName())) { diff --git a/src/java/org/apache/hadoop/hbase/HRegion.java b/src/java/org/apache/hadoop/hbase/HRegion.java index df41b763377..9f50139e52d 100644 --- a/src/java/org/apache/hadoop/hbase/HRegion.java +++ b/src/java/org/apache/hadoop/hbase/HRegion.java @@ -353,7 +353,7 @@ public class HRegion implements HConstants { * @throws IOException */ public List close() throws IOException { - return close(false); + return close(false, null); } /** @@ -364,13 +364,15 @@ public class HRegion implements HConstants { * time-sensitive thread. * * @param abort true if server is aborting (only during testing) + * @param listener call back to alert caller on close status * @return Vector of all the storage files that the HRegion's component * HStores make use of. It's a list of HStoreFile objects. Can be null if * we are not to close at this time or we are already closed. * * @throws IOException */ - List close(boolean abort) throws IOException { + List close(boolean abort, + final RegionUnavailableListener listener) throws IOException { if (isClosed()) { LOG.info("region " + this.regionInfo.getRegionName() + " already closed"); return null; @@ -410,6 +412,13 @@ public class HRegion implements HConstants { // outstanding updates. waitOnRowLocks(); + if (listener != null) { + // If there is a listener, let them know that we have now + // acquired all the necessary locks and are starting to + // do the close + listener.closing(getRegionName()); + } + // Don't flush the cache if we are aborting if (!abort) { internalFlushcache(snapshotMemcaches()); @@ -420,6 +429,13 @@ public class HRegion implements HConstants { result.addAll(store.close()); } this.closed.set(true); + + if (listener != null) { + // If there is a listener, tell them that the region is now + // closed. + listener.closed(getRegionName()); + } + LOG.info("closed " + this.regionInfo.getRegionName()); return result; } finally { @@ -553,17 +569,10 @@ public class HRegion implements HConstants { throw new IOException("Cannot split; target file collision at " + dirB); } - // Notify the caller that we are about to close the region. This moves - // us to the 'retiring' queue. Means no more updates coming in -- just - // whatever is outstanding. - if (listener != null) { - listener.closing(getRegionName()); - } - // Now close the HRegion. Close returns all store files or null if not // supposed to close (? What to do in this case? Implement abort of close?) // Close also does wait on outstanding rows and calls a flush just-in-case. - List hstoreFilesToSplit = close(); + List hstoreFilesToSplit = close(false, listener); if (hstoreFilesToSplit == null) { LOG.warn("Close came back null (Implement abort of close?)"); throw new RuntimeException("close returned empty vector of HStoreFiles"); diff --git a/src/java/org/apache/hadoop/hbase/HRegionServer.java b/src/java/org/apache/hadoop/hbase/HRegionServer.java index ff3714d985f..00a3f3a0e18 100644 --- a/src/java/org/apache/hadoop/hbase/HRegionServer.java +++ b/src/java/org/apache/hadoop/hbase/HRegionServer.java @@ -1091,13 +1091,13 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { } /** Add to the outbound message buffer */ - private void reportOpen(HRegion region) { - outboundMsgs.add(new HMsg(HMsg.MSG_REPORT_OPEN, region.getRegionInfo())); + private void reportOpen(HRegionInfo region) { + outboundMsgs.add(new HMsg(HMsg.MSG_REPORT_OPEN, region)); } /** Add to the outbound message buffer */ - private void reportClose(HRegion region) { - outboundMsgs.add(new HMsg(HMsg.MSG_REPORT_CLOSE, region.getRegionInfo())); + private void reportClose(HRegionInfo region) { + outboundMsgs.add(new HMsg(HMsg.MSG_REPORT_CLOSE, region)); } /** @@ -1222,7 +1222,14 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { } catch (IOException e) { LOG.error("error opening region " + regionInfo.getRegionName(), e); - reportClose(region); + + // Mark the region offline. + // TODO: add an extra field in HRegionInfo to indicate that there is + // an error. We can't do that now because that would be an incompatible + // change that would require a migration + + regionInfo.setOffline(true); + reportClose(regionInfo); return; } this.lock.writeLock().lock(); @@ -1232,7 +1239,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { } finally { this.lock.writeLock().unlock(); } - reportOpen(region); + reportOpen(regionInfo); } } @@ -1249,7 +1256,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { if(region != null) { region.close(); if(reportWhenCompleted) { - reportClose(region); + reportClose(hri); } } } @@ -1269,7 +1276,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { LOG.debug("closing region " + region.getRegionName()); } try { - region.close(abortRequested); + region.close(abortRequested, null); } catch (IOException e) { LOG.error("error closing region " + region.getRegionName(), RemoteExceptionHandler.checkIOException(e)); @@ -1303,7 +1310,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { LOG.debug("closing region " + region.getRegionName()); } try { - region.close(false); + region.close(); } catch (IOException e) { LOG.error("error closing region " + region.getRegionName(), RemoteExceptionHandler.checkIOException(e)); diff --git a/src/test/org/apache/hadoop/hbase/TestTimestamp.java b/src/test/org/apache/hadoop/hbase/TestTimestamp.java index e25b7165357..56f91020cee 100644 --- a/src/test/org/apache/hadoop/hbase/TestTimestamp.java +++ b/src/test/org/apache/hadoop/hbase/TestTimestamp.java @@ -105,7 +105,8 @@ public class TestTimestamp extends HBaseTestCase { * @throws IOException */ public void testTimestamps() throws IOException { - final MiniHBaseCluster cluster = new MiniHBaseCluster(this.conf, 1); + final MiniHBaseCluster cluster = + new MiniHBaseCluster(this.conf, 1, this.cluster, true); try { HTable t = createTable(); Incommon incommon = new HTableIncommon(t);