From ee03d1626d6bc3a098af367fc3484e99afdee661 Mon Sep 17 00:00:00 2001 From: Zhihong Yu Date: Fri, 13 Jan 2012 21:17:30 +0000 Subject: [PATCH] HBASE-5196 Failure in region split after PONR could cause region hole (Jimmy Xiang) git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1231302 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 1 + .../apache/hadoop/hbase/master/HMaster.java | 42 ++++++++++++++++++- .../master/handler/ServerShutdownHandler.java | 19 +++++---- .../hbase/regionserver/SplitRequest.java | 4 +- 4 files changed, 56 insertions(+), 10 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 41581f28001..20199f574b6 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -478,6 +478,7 @@ Release 0.92.0 - Unreleased HBASE-5137 MasterFileSystem.splitLog() should abort even if waitOnSafeMode() throws IOException(Ted) HBASE-5121 MajorCompaction may affect scan's correctness (chunhui shen and Lars H) HBASE-5143 Fix config typo in pluggable load balancer factory (Harsh J) + HBASE-5196 Failure in region split after PONR could cause region hole (Jimmy Xiang) TESTS HBASE-4450 test for number of blocks read: to serve as baseline for expected diff --git a/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index cb2f084aa80..9d21903cbaf 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -25,6 +25,7 @@ import java.lang.reflect.InvocationTargetException; import java.net.InetAddress; import java.net.InetSocketAddress; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; @@ -68,11 +69,13 @@ import org.apache.hadoop.hbase.ipc.HMasterInterface; import org.apache.hadoop.hbase.ipc.HMasterRegionInterface; import org.apache.hadoop.hbase.ipc.ProtocolSignature; import org.apache.hadoop.hbase.ipc.RpcServer; +import org.apache.hadoop.hbase.master.CatalogJanitor.SplitParentFirstComparator; import org.apache.hadoop.hbase.master.handler.CreateTableHandler; import org.apache.hadoop.hbase.master.handler.DeleteTableHandler; import org.apache.hadoop.hbase.master.handler.DisableTableHandler; import org.apache.hadoop.hbase.master.handler.EnableTableHandler; import org.apache.hadoop.hbase.master.handler.ModifyTableHandler; +import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler; import org.apache.hadoop.hbase.master.handler.TableAddFamilyHandler; import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler; import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler; @@ -527,7 +530,11 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server { this.balancer.setClusterStatus(getClusterStatus()); this.balancer.setMasterServices(this); - + + // Fixing up missing daughters if any + status.setStatus("Fixing up missing daughters"); + fixupDaughters(status); + // Start balancer and meta catalog janitor after meta and regions have // been assigned. status.setStatus("Starting balancer and catalog janitor"); @@ -622,6 +629,39 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server { return assigned; } + void fixupDaughters(final MonitoredTask status) throws IOException { + final Map offlineSplitParents = + new HashMap(); + // This visitor collects offline split parents in the .META. table + MetaReader.Visitor visitor = new MetaReader.Visitor() { + @Override + public boolean visit(Result r) throws IOException { + if (r == null || r.isEmpty()) return true; + HRegionInfo info = + MetaReader.parseHRegionInfoFromCatalogResult( + r, HConstants.REGIONINFO_QUALIFIER); + if (info == null) return true; // Keep scanning + if (info.isOffline() && info.isSplit()) { + offlineSplitParents.put(info, r); + } + // Returning true means "keep scanning" + return true; + } + }; + // Run full scan of .META. catalog table passing in our custom visitor + MetaReader.fullScan(this.catalogTracker, visitor); + // Now work on our list of found parents. See if any we can clean up. + int fixups = 0; + for (Map.Entry e : offlineSplitParents.entrySet()) { + fixups += ServerShutdownHandler.fixupDaughters( + e.getValue(), assignmentManager, catalogTracker); + } + if (fixups != 0) { + LOG.info("Scanned the catalog and fixed up " + fixups + + " missing daughter region(s)"); + } + } + /** * Expire a server if we find it is one of the online servers set. * @param sn ServerName to check. diff --git a/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java b/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java index 8f4f4b8f8b9..4307d89a404 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java +++ b/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java @@ -342,31 +342,34 @@ public class ServerShutdownHandler extends EventHandler { * Check that daughter regions are up in .META. and if not, add them. * @param hris All regions for this server in meta. * @param result The contents of the parent row in .META. + * @return the number of daughters missing and fixed * @throws IOException */ - static void fixupDaughters(final Result result, + public static int fixupDaughters(final Result result, final AssignmentManager assignmentManager, final CatalogTracker catalogTracker) throws IOException { - fixupDaughter(result, HConstants.SPLITA_QUALIFIER, assignmentManager, - catalogTracker); - fixupDaughter(result, HConstants.SPLITB_QUALIFIER, assignmentManager, - catalogTracker); + int fixedA = fixupDaughter(result, HConstants.SPLITA_QUALIFIER, + assignmentManager, catalogTracker); + int fixedB = fixupDaughter(result, HConstants.SPLITB_QUALIFIER, + assignmentManager, catalogTracker); + return fixedA + fixedB; } /** * Check individual daughter is up in .META.; fixup if its not. * @param result The contents of the parent row in .META. * @param qualifier Which daughter to check for. + * @return 1 if the daughter is missing and fixed. Otherwise 0 * @throws IOException */ - static void fixupDaughter(final Result result, final byte [] qualifier, + static int fixupDaughter(final Result result, final byte [] qualifier, final AssignmentManager assignmentManager, final CatalogTracker catalogTracker) throws IOException { HRegionInfo daughter = MetaReader.parseHRegionInfoFromCatalogResult(result, qualifier); - if (daughter == null) return; + if (daughter == null) return 0; if (isDaughterMissing(catalogTracker, daughter)) { LOG.info("Fixup; missing daughter " + daughter.getRegionNameAsString()); MetaEditor.addDaughter(catalogTracker, daughter, null); @@ -377,9 +380,11 @@ public class ServerShutdownHandler extends EventHandler { // And assign it. assignmentManager.assign(daughter, true); + return 1; } else { LOG.debug("Daughter " + daughter.getRegionNameAsString() + " present"); } + return 0; } /** diff --git a/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java b/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java index 41f5dff49a2..20a5b332f26 100644 --- a/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java +++ b/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java @@ -68,7 +68,7 @@ class SplitRequest implements Runnable { } catch (Exception e) { try { LOG.info("Running rollback/cleanup of failed split of " + - parent.getRegionNameAsString() + "; " + e.getMessage()); + parent.getRegionNameAsString() + "; " + e.getMessage(), e); if (st.rollback(this.server, this.server)) { LOG.info("Successful rollback of failed split of " + parent.getRegionNameAsString()); @@ -95,4 +95,4 @@ class SplitRequest implements Runnable { server.checkFileSystem(); } } -} \ No newline at end of file +}