From ca6e67a6de242d681b6e6f3d53a0db5b10d1450a Mon Sep 17 00:00:00 2001 From: stack Date: Mon, 18 Nov 2019 15:03:10 -0800 Subject: [PATCH] HBASE-23315 Miscellaneous HBCK Report page cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add a bit of javadoc around SerialReplicationChecker. * Miniscule edit to the profiler jsp page and then a bit of doc on how to make it work that might help. * Add some detail if NPE getting BitSetNode to help w/ debug. * Change HbckChore to log region names instead of encoded names; helps doing diagnostics; can take region name and query in shell to find out all about the region according to hbase:meta. * Add some fix-it help inline in the HBCK Report page – how to fix. * Add counts in procedures page so can see if making progress; move listing of WALs to end of the page. --- .../hadoop/hbase/MetaTableAccessor.java | 6 +- .../hadoop/hbase/http/ProfileServlet.java | 4 +- .../hbase/procedure2/store/BitSetNode.java | 12 ++- .../apache/hadoop/hbase/master/HbckChore.java | 22 +++-- .../SerialReplicationChecker.java | 9 +- .../apache/hadoop/hbase/tool/CanaryTool.java | 6 +- .../resources/hbase-webapps/master/hbck.jsp | 14 ++- .../hbase-webapps/master/procedures.jsp | 98 +++++++++++-------- .../master/assignment/TestHbckChore.java | 6 +- 9 files changed, 110 insertions(+), 67 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java index ad54324c7b3..f93c3cce9fa 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java @@ -136,7 +136,7 @@ import org.apache.hbase.thirdparty.com.google.common.base.Throwables; * columns: info:merge0001, info:merge0002. You make also see 'mergeA', * and 'mergeB'. 
This is old form replaced by the new format that allows * for more than two parents to be merged at a time. - * TODO: Add rep_barrier for serial replication explaination. + * TODO: Add rep_barrier for serial replication explanation. See SerialReplicationChecker. * *

*

@@ -607,6 +607,7 @@ public class MetaTableAccessor { * @param excludeOfflinedSplitParents don't return split parents * @return Return list of regioninfos and server addresses. */ + // What happens here when 1M regions in hbase:meta? This won't scale? public static List> getTableRegionsAndLocations( Connection connection, @Nullable final TableName tableName, final boolean excludeOfflinedSplitParents) throws IOException { @@ -1928,6 +1929,9 @@ public class MetaTableAccessor { return put; } + /** + * See class comment on SerialReplicationChecker + */ public static void addReplicationBarrier(Put put, long openSeqNum) throws IOException { put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY) .setRow(put.getRow()) diff --git a/hbase-http/src/main/java/org/apache/hadoop/hbase/http/ProfileServlet.java b/hbase-http/src/main/java/org/apache/hadoop/hbase/http/ProfileServlet.java index 642d05a314a..fc75530cc50 100644 --- a/hbase-http/src/main/java/org/apache/hadoop/hbase/http/ProfileServlet.java +++ b/hbase-http/src/main/java/org/apache/hadoop/hbase/http/ProfileServlet.java @@ -270,7 +270,7 @@ public class ProfileServlet extends HttpServlet { resp.getWriter().write( "Started [" + event.getInternalName() + "] profiling. 
This page will automatically redirect to " + - relativeUrl + " after " + duration + " seconds.\n\ncommand:\n" + + relativeUrl + " after " + duration + " seconds.\n\nCommand:\n" + Joiner.on(" ").join(cmd)); // to avoid auto-refresh by ProfileOutputServlet, refreshDelay can be specified @@ -395,4 +395,4 @@ public class ProfileServlet extends HttpServlet { } -} \ No newline at end of file +} diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/BitSetNode.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/BitSetNode.java index f42199b740c..78d2d91ca86 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/BitSetNode.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/BitSetNode.java @@ -407,7 +407,15 @@ class BitSetNode { int wordIndex = bitmapIndex >> ADDRESS_BITS_PER_WORD; long value = (1L << bitmapIndex); - modified[wordIndex] |= value; + try { + modified[wordIndex] |= value; + } catch (ArrayIndexOutOfBoundsException aioobe) { + // We've gotten a AIOOBE in here; add detail to help debug. 
+ ArrayIndexOutOfBoundsException aioobe2 = + new ArrayIndexOutOfBoundsException("pid=" + procId + ", deleted=" + isDeleted); + aioobe2.initCause(aioobe); + throw aioobe2; + } if (isDeleted) { deleted[wordIndex] |= value; } else { @@ -431,4 +439,4 @@ class BitSetNode { private static long alignDown(final long x) { return x & -BITS_PER_WORD; } -} \ No newline at end of file +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java index fc0111d71fb..cf4368581e7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java @@ -190,10 +190,10 @@ public class HbckChore extends ScheduledChore { RegionInfo regionInfo = regionState.getRegion(); if (master.getTableStateManager() .isTableState(regionInfo.getTable(), TableState.State.DISABLED)) { - disabledTableRegions.add(regionInfo.getEncodedName()); + disabledTableRegions.add(regionInfo.getRegionNameAsString()); } if (regionInfo.isSplitParent()) { - splitParentRegions.add(regionInfo.getEncodedName()); + splitParentRegions.add(regionInfo.getRegionNameAsString()); } HbckRegionInfo.MetaEntry metaEntry = new HbckRegionInfo.MetaEntry(regionInfo, regionState.getServerName(), @@ -212,7 +212,7 @@ public class HbckChore extends ScheduledChore { String encodedRegionName = RegionInfo.encodeRegionName(regionName); HbckRegionInfo hri = regionInfoMap.get(encodedRegionName); if (hri == null) { - orphanRegionsOnRS.put(encodedRegionName, serverName); + orphanRegionsOnRS.put(RegionInfo.getRegionNameAsString(regionName), serverName); continue; } hri.addServer(hri.getMetaEntry(), serverName); @@ -223,7 +223,6 @@ public class HbckChore extends ScheduledChore { numRegions, rsReports.size(), orphanRegionsOnFS.size()); for (Map.Entry entry : regionInfoMap.entrySet()) { - String encodedRegionName = entry.getKey(); HbckRegionInfo hri = 
entry.getValue(); ServerName locationInMeta = hri.getMetaEntry().getRegionServer(); if (hri.getDeployedOn().size() == 0) { @@ -231,21 +230,24 @@ public class HbckChore extends ScheduledChore { continue; } // skip the offline region which belong to disabled table. - if (disabledTableRegions.contains(encodedRegionName)) { + if (disabledTableRegions.contains(hri.getRegionNameAsString())) { continue; } // skip the split parent regions - if (splitParentRegions.contains(encodedRegionName)) { + if (splitParentRegions.contains(hri.getRegionNameAsString())) { continue; } // Master thought this region opened, but no regionserver reported it. - inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, new LinkedList<>())); + inconsistentRegions.put(hri.getRegionNameAsString(), + new Pair<>(locationInMeta, new LinkedList<>())); } else if (hri.getDeployedOn().size() > 1) { // More than one regionserver reported opened this region - inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn())); + inconsistentRegions.put(hri.getRegionNameAsString(), + new Pair<>(locationInMeta, hri.getDeployedOn())); } else if (!hri.getDeployedOn().get(0).equals(locationInMeta)) { // Master thought this region opened on Server1, but regionserver reported Server2 - inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn())); + inconsistentRegions.put(hri.getRegionNameAsString(), + new Pair<>(locationInMeta, hri.getDeployedOn())); } } } @@ -339,4 +341,4 @@ public class HbckChore extends ScheduledChore { public long getCheckingEndTimestamp() { return this.checkingEndTimestamp; } -} \ No newline at end of file +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java index 4b88050ee26..321bbb420bc 100644 --- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java @@ -50,12 +50,13 @@ import org.apache.hbase.thirdparty.com.google.common.cache.LoadingCache; *

*

* We record all the open sequence number for a region in a special family in meta, which is called - * 'barrier', so there will be a sequence of open sequence number (b1, b2, b3, ...). We call [bn, - * bn+1) a range, and it is obvious that a region will always be on the same RS within a range. + * 'rep_barrier', so there will be a sequence of open sequence number (b1, b2, b3, ...). We call + * [bn, bn+1) a range, and it is obvious that a region will always be on the same RS within a + * range. *

* When split and merge, we will also record the parent for the generated region(s) in the special - * family in meta. And also, we will write an extra 'open sequence number' for the parent region(s), - * which is the max sequence id of the region plus one. + * family in meta. And also, we will write an extra 'open sequence number' for the parent + * region(s), which is the max sequence id of the region plus one. *

*

*

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java index 4f59cf33284..af9b879bfe6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java @@ -277,13 +277,15 @@ public class CanaryTool implements Tool, Canary { public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) { incReadFailureCount(); - LOG.error("Read from {} on {} failed", region.getRegionNameAsString(), serverName, e); + LOG.error("Read from {} on serverName={} failed", + region.getRegionNameAsString(), serverName, e); } public void publishReadFailure(ServerName serverName, RegionInfo region, ColumnFamilyDescriptor column, Exception e) { incReadFailureCount(); - LOG.error("Read from {} on {} {} failed", region.getRegionNameAsString(), serverName, + LOG.error("Read from {} on serverName={}, columnFamily={} failed", + region.getRegionNameAsString(), serverName, column.getNameAsString(), e); } diff --git a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp index 0ac66782dc1..f89aac8bce6 100644 --- a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp +++ b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp @@ -78,7 +78,7 @@

@@ -119,7 +119,7 @@ - + @@ -142,10 +142,18 @@

Orphan Regions on RegionServer

+

+ + The below are Regions we've lost account of. To be safe, run bulk load of any data found in these Region orphan directories back into the HBase cluster. + First make sure hbase:meta is in healthy state; run 'hbck2 fixMeta' to be sure. Once this is done, per Region below, run a bulk + load -- '$ hbase completebulkload REGION_DIR_PATH TABLE_NAME' -- and then delete the desiccated directory content (HFiles are removed upon successful load; all that is left are empty directories + and occasionally a seqid marking file). + +

Region Encoded NameRegion Name Location in META Reported Online RegionServers
- + <% for (Map.Entry entry : orphanRegionsOnRS.entrySet()) { %> diff --git a/hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp b/hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp index c918b303939..ea252cff39a 100644 --- a/hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp +++ b/hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp @@ -81,11 +81,14 @@ - <% for (Procedure proc : procedures) { + <% + int displayCount = 0; + for (Procedure proc : procedures) { // Don't show SUCCESS procedures. if (proc.isSuccess()) { continue; } + displayCount++; %> @@ -99,9 +102,63 @@ <% } %> + <% + if (displayCount > 0) { + %> +

<%= displayCount %> procedure(s).

+ <% + } + %>
Region Encoded NameRegion Name Reported Online RegionServer
Errors Parameters
<%= proc.getProcId() %><%= escapeXml(proc.toString()) %>

+
+
+ +
+ <% + if (lockedResources.size() > 0) { + %> +

<%= lockedResources.size() %> lock(s).

+ <% + } + %> + <% for (LockedResource lockedResource : lockedResources) { %> +

<%= lockedResource.getResourceType() %>: <%= lockedResource.getResourceName() %>

+ <% + switch (lockedResource.getLockType()) { + case EXCLUSIVE: + %> +

Lock type: EXCLUSIVE

+

Owner procedure: <%= escapeXml(lockedResource.getExclusiveLockOwnerProcedure().toStringDetails()) %>

+ <% + break; + case SHARED: + %> +

Lock type: SHARED

+

Number of shared locks: <%= lockedResource.getSharedLockCount() %>

+ <% + break; + } + + List> waitingProcedures = lockedResource.getWaitingProcedures(); + + if (!waitingProcedures.isEmpty()) { + %> +

Waiting procedures

+ + <% for (Procedure proc : procedures) { %> + + + + <% } %> +
<%= escapeXml(proc.toStringDetails()) %>
+ <% } %> + <% } %> +
+

-
-
- -
- <% for (LockedResource lockedResource : lockedResources) { %> -

<%= lockedResource.getResourceType() %>: <%= lockedResource.getResourceName() %>

- <% - switch (lockedResource.getLockType()) { - case EXCLUSIVE: - %> -

Lock type: EXCLUSIVE

-

Owner procedure: <%= escapeXml(lockedResource.getExclusiveLockOwnerProcedure().toStringDetails()) %>

- <% - break; - case SHARED: - %> -

Lock type: SHARED

-

Number of shared locks: <%= lockedResource.getSharedLockCount() %>

- <% - break; - } - - List> waitingProcedures = lockedResource.getWaitingProcedures(); - - if (!waitingProcedures.isEmpty()) { - %> -

Waiting procedures

- - <% for (Procedure proc : procedures) { %> - - - - <% } %> -
<%= escapeXml(proc.toStringDetails()) %>
- <% } %> - <% } %> -
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestHbckChore.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestHbckChore.java index 4c0a194e4d1..ea705082494 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestHbckChore.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestHbckChore.java @@ -69,7 +69,7 @@ public class TestHbckChore extends TestAssignmentManagerBase { @Test public void testForMeta() { byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName(); - String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getEncodedName(); + String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString(); List serverNames = master.getServerManager().getOnlineServersList(); assertEquals(NSERVERS, serverNames.size()); @@ -96,7 +96,7 @@ public class TestHbckChore extends TestAssignmentManagerBase { public void testForUserTable() throws Exception { TableName tableName = TableName.valueOf("testForUserTable"); RegionInfo hri = createRegionInfo(tableName, 1); - String regionName = hri.getEncodedName(); + String regionName = hri.getRegionNameAsString(); rsDispatcher.setMockRsExecutor(new GoodRsExecutor()); Future future = submitProcedure(createAssignProcedure(hri)); waitOnFuture(future); @@ -154,7 +154,7 @@ public class TestHbckChore extends TestAssignmentManagerBase { public void testForDisabledTable() throws Exception { TableName tableName = TableName.valueOf("testForDisabledTable"); RegionInfo hri = createRegionInfo(tableName, 1); - String regionName = hri.getEncodedName(); + String regionName = hri.getRegionNameAsString(); rsDispatcher.setMockRsExecutor(new GoodRsExecutor()); Future future = submitProcedure(createAssignProcedure(hri)); waitOnFuture(future);