From ca6e67a6de242d681b6e6f3d53a0db5b10d1450a Mon Sep 17 00:00:00 2001
From: stack
Date: Mon, 18 Nov 2019 15:03:10 -0800
Subject: [PATCH] HBASE-23315 Miscellaneous HBCK Report page cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Add a bit of javadoc around SerialReplicationChecker.
* Minuscule edit to the profiler jsp page, plus a bit of doc on how to make it work that might help.
* Add some detail if ArrayIndexOutOfBoundsException (AIOOBE) updating BitSetNode to help w/ debug.
* Change HbckChore to log region names instead of encoded names; helps doing diagnostics; can take region name and query in shell to find out all about the region according to hbase:meta.
* Add some fix-it help inline in the HBCK Report page – how to fix.
* Add counts in procedures page so can see if making progress; move listing of WALs to end of the page.
---
.../hadoop/hbase/MetaTableAccessor.java | 6 +-
.../hadoop/hbase/http/ProfileServlet.java | 4 +-
.../hbase/procedure2/store/BitSetNode.java | 12 ++-
.../apache/hadoop/hbase/master/HbckChore.java | 22 +++--
.../SerialReplicationChecker.java | 9 +-
.../apache/hadoop/hbase/tool/CanaryTool.java | 6 +-
.../resources/hbase-webapps/master/hbck.jsp | 14 ++-
.../hbase-webapps/master/procedures.jsp | 98 +++++++++++--------
.../master/assignment/TestHbckChore.java | 6 +-
9 files changed, 110 insertions(+), 67 deletions(-)
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
index ad54324c7b3..f93c3cce9fa 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
@@ -136,7 +136,7 @@ import org.apache.hbase.thirdparty.com.google.common.base.Throwables;
* columns: info:merge0001, info:merge0002. You make also see 'mergeA',
* and 'mergeB'. This is old form replaced by the new format that allows
* for more than two parents to be merged at a time.
- * TODO: Add rep_barrier for serial replication explaination.
+ * TODO: Add rep_barrier for serial replication explanation. See SerialReplicationChecker.
*
*
*
@@ -607,6 +607,7 @@ public class MetaTableAccessor {
* @param excludeOfflinedSplitParents don't return split parents
* @return Return list of regioninfos and server addresses.
*/
+ // What happens here when there are 1M regions in hbase:meta? This won't scale.
public static List> getTableRegionsAndLocations(
Connection connection, @Nullable final TableName tableName,
final boolean excludeOfflinedSplitParents) throws IOException {
@@ -1928,6 +1929,9 @@ public class MetaTableAccessor {
return put;
}
+ /**
+ * See class comment on SerialReplicationChecker
+ */
public static void addReplicationBarrier(Put put, long openSeqNum) throws IOException {
put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
.setRow(put.getRow())
diff --git a/hbase-http/src/main/java/org/apache/hadoop/hbase/http/ProfileServlet.java b/hbase-http/src/main/java/org/apache/hadoop/hbase/http/ProfileServlet.java
index 642d05a314a..fc75530cc50 100644
--- a/hbase-http/src/main/java/org/apache/hadoop/hbase/http/ProfileServlet.java
+++ b/hbase-http/src/main/java/org/apache/hadoop/hbase/http/ProfileServlet.java
@@ -270,7 +270,7 @@ public class ProfileServlet extends HttpServlet {
resp.getWriter().write(
"Started [" + event.getInternalName() +
"] profiling. This page will automatically redirect to " +
- relativeUrl + " after " + duration + " seconds.\n\ncommand:\n" +
+ relativeUrl + " after " + duration + " seconds.\n\nCommand:\n" +
Joiner.on(" ").join(cmd));
// to avoid auto-refresh by ProfileOutputServlet, refreshDelay can be specified
@@ -395,4 +395,4 @@ public class ProfileServlet extends HttpServlet {
}
-}
\ No newline at end of file
+}
diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/BitSetNode.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/BitSetNode.java
index f42199b740c..78d2d91ca86 100644
--- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/BitSetNode.java
+++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/store/BitSetNode.java
@@ -407,7 +407,15 @@ class BitSetNode {
int wordIndex = bitmapIndex >> ADDRESS_BITS_PER_WORD;
long value = (1L << bitmapIndex);
- modified[wordIndex] |= value;
+ try {
+ modified[wordIndex] |= value;
+ } catch (ArrayIndexOutOfBoundsException aioobe) {
+ // We've gotten an AIOOBE in here; add detail to help debug.
+ ArrayIndexOutOfBoundsException aioobe2 =
+ new ArrayIndexOutOfBoundsException("pid=" + procId + ", deleted=" + isDeleted);
+ aioobe2.initCause(aioobe);
+ throw aioobe2;
+ }
if (isDeleted) {
deleted[wordIndex] |= value;
} else {
@@ -431,4 +439,4 @@ class BitSetNode {
private static long alignDown(final long x) {
return x & -BITS_PER_WORD;
}
-}
\ No newline at end of file
+}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java
index fc0111d71fb..cf4368581e7 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HbckChore.java
@@ -190,10 +190,10 @@ public class HbckChore extends ScheduledChore {
RegionInfo regionInfo = regionState.getRegion();
if (master.getTableStateManager()
.isTableState(regionInfo.getTable(), TableState.State.DISABLED)) {
- disabledTableRegions.add(regionInfo.getEncodedName());
+ disabledTableRegions.add(regionInfo.getRegionNameAsString());
}
if (regionInfo.isSplitParent()) {
- splitParentRegions.add(regionInfo.getEncodedName());
+ splitParentRegions.add(regionInfo.getRegionNameAsString());
}
HbckRegionInfo.MetaEntry metaEntry =
new HbckRegionInfo.MetaEntry(regionInfo, regionState.getServerName(),
@@ -212,7 +212,7 @@ public class HbckChore extends ScheduledChore {
String encodedRegionName = RegionInfo.encodeRegionName(regionName);
HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
if (hri == null) {
- orphanRegionsOnRS.put(encodedRegionName, serverName);
+ orphanRegionsOnRS.put(RegionInfo.getRegionNameAsString(regionName), serverName);
continue;
}
hri.addServer(hri.getMetaEntry(), serverName);
@@ -223,7 +223,6 @@ public class HbckChore extends ScheduledChore {
numRegions, rsReports.size(), orphanRegionsOnFS.size());
for (Map.Entry entry : regionInfoMap.entrySet()) {
- String encodedRegionName = entry.getKey();
HbckRegionInfo hri = entry.getValue();
ServerName locationInMeta = hri.getMetaEntry().getRegionServer();
if (hri.getDeployedOn().size() == 0) {
@@ -231,21 +230,24 @@ public class HbckChore extends ScheduledChore {
continue;
}
// skip the offline region which belong to disabled table.
- if (disabledTableRegions.contains(encodedRegionName)) {
+ if (disabledTableRegions.contains(hri.getRegionNameAsString())) {
continue;
}
// skip the split parent regions
- if (splitParentRegions.contains(encodedRegionName)) {
+ if (splitParentRegions.contains(hri.getRegionNameAsString())) {
continue;
}
// Master thought this region opened, but no regionserver reported it.
- inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, new LinkedList<>()));
+ inconsistentRegions.put(hri.getRegionNameAsString(),
+ new Pair<>(locationInMeta, new LinkedList<>()));
} else if (hri.getDeployedOn().size() > 1) {
// More than one regionserver reported opened this region
- inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
+ inconsistentRegions.put(hri.getRegionNameAsString(),
+ new Pair<>(locationInMeta, hri.getDeployedOn()));
} else if (!hri.getDeployedOn().get(0).equals(locationInMeta)) {
// Master thought this region opened on Server1, but regionserver reported Server2
- inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
+ inconsistentRegions.put(hri.getRegionNameAsString(),
+ new Pair<>(locationInMeta, hri.getDeployedOn()));
}
}
}
@@ -339,4 +341,4 @@ public class HbckChore extends ScheduledChore {
public long getCheckingEndTimestamp() {
return this.checkingEndTimestamp;
}
-}
\ No newline at end of file
+}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java
index 4b88050ee26..321bbb420bc 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/SerialReplicationChecker.java
@@ -50,12 +50,13 @@ import org.apache.hbase.thirdparty.com.google.common.cache.LoadingCache;
*
*
* We record all the open sequence number for a region in a special family in meta, which is called
- * 'barrier', so there will be a sequence of open sequence number (b1, b2, b3, ...). We call [bn,
- * bn+1) a range, and it is obvious that a region will always be on the same RS within a range.
+ * 'rep_barrier', so there will be a sequence of open sequence number (b1, b2, b3, ...). We call
+ * [bn, bn+1) a range, and it is obvious that a region will always be on the same RS within a
+ * range.
*
* When split and merge, we will also record the parent for the generated region(s) in the special
- * family in meta. And also, we will write an extra 'open sequence number' for the parent region(s),
- * which is the max sequence id of the region plus one.
+ * family in meta. And also, we will write an extra 'open sequence number' for the parent
+ * region(s), which is the max sequence id of the region plus one.
*
*
*
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java
index 4f59cf33284..af9b879bfe6 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java
@@ -277,13 +277,15 @@ public class CanaryTool implements Tool, Canary {
public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) {
incReadFailureCount();
- LOG.error("Read from {} on {} failed", region.getRegionNameAsString(), serverName, e);
+ LOG.error("Read from {} on serverName={} failed",
+ region.getRegionNameAsString(), serverName, e);
}
public void publishReadFailure(ServerName serverName, RegionInfo region,
ColumnFamilyDescriptor column, Exception e) {
incReadFailureCount();
- LOG.error("Read from {} on {} {} failed", region.getRegionNameAsString(), serverName,
+ LOG.error("Read from {} on serverName={}, columnFamily={} failed",
+ region.getRegionNameAsString(), serverName,
column.getNameAsString(), e);
}
diff --git a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
index 0ac66782dc1..f89aac8bce6 100644
--- a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
+++ b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
@@ -78,7 +78,7 @@
@@ -119,7 +119,7 @@
- Region Encoded Name |
+ Region Name |
Location in META |
Reported Online RegionServers |
@@ -142,10 +142,18 @@
Orphan Regions on RegionServer
+
+
+ The below are Regions we've lost account of. To be safe, run bulk load of any data found in these Region orphan directories back into the HBase cluster.
+ First make sure hbase:meta is in a healthy state; run 'hbck2 fixMeta' to be sure. Once this is done, per Region below, run a bulk
+ load -- '$ hbase completebulkload REGION_DIR_PATH TABLE_NAME' -- and then delete the desiccated directory content (HFiles are removed upon successful load; all that is left are empty directories
+ and occasionally a seqid marking file).
+
+
- Region Encoded Name |
+ Region Name |
Reported Online RegionServer |
<% for (Map.Entry entry : orphanRegionsOnRS.entrySet()) { %>
diff --git a/hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp b/hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp
index c918b303939..ea252cff39a 100644
--- a/hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp
+++ b/hbase-server/src/main/resources/hbase-webapps/master/procedures.jsp
@@ -81,11 +81,14 @@
Errors |
Parameters |
- <% for (Procedure> proc : procedures) {
+ <%
+ int displayCount = 0;
+ for (Procedure> proc : procedures) {
// Don't show SUCCESS procedures.
if (proc.isSuccess()) {
continue;
}
+ displayCount++;
%>
<%= proc.getProcId() %> |
@@ -99,9 +102,63 @@
<%= escapeXml(proc.toString()) %> |
<% } %>
+ <%
+ if (displayCount > 0) {
+ %>
+ <%= displayCount %> procedure(s).
+ <%
+ }
+ %>
+
+
+
+
+ <%
+ if (lockedResources.size() > 0) {
+ %>
+
<%= lockedResources.size() %> lock(s).
+ <%
+ }
+ %>
+ <% for (LockedResource lockedResource : lockedResources) { %>
+
<%= lockedResource.getResourceType() %>: <%= lockedResource.getResourceName() %>
+ <%
+ switch (lockedResource.getLockType()) {
+ case EXCLUSIVE:
+ %>
+
Lock type: EXCLUSIVE
+
Owner procedure: <%= escapeXml(lockedResource.getExclusiveLockOwnerProcedure().toStringDetails()) %>
+ <%
+ break;
+ case SHARED:
+ %>
+
Lock type: SHARED
+
Number of shared locks: <%= lockedResource.getSharedLockCount() %>
+ <%
+ break;
+ }
+
+ List
> waitingProcedures = lockedResource.getWaitingProcedures();
+
+ if (!waitingProcedures.isEmpty()) {
+ %>
+ Waiting procedures
+
+ <% for (Procedure> proc : procedures) { %>
+
+ <%= escapeXml(proc.toStringDetails()) %> |
+
+ <% } %>
+
+ <% } %>
+ <% } %>
+
+
-
-
-
-
- <% for (LockedResource lockedResource : lockedResources) { %>
-
<%= lockedResource.getResourceType() %>: <%= lockedResource.getResourceName() %>
- <%
- switch (lockedResource.getLockType()) {
- case EXCLUSIVE:
- %>
-
Lock type: EXCLUSIVE
-
Owner procedure: <%= escapeXml(lockedResource.getExclusiveLockOwnerProcedure().toStringDetails()) %>
- <%
- break;
- case SHARED:
- %>
-
Lock type: SHARED
-
Number of shared locks: <%= lockedResource.getSharedLockCount() %>
- <%
- break;
- }
-
- List
> waitingProcedures = lockedResource.getWaitingProcedures();
-
- if (!waitingProcedures.isEmpty()) {
- %>
- Waiting procedures
-
- <% for (Procedure> proc : procedures) { %>
-
- <%= escapeXml(proc.toStringDetails()) %> |
-
- <% } %>
-
- <% } %>
- <% } %>
-
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestHbckChore.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestHbckChore.java
index 4c0a194e4d1..ea705082494 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestHbckChore.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestHbckChore.java
@@ -69,7 +69,7 @@ public class TestHbckChore extends TestAssignmentManagerBase {
@Test
public void testForMeta() {
byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName();
- String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getEncodedName();
+ String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString();
List
serverNames = master.getServerManager().getOnlineServersList();
assertEquals(NSERVERS, serverNames.size());
@@ -96,7 +96,7 @@ public class TestHbckChore extends TestAssignmentManagerBase {
public void testForUserTable() throws Exception {
TableName tableName = TableName.valueOf("testForUserTable");
RegionInfo hri = createRegionInfo(tableName, 1);
- String regionName = hri.getEncodedName();
+ String regionName = hri.getRegionNameAsString();
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
Future future = submitProcedure(createAssignProcedure(hri));
waitOnFuture(future);
@@ -154,7 +154,7 @@ public class TestHbckChore extends TestAssignmentManagerBase {
public void testForDisabledTable() throws Exception {
TableName tableName = TableName.valueOf("testForDisabledTable");
RegionInfo hri = createRegionInfo(tableName, 1);
- String regionName = hri.getEncodedName();
+ String regionName = hri.getRegionNameAsString();
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
Future future = submitProcedure(createAssignProcedure(hri));
waitOnFuture(future);