HBASE-23315 Miscellaneous HBCK Report page cleanup

* Add a bit of javadoc around SerialReplicationChecker.
 * Minuscule edit to the profiler jsp page and then a bit of doc on how to make it work that might help.
 * Add some detail if AIOOBE (ArrayIndexOutOfBoundsException) in BitSetNode to help w/ debug.
 * Change HbckChore to log region names instead of encoded names; helps doing diagnostics; can take region name and query in shell to find out all about the region according to hbase:meta.
 * Add some fix-it help inline in the HBCK Report page – how to fix.
 * Add counts in procedures page so can see if making progress; move listing of WALs to end of the page.
This commit is contained in:
stack 2019-11-18 15:03:10 -08:00
parent e83bb205f4
commit ca6e67a6de
9 changed files with 110 additions and 67 deletions

View File

@ -136,7 +136,7 @@ import org.apache.hbase.thirdparty.com.google.common.base.Throwables;
* columns: info:merge0001, info:merge0002. You make also see 'mergeA', * columns: info:merge0001, info:merge0002. You make also see 'mergeA',
* and 'mergeB'. This is old form replaced by the new format that allows * and 'mergeB'. This is old form replaced by the new format that allows
* for more than two parents to be merged at a time. * for more than two parents to be merged at a time.
* TODO: Add rep_barrier for serial replication explaination. * TODO: Add rep_barrier for serial replication explaination. See SerialReplicationChecker.
* </pre> * </pre>
* </p> * </p>
* <p> * <p>
@ -607,6 +607,7 @@ public class MetaTableAccessor {
* @param excludeOfflinedSplitParents don't return split parents * @param excludeOfflinedSplitParents don't return split parents
* @return Return list of regioninfos and server addresses. * @return Return list of regioninfos and server addresses.
*/ */
// What happens here when 1M regions in hbase:meta? This won't scale?
public static List<Pair<RegionInfo, ServerName>> getTableRegionsAndLocations( public static List<Pair<RegionInfo, ServerName>> getTableRegionsAndLocations(
Connection connection, @Nullable final TableName tableName, Connection connection, @Nullable final TableName tableName,
final boolean excludeOfflinedSplitParents) throws IOException { final boolean excludeOfflinedSplitParents) throws IOException {
@ -1928,6 +1929,9 @@ public class MetaTableAccessor {
return put; return put;
} }
/**
* See class comment on SerialReplicationChecker
*/
public static void addReplicationBarrier(Put put, long openSeqNum) throws IOException { public static void addReplicationBarrier(Put put, long openSeqNum) throws IOException {
put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY) put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
.setRow(put.getRow()) .setRow(put.getRow())

View File

@ -270,7 +270,7 @@ public class ProfileServlet extends HttpServlet {
resp.getWriter().write( resp.getWriter().write(
"Started [" + event.getInternalName() + "Started [" + event.getInternalName() +
"] profiling. This page will automatically redirect to " + "] profiling. This page will automatically redirect to " +
relativeUrl + " after " + duration + " seconds.\n\ncommand:\n" + relativeUrl + " after " + duration + " seconds.\n\nCommand:\n" +
Joiner.on(" ").join(cmd)); Joiner.on(" ").join(cmd));
// to avoid auto-refresh by ProfileOutputServlet, refreshDelay can be specified // to avoid auto-refresh by ProfileOutputServlet, refreshDelay can be specified
@ -395,4 +395,4 @@ public class ProfileServlet extends HttpServlet {
} }
} }

View File

@ -407,7 +407,15 @@ class BitSetNode {
int wordIndex = bitmapIndex >> ADDRESS_BITS_PER_WORD; int wordIndex = bitmapIndex >> ADDRESS_BITS_PER_WORD;
long value = (1L << bitmapIndex); long value = (1L << bitmapIndex);
modified[wordIndex] |= value; try {
modified[wordIndex] |= value;
} catch (ArrayIndexOutOfBoundsException aioobe) {
// We've gotten a AIOOBE in here; add detail to help debug.
ArrayIndexOutOfBoundsException aioobe2 =
new ArrayIndexOutOfBoundsException("pid=" + procId + ", deleted=" + isDeleted);
aioobe2.initCause(aioobe);
throw aioobe2;
}
if (isDeleted) { if (isDeleted) {
deleted[wordIndex] |= value; deleted[wordIndex] |= value;
} else { } else {
@ -431,4 +439,4 @@ class BitSetNode {
private static long alignDown(final long x) { private static long alignDown(final long x) {
return x & -BITS_PER_WORD; return x & -BITS_PER_WORD;
} }
} }

View File

@ -190,10 +190,10 @@ public class HbckChore extends ScheduledChore {
RegionInfo regionInfo = regionState.getRegion(); RegionInfo regionInfo = regionState.getRegion();
if (master.getTableStateManager() if (master.getTableStateManager()
.isTableState(regionInfo.getTable(), TableState.State.DISABLED)) { .isTableState(regionInfo.getTable(), TableState.State.DISABLED)) {
disabledTableRegions.add(regionInfo.getEncodedName()); disabledTableRegions.add(regionInfo.getRegionNameAsString());
} }
if (regionInfo.isSplitParent()) { if (regionInfo.isSplitParent()) {
splitParentRegions.add(regionInfo.getEncodedName()); splitParentRegions.add(regionInfo.getRegionNameAsString());
} }
HbckRegionInfo.MetaEntry metaEntry = HbckRegionInfo.MetaEntry metaEntry =
new HbckRegionInfo.MetaEntry(regionInfo, regionState.getServerName(), new HbckRegionInfo.MetaEntry(regionInfo, regionState.getServerName(),
@ -212,7 +212,7 @@ public class HbckChore extends ScheduledChore {
String encodedRegionName = RegionInfo.encodeRegionName(regionName); String encodedRegionName = RegionInfo.encodeRegionName(regionName);
HbckRegionInfo hri = regionInfoMap.get(encodedRegionName); HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
if (hri == null) { if (hri == null) {
orphanRegionsOnRS.put(encodedRegionName, serverName); orphanRegionsOnRS.put(RegionInfo.getRegionNameAsString(regionName), serverName);
continue; continue;
} }
hri.addServer(hri.getMetaEntry(), serverName); hri.addServer(hri.getMetaEntry(), serverName);
@ -223,7 +223,6 @@ public class HbckChore extends ScheduledChore {
numRegions, rsReports.size(), orphanRegionsOnFS.size()); numRegions, rsReports.size(), orphanRegionsOnFS.size());
for (Map.Entry<String, HbckRegionInfo> entry : regionInfoMap.entrySet()) { for (Map.Entry<String, HbckRegionInfo> entry : regionInfoMap.entrySet()) {
String encodedRegionName = entry.getKey();
HbckRegionInfo hri = entry.getValue(); HbckRegionInfo hri = entry.getValue();
ServerName locationInMeta = hri.getMetaEntry().getRegionServer(); ServerName locationInMeta = hri.getMetaEntry().getRegionServer();
if (hri.getDeployedOn().size() == 0) { if (hri.getDeployedOn().size() == 0) {
@ -231,21 +230,24 @@ public class HbckChore extends ScheduledChore {
continue; continue;
} }
// skip the offline region which belong to disabled table. // skip the offline region which belong to disabled table.
if (disabledTableRegions.contains(encodedRegionName)) { if (disabledTableRegions.contains(hri.getRegionNameAsString())) {
continue; continue;
} }
// skip the split parent regions // skip the split parent regions
if (splitParentRegions.contains(encodedRegionName)) { if (splitParentRegions.contains(hri.getRegionNameAsString())) {
continue; continue;
} }
// Master thought this region opened, but no regionserver reported it. // Master thought this region opened, but no regionserver reported it.
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, new LinkedList<>())); inconsistentRegions.put(hri.getRegionNameAsString(),
new Pair<>(locationInMeta, new LinkedList<>()));
} else if (hri.getDeployedOn().size() > 1) { } else if (hri.getDeployedOn().size() > 1) {
// More than one regionserver reported opened this region // More than one regionserver reported opened this region
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn())); inconsistentRegions.put(hri.getRegionNameAsString(),
new Pair<>(locationInMeta, hri.getDeployedOn()));
} else if (!hri.getDeployedOn().get(0).equals(locationInMeta)) { } else if (!hri.getDeployedOn().get(0).equals(locationInMeta)) {
// Master thought this region opened on Server1, but regionserver reported Server2 // Master thought this region opened on Server1, but regionserver reported Server2
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn())); inconsistentRegions.put(hri.getRegionNameAsString(),
new Pair<>(locationInMeta, hri.getDeployedOn()));
} }
} }
} }
@ -339,4 +341,4 @@ public class HbckChore extends ScheduledChore {
public long getCheckingEndTimestamp() { public long getCheckingEndTimestamp() {
return this.checkingEndTimestamp; return this.checkingEndTimestamp;
} }
} }

View File

@ -50,12 +50,13 @@ import org.apache.hbase.thirdparty.com.google.common.cache.LoadingCache;
* </p> * </p>
* <p> * <p>
* We record all the open sequence number for a region in a special family in meta, which is called * We record all the open sequence number for a region in a special family in meta, which is called
* 'barrier', so there will be a sequence of open sequence number (b1, b2, b3, ...). We call [bn, * 'rep_barrier', so there will be a sequence of open sequence number (b1, b2, b3, ...). We call
* bn+1) a range, and it is obvious that a region will always be on the same RS within a range. * [bn, bn+1) a range, and it is obvious that a region will always be on the same RS within a
* range.
* <p> * <p>
* When split and merge, we will also record the parent for the generated region(s) in the special * When split and merge, we will also record the parent for the generated region(s) in the special
* family in meta. And also, we will write an extra 'open sequence number' for the parent region(s), * family in meta. And also, we will write an extra 'open sequence number' for the parent
* which is the max sequence id of the region plus one. * region(s), which is the max sequence id of the region plus one.
* </p> * </p>
* </p> * </p>
* <p> * <p>

View File

@ -277,13 +277,15 @@ public class CanaryTool implements Tool, Canary {
public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) { public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) {
incReadFailureCount(); incReadFailureCount();
LOG.error("Read from {} on {} failed", region.getRegionNameAsString(), serverName, e); LOG.error("Read from {} on serverName={} failed",
region.getRegionNameAsString(), serverName, e);
} }
public void publishReadFailure(ServerName serverName, RegionInfo region, public void publishReadFailure(ServerName serverName, RegionInfo region,
ColumnFamilyDescriptor column, Exception e) { ColumnFamilyDescriptor column, Exception e) {
incReadFailureCount(); incReadFailureCount();
LOG.error("Read from {} on {} {} failed", region.getRegionNameAsString(), serverName, LOG.error("Read from {} on serverName={}, columnFamily={} failed",
region.getRegionNameAsString(), serverName,
column.getNameAsString(), e); column.getNameAsString(), e);
} }

View File

@ -78,7 +78,7 @@
<div class="row"> <div class="row">
<div class="page-header"> <div class="page-header">
<p><span>This page displays two reports: the 'HBCK Chore Report' and the 'CatalogJanitor Consistency Issues' report. Only titles show if there are no problems to report. Note some conditions are transitory as regions migrate.</span></p> <p><span>This page displays two reports: the 'HBCK Chore Report' and the 'CatalogJanitor Consistency Issues' report. Only titles show if there are no problems to report. Note some conditions are <em>transitory</em> as regions migrate.</span></p>
</div> </div>
</div> </div>
<div class="row"> <div class="row">
@ -119,7 +119,7 @@
<table class="table table-striped"> <table class="table table-striped">
<tr> <tr>
<th>Region Encoded Name</th> <th>Region Name</th>
<th>Location in META</th> <th>Location in META</th>
<th>Reported Online RegionServers</th> <th>Reported Online RegionServers</th>
</tr> </tr>
@ -142,10 +142,18 @@
<h2>Orphan Regions on RegionServer</h2> <h2>Orphan Regions on RegionServer</h2>
</div> </div>
</div> </div>
<p>
<span>
The below are Regions we've lost account of. To be safe, run bulk load of any data found in these Region orphan directories back into the HBase cluster.
First make sure hbase:meta is in healthy state; run 'hbck2 fixMeta' to be sure. Once this is done, per Region below, run a bulk
load -- '$ hbase completebulkload REGION_DIR_PATH TABLE_NAME' -- and then delete the desiccated directory content (HFiles are removed upon successful load; all that is left are empty directories
and occasionally a seqid marking file).
</span>
</p>
<table class="table table-striped"> <table class="table table-striped">
<tr> <tr>
<th>Region Encoded Name</th> <th>Region Name</th>
<th>Reported Online RegionServer</th> <th>Reported Online RegionServer</th>
</tr> </tr>
<% for (Map.Entry<String, ServerName> entry : orphanRegionsOnRS.entrySet()) { %> <% for (Map.Entry<String, ServerName> entry : orphanRegionsOnRS.entrySet()) { %>

View File

@ -81,11 +81,14 @@
<th>Errors</th> <th>Errors</th>
<th>Parameters</th> <th>Parameters</th>
</tr> </tr>
<% for (Procedure<?> proc : procedures) { <%
int displayCount = 0;
for (Procedure<?> proc : procedures) {
// Don't show SUCCESS procedures. // Don't show SUCCESS procedures.
if (proc.isSuccess()) { if (proc.isSuccess()) {
continue; continue;
} }
displayCount++;
%> %>
<tr> <tr>
<td><%= proc.getProcId() %></td> <td><%= proc.getProcId() %></td>
@ -99,9 +102,63 @@
<td><%= escapeXml(proc.toString()) %></td> <td><%= escapeXml(proc.toString()) %></td>
</tr> </tr>
<% } %> <% } %>
<%
if (displayCount > 0) {
%>
<p><%= displayCount %> procedure(s).</p>
<%
}
%>
</table> </table>
</div> </div>
<br /> <br />
<div class="container-fluid content">
<div class="row">
<div class="page-header">
<h1>Locks</h1>
</div>
</div>
<%
if (lockedResources.size() > 0) {
%>
<p><%= lockedResources.size() %> lock(s).</p>
<%
}
%>
<% for (LockedResource lockedResource : lockedResources) { %>
<h2><%= lockedResource.getResourceType() %>: <%= lockedResource.getResourceName() %></h2>
<%
switch (lockedResource.getLockType()) {
case EXCLUSIVE:
%>
<p>Lock type: EXCLUSIVE</p>
<p>Owner procedure: <%= escapeXml(lockedResource.getExclusiveLockOwnerProcedure().toStringDetails()) %></p>
<%
break;
case SHARED:
%>
<p>Lock type: SHARED</p>
<p>Number of shared locks: <%= lockedResource.getSharedLockCount() %></p>
<%
break;
}
List<Procedure<?>> waitingProcedures = lockedResource.getWaitingProcedures();
if (!waitingProcedures.isEmpty()) {
%>
<h3>Waiting procedures</h3>
<table class="table table-striped" width="90%" >
<% for (Procedure<?> proc : procedures) { %>
<tr>
<td><%= escapeXml(proc.toStringDetails()) %></td>
</tr>
<% } %>
</table>
<% } %>
<% } %>
</div>
<br />
<div class="container-fluid content"> <div class="container-fluid content">
<div class="row"> <div class="row">
<div class="page-header"> <div class="page-header">
@ -206,44 +263,5 @@
</div> </div>
</div> </div>
<br /> <br />
<div class="container-fluid content">
<div class="row">
<div class="page-header">
<h1>Locks</h1>
</div>
</div>
<% for (LockedResource lockedResource : lockedResources) { %>
<h2><%= lockedResource.getResourceType() %>: <%= lockedResource.getResourceName() %></h2>
<%
switch (lockedResource.getLockType()) {
case EXCLUSIVE:
%>
<p>Lock type: EXCLUSIVE</p>
<p>Owner procedure: <%= escapeXml(lockedResource.getExclusiveLockOwnerProcedure().toStringDetails()) %></p>
<%
break;
case SHARED:
%>
<p>Lock type: SHARED</p>
<p>Number of shared locks: <%= lockedResource.getSharedLockCount() %></p>
<%
break;
}
List<Procedure<?>> waitingProcedures = lockedResource.getWaitingProcedures();
if (!waitingProcedures.isEmpty()) {
%>
<h3>Waiting procedures</h3>
<table class="table table-striped" width="90%" >
<% for (Procedure<?> proc : procedures) { %>
<tr>
<td><%= escapeXml(proc.toStringDetails()) %></td>
</tr>
<% } %>
</table>
<% } %>
<% } %>
</div>
<jsp:include page="footer.jsp" /> <jsp:include page="footer.jsp" />

View File

@ -69,7 +69,7 @@ public class TestHbckChore extends TestAssignmentManagerBase {
@Test @Test
public void testForMeta() { public void testForMeta() {
byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName(); byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName();
String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getEncodedName(); String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString();
List<ServerName> serverNames = master.getServerManager().getOnlineServersList(); List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
assertEquals(NSERVERS, serverNames.size()); assertEquals(NSERVERS, serverNames.size());
@ -96,7 +96,7 @@ public class TestHbckChore extends TestAssignmentManagerBase {
public void testForUserTable() throws Exception { public void testForUserTable() throws Exception {
TableName tableName = TableName.valueOf("testForUserTable"); TableName tableName = TableName.valueOf("testForUserTable");
RegionInfo hri = createRegionInfo(tableName, 1); RegionInfo hri = createRegionInfo(tableName, 1);
String regionName = hri.getEncodedName(); String regionName = hri.getRegionNameAsString();
rsDispatcher.setMockRsExecutor(new GoodRsExecutor()); rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
Future<byte[]> future = submitProcedure(createAssignProcedure(hri)); Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
waitOnFuture(future); waitOnFuture(future);
@ -154,7 +154,7 @@ public class TestHbckChore extends TestAssignmentManagerBase {
public void testForDisabledTable() throws Exception { public void testForDisabledTable() throws Exception {
TableName tableName = TableName.valueOf("testForDisabledTable"); TableName tableName = TableName.valueOf("testForDisabledTable");
RegionInfo hri = createRegionInfo(tableName, 1); RegionInfo hri = createRegionInfo(tableName, 1);
String regionName = hri.getEncodedName(); String regionName = hri.getRegionNameAsString();
rsDispatcher.setMockRsExecutor(new GoodRsExecutor()); rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
Future<byte[]> future = submitProcedure(createAssignProcedure(hri)); Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
waitOnFuture(future); waitOnFuture(future);