HBASE-23315 Miscellaneous HBCK Report page cleanup
* Add a bit of javadoc around SerialReplicationChecker. * Minuscule edit to the profiler jsp page and then a bit of doc on how to make it work that might help. * Add some detail if AIOOBE (ArrayIndexOutOfBoundsException) in BitSetNode to help w/ debug. * Change HbckChore to log region names instead of encoded names; helps doing diagnostics; can take region name and query in shell to find out all about the region according to hbase:meta. * Add some fix-it help inline in the HBCK Report page – how to fix. * Add counts in procedures page so can see if making progress; move listing of WALs to end of the page.
This commit is contained in:
parent
c6ad71e256
commit
70771b603e
|
@ -137,7 +137,7 @@ import org.apache.hbase.thirdparty.com.google.common.base.Throwables;
|
|||
* columns: info:merge0001, info:merge0002. You may also see 'mergeA',
|
||||
* and 'mergeB'. This is old form replaced by the new format that allows
|
||||
* for more than two parents to be merged at a time.
|
||||
* TODO: Add rep_barrier for serial replication explaination.
|
||||
* TODO: Add rep_barrier for serial replication explanation. See SerialReplicationChecker.
|
||||
* </pre>
|
||||
* </p>
|
||||
* <p>
|
||||
|
@ -608,6 +608,7 @@ public class MetaTableAccessor {
|
|||
* @param excludeOfflinedSplitParents don't return split parents
|
||||
* @return Return list of regioninfos and server addresses.
|
||||
*/
|
||||
// What happens here when 1M regions in hbase:meta? This won't scale?
|
||||
public static List<Pair<RegionInfo, ServerName>> getTableRegionsAndLocations(
|
||||
Connection connection, @Nullable final TableName tableName,
|
||||
final boolean excludeOfflinedSplitParents) throws IOException {
|
||||
|
@ -1988,6 +1989,9 @@ public class MetaTableAccessor {
|
|||
return put;
|
||||
}
|
||||
|
||||
/**
|
||||
* See class comment on SerialReplicationChecker
|
||||
*/
|
||||
public static void addReplicationBarrier(Put put, long openSeqNum) throws IOException {
|
||||
put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
|
||||
.setRow(put.getRow())
|
||||
|
|
|
@ -270,7 +270,7 @@ public class ProfileServlet extends HttpServlet {
|
|||
resp.getWriter().write(
|
||||
"Started [" + event.getInternalName() +
|
||||
"] profiling. This page will automatically redirect to " +
|
||||
relativeUrl + " after " + duration + " seconds.\n\ncommand:\n" +
|
||||
relativeUrl + " after " + duration + " seconds.\n\nCommand:\n" +
|
||||
Joiner.on(" ").join(cmd));
|
||||
|
||||
// to avoid auto-refresh by ProfileOutputServlet, refreshDelay can be specified
|
||||
|
@ -395,4 +395,4 @@ public class ProfileServlet extends HttpServlet {
|
|||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -407,7 +407,15 @@ class BitSetNode {
|
|||
int wordIndex = bitmapIndex >> ADDRESS_BITS_PER_WORD;
|
||||
long value = (1L << bitmapIndex);
|
||||
|
||||
modified[wordIndex] |= value;
|
||||
try {
|
||||
modified[wordIndex] |= value;
|
||||
} catch (ArrayIndexOutOfBoundsException aioobe) {
|
||||
// We've gotten an AIOOBE in here; add detail to help debug.
|
||||
ArrayIndexOutOfBoundsException aioobe2 =
|
||||
new ArrayIndexOutOfBoundsException("pid=" + procId + ", deleted=" + isDeleted);
|
||||
aioobe2.initCause(aioobe);
|
||||
throw aioobe2;
|
||||
}
|
||||
if (isDeleted) {
|
||||
deleted[wordIndex] |= value;
|
||||
} else {
|
||||
|
@ -431,4 +439,4 @@ class BitSetNode {
|
|||
private static long alignDown(final long x) {
|
||||
return x & -BITS_PER_WORD;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -190,10 +190,10 @@ public class HbckChore extends ScheduledChore {
|
|||
RegionInfo regionInfo = regionState.getRegion();
|
||||
if (master.getTableStateManager()
|
||||
.isTableState(regionInfo.getTable(), TableState.State.DISABLED)) {
|
||||
disabledTableRegions.add(regionInfo.getEncodedName());
|
||||
disabledTableRegions.add(regionInfo.getRegionNameAsString());
|
||||
}
|
||||
if (regionInfo.isSplitParent()) {
|
||||
splitParentRegions.add(regionInfo.getEncodedName());
|
||||
splitParentRegions.add(regionInfo.getRegionNameAsString());
|
||||
}
|
||||
HbckRegionInfo.MetaEntry metaEntry =
|
||||
new HbckRegionInfo.MetaEntry(regionInfo, regionState.getServerName(),
|
||||
|
@ -212,7 +212,7 @@ public class HbckChore extends ScheduledChore {
|
|||
String encodedRegionName = RegionInfo.encodeRegionName(regionName);
|
||||
HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
|
||||
if (hri == null) {
|
||||
orphanRegionsOnRS.put(encodedRegionName, serverName);
|
||||
orphanRegionsOnRS.put(RegionInfo.getRegionNameAsString(regionName), serverName);
|
||||
continue;
|
||||
}
|
||||
hri.addServer(hri.getMetaEntry(), serverName);
|
||||
|
@ -223,7 +223,6 @@ public class HbckChore extends ScheduledChore {
|
|||
numRegions, rsReports.size(), orphanRegionsOnFS.size());
|
||||
|
||||
for (Map.Entry<String, HbckRegionInfo> entry : regionInfoMap.entrySet()) {
|
||||
String encodedRegionName = entry.getKey();
|
||||
HbckRegionInfo hri = entry.getValue();
|
||||
ServerName locationInMeta = hri.getMetaEntry().getRegionServer();
|
||||
if (hri.getDeployedOn().size() == 0) {
|
||||
|
@ -231,21 +230,24 @@ public class HbckChore extends ScheduledChore {
|
|||
continue;
|
||||
}
|
||||
// skip the offline region which belong to disabled table.
|
||||
if (disabledTableRegions.contains(encodedRegionName)) {
|
||||
if (disabledTableRegions.contains(hri.getRegionNameAsString())) {
|
||||
continue;
|
||||
}
|
||||
// skip the split parent regions
|
||||
if (splitParentRegions.contains(encodedRegionName)) {
|
||||
if (splitParentRegions.contains(hri.getRegionNameAsString())) {
|
||||
continue;
|
||||
}
|
||||
// Master thought this region opened, but no regionserver reported it.
|
||||
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, new LinkedList<>()));
|
||||
inconsistentRegions.put(hri.getRegionNameAsString(),
|
||||
new Pair<>(locationInMeta, new LinkedList<>()));
|
||||
} else if (hri.getDeployedOn().size() > 1) {
|
||||
// More than one regionserver reported opened this region
|
||||
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
|
||||
inconsistentRegions.put(hri.getRegionNameAsString(),
|
||||
new Pair<>(locationInMeta, hri.getDeployedOn()));
|
||||
} else if (!hri.getDeployedOn().get(0).equals(locationInMeta)) {
|
||||
// Master thought this region opened on Server1, but regionserver reported Server2
|
||||
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
|
||||
inconsistentRegions.put(hri.getRegionNameAsString(),
|
||||
new Pair<>(locationInMeta, hri.getDeployedOn()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -339,4 +341,4 @@ public class HbckChore extends ScheduledChore {
|
|||
public long getCheckingEndTimestamp() {
|
||||
return this.checkingEndTimestamp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -50,12 +50,13 @@ import org.apache.hbase.thirdparty.com.google.common.cache.LoadingCache;
|
|||
* </p>
|
||||
* <p>
|
||||
* We record all the open sequence number for a region in a special family in meta, which is called
|
||||
* 'barrier', so there will be a sequence of open sequence number (b1, b2, b3, ...). We call [bn,
|
||||
* bn+1) a range, and it is obvious that a region will always be on the same RS within a range.
|
||||
* 'rep_barrier', so there will be a sequence of open sequence number (b1, b2, b3, ...). We call
|
||||
* [bn, bn+1) a range, and it is obvious that a region will always be on the same RS within a
|
||||
* range.
|
||||
* <p>
|
||||
* When split and merge, we will also record the parent for the generated region(s) in the special
|
||||
* family in meta. And also, we will write an extra 'open sequence number' for the parent region(s),
|
||||
* which is the max sequence id of the region plus one.
|
||||
* family in meta. And also, we will write an extra 'open sequence number' for the parent
|
||||
* region(s), which is the max sequence id of the region plus one.
|
||||
* </p>
|
||||
* </p>
|
||||
* <p>
|
||||
|
|
|
@ -277,13 +277,15 @@ public class CanaryTool implements Tool, Canary {
|
|||
|
||||
public void publishReadFailure(ServerName serverName, RegionInfo region, Exception e) {
|
||||
incReadFailureCount();
|
||||
LOG.error("Read from {} on {} failed", region.getRegionNameAsString(), serverName, e);
|
||||
LOG.error("Read from {} on serverName={} failed",
|
||||
region.getRegionNameAsString(), serverName, e);
|
||||
}
|
||||
|
||||
public void publishReadFailure(ServerName serverName, RegionInfo region,
|
||||
ColumnFamilyDescriptor column, Exception e) {
|
||||
incReadFailureCount();
|
||||
LOG.error("Read from {} on {} {} failed", region.getRegionNameAsString(), serverName,
|
||||
LOG.error("Read from {} on serverName={}, columnFamily={} failed",
|
||||
region.getRegionNameAsString(), serverName,
|
||||
column.getNameAsString(), e);
|
||||
}
|
||||
|
||||
|
|
|
@ -78,7 +78,7 @@
|
|||
|
||||
<div class="row">
|
||||
<div class="page-header">
|
||||
<p><span>This page displays two reports: the 'HBCK Chore Report' and the 'CatalogJanitor Consistency Issues' report. Only titles show if there are no problems to report. Note some conditions are transitory as regions migrate.</span></p>
|
||||
<p><span>This page displays two reports: the 'HBCK Chore Report' and the 'CatalogJanitor Consistency Issues' report. Only titles show if there are no problems to report. Note some conditions are <em>transitory</em> as regions migrate.</span></p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="row">
|
||||
|
@ -119,7 +119,7 @@
|
|||
|
||||
<table class="table table-striped">
|
||||
<tr>
|
||||
<th>Region Encoded Name</th>
|
||||
<th>Region Name</th>
|
||||
<th>Location in META</th>
|
||||
<th>Reported Online RegionServers</th>
|
||||
</tr>
|
||||
|
@ -142,10 +142,18 @@
|
|||
<h2>Orphan Regions on RegionServer</h2>
|
||||
</div>
|
||||
</div>
|
||||
<p>
|
||||
<span>
|
||||
The below are Regions we've lost account of. To be safe, run bulk load of any data found in these Region orphan directories back into the HBase cluster.
|
||||
First make sure hbase:meta is in a healthy state; run 'hbck2 fixMeta' to be sure. Once this is done, per Region below, run a bulk
|
||||
load -- '$ hbase completebulkload REGION_DIR_PATH TABLE_NAME' -- and then delete the desiccated directory content (HFiles are removed upon successful load; all that is left are empty directories
|
||||
and occasionally a seqid marking file).
|
||||
</span>
|
||||
</p>
|
||||
|
||||
<table class="table table-striped">
|
||||
<tr>
|
||||
<th>Region Encoded Name</th>
|
||||
<th>Region Name</th>
|
||||
<th>Reported Online RegionServer</th>
|
||||
</tr>
|
||||
<% for (Map.Entry<String, ServerName> entry : orphanRegionsOnRS.entrySet()) { %>
|
||||
|
|
|
@ -81,11 +81,14 @@
|
|||
<th>Errors</th>
|
||||
<th>Parameters</th>
|
||||
</tr>
|
||||
<% for (Procedure<?> proc : procedures) {
|
||||
<%
|
||||
int displayCount = 0;
|
||||
for (Procedure<?> proc : procedures) {
|
||||
// Don't show SUCCESS procedures.
|
||||
if (proc.isSuccess()) {
|
||||
continue;
|
||||
}
|
||||
displayCount++;
|
||||
%>
|
||||
<tr>
|
||||
<td><%= proc.getProcId() %></td>
|
||||
|
@ -99,9 +102,63 @@
|
|||
<td><%= escapeXml(proc.toString()) %></td>
|
||||
</tr>
|
||||
<% } %>
|
||||
<%
|
||||
if (displayCount > 0) {
|
||||
%>
|
||||
<p><%= displayCount %> procedure(s).</p>
|
||||
<%
|
||||
}
|
||||
%>
|
||||
</table>
|
||||
</div>
|
||||
<br />
|
||||
<div class="container-fluid content">
|
||||
<div class="row">
|
||||
<div class="page-header">
|
||||
<h1>Locks</h1>
|
||||
</div>
|
||||
</div>
|
||||
<%
|
||||
if (lockedResources.size() > 0) {
|
||||
%>
|
||||
<p><%= lockedResources.size() %> lock(s).</p>
|
||||
<%
|
||||
}
|
||||
%>
|
||||
<% for (LockedResource lockedResource : lockedResources) { %>
|
||||
<h2><%= lockedResource.getResourceType() %>: <%= lockedResource.getResourceName() %></h2>
|
||||
<%
|
||||
switch (lockedResource.getLockType()) {
|
||||
case EXCLUSIVE:
|
||||
%>
|
||||
<p>Lock type: EXCLUSIVE</p>
|
||||
<p>Owner procedure: <%= escapeXml(lockedResource.getExclusiveLockOwnerProcedure().toStringDetails()) %></p>
|
||||
<%
|
||||
break;
|
||||
case SHARED:
|
||||
%>
|
||||
<p>Lock type: SHARED</p>
|
||||
<p>Number of shared locks: <%= lockedResource.getSharedLockCount() %></p>
|
||||
<%
|
||||
break;
|
||||
}
|
||||
|
||||
List<Procedure<?>> waitingProcedures = lockedResource.getWaitingProcedures();
|
||||
|
||||
if (!waitingProcedures.isEmpty()) {
|
||||
%>
|
||||
<h3>Waiting procedures</h3>
|
||||
<table class="table table-striped" width="90%" >
|
||||
<% for (Procedure<?> proc : procedures) { %>
|
||||
<tr>
|
||||
<td><%= escapeXml(proc.toStringDetails()) %></td>
|
||||
</tr>
|
||||
<% } %>
|
||||
</table>
|
||||
<% } %>
|
||||
<% } %>
|
||||
</div>
|
||||
<br />
|
||||
<div class="container-fluid content">
|
||||
<div class="row">
|
||||
<div class="page-header">
|
||||
|
@ -206,44 +263,5 @@
|
|||
</div>
|
||||
</div>
|
||||
<br />
|
||||
<div class="container-fluid content">
|
||||
<div class="row">
|
||||
<div class="page-header">
|
||||
<h1>Locks</h1>
|
||||
</div>
|
||||
</div>
|
||||
<% for (LockedResource lockedResource : lockedResources) { %>
|
||||
<h2><%= lockedResource.getResourceType() %>: <%= lockedResource.getResourceName() %></h2>
|
||||
<%
|
||||
switch (lockedResource.getLockType()) {
|
||||
case EXCLUSIVE:
|
||||
%>
|
||||
<p>Lock type: EXCLUSIVE</p>
|
||||
<p>Owner procedure: <%= escapeXml(lockedResource.getExclusiveLockOwnerProcedure().toStringDetails()) %></p>
|
||||
<%
|
||||
break;
|
||||
case SHARED:
|
||||
%>
|
||||
<p>Lock type: SHARED</p>
|
||||
<p>Number of shared locks: <%= lockedResource.getSharedLockCount() %></p>
|
||||
<%
|
||||
break;
|
||||
}
|
||||
|
||||
List<Procedure<?>> waitingProcedures = lockedResource.getWaitingProcedures();
|
||||
|
||||
if (!waitingProcedures.isEmpty()) {
|
||||
%>
|
||||
<h3>Waiting procedures</h3>
|
||||
<table class="table table-striped" width="90%" >
|
||||
<% for (Procedure<?> proc : procedures) { %>
|
||||
<tr>
|
||||
<td><%= escapeXml(proc.toStringDetails()) %></td>
|
||||
</tr>
|
||||
<% } %>
|
||||
</table>
|
||||
<% } %>
|
||||
<% } %>
|
||||
</div>
|
||||
|
||||
<jsp:include page="footer.jsp" />
|
||||
|
|
|
@ -69,7 +69,7 @@ public class TestHbckChore extends TestAssignmentManagerBase {
|
|||
@Test
|
||||
public void testForMeta() {
|
||||
byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName();
|
||||
String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getEncodedName();
|
||||
String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString();
|
||||
List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
|
||||
assertEquals(NSERVERS, serverNames.size());
|
||||
|
||||
|
@ -96,7 +96,7 @@ public class TestHbckChore extends TestAssignmentManagerBase {
|
|||
public void testForUserTable() throws Exception {
|
||||
TableName tableName = TableName.valueOf("testForUserTable");
|
||||
RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
String regionName = hri.getEncodedName();
|
||||
String regionName = hri.getRegionNameAsString();
|
||||
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
|
||||
Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
|
||||
waitOnFuture(future);
|
||||
|
@ -154,7 +154,7 @@ public class TestHbckChore extends TestAssignmentManagerBase {
|
|||
public void testForDisabledTable() throws Exception {
|
||||
TableName tableName = TableName.valueOf("testForDisabledTable");
|
||||
RegionInfo hri = createRegionInfo(tableName, 1);
|
||||
String regionName = hri.getEncodedName();
|
||||
String regionName = hri.getRegionNameAsString();
|
||||
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
|
||||
Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
|
||||
waitOnFuture(future);
|
||||
|
|
Loading…
Reference in New Issue