HBASE-22741 Show catalogjanitor consistency complaints in new 'HBCK Report' page

Signed-off-by: huzheng <openinx@gmail.com>
Signed-off-by: Guanghao Zhang <zghao@apache.org>
This commit is contained in:
stack 2019-07-29 17:10:58 -07:00
parent 4587b39e63
commit 9799e0b6b2
3 changed files with 209 additions and 48 deletions

View File

@ -22,7 +22,6 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -30,6 +29,7 @@ import java.util.Properties;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HBaseConfiguration;
@ -42,8 +42,11 @@ import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory; import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.client.TableState;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
@ -239,7 +242,7 @@ public class CatalogJanitor extends ScheduledChore {
* @return Returns last published Report that comes of last successful scan * @return Returns last published Report that comes of last successful scan
* of hbase:meta. * of hbase:meta.
*/ */
Report getLastReport() { public Report getLastReport() {
return this.lastReport; return this.lastReport;
} }
@ -443,29 +446,49 @@ public class CatalogJanitor extends ScheduledChore {
} }
/** /**
* Report made by {@link ReportMakingVisitor}. * Report made by ReportMakingVisitor
*/ */
static class Report { public static class Report {
private final long now = EnvironmentEdgeManager.currentTime(); private final long now = EnvironmentEdgeManager.currentTime();
// Keep Map of found split parents. These are candidates for cleanup. // Keep Map of found split parents. These are candidates for cleanup.
// Use a comparator that has split parents come before its daughters. // Use a comparator that has split parents come before its daughters.
final Map<RegionInfo, Result> splitParents = new TreeMap<>(new SplitParentFirstComparator()); final Map<RegionInfo, Result> splitParents = new TreeMap<>(new SplitParentFirstComparator());
final Map<RegionInfo, Result> mergedRegions = new TreeMap<>(RegionInfo.COMPARATOR); final Map<RegionInfo, Result> mergedRegions = new TreeMap<>(RegionInfo.COMPARATOR);
final List<Pair<MetaRow, MetaRow>> holes = new ArrayList<>();
final List<Pair<MetaRow, MetaRow>> overlaps = new ArrayList<>();
final Map<ServerName, RegionInfo> unknownServers = new HashMap<ServerName, RegionInfo>();
final List<byte []> emptyRegionInfo = new ArrayList<>();
int count = 0; int count = 0;
private final List<Pair<MetaRow, MetaRow>> holes = new ArrayList<>();
private final List<Pair<MetaRow, MetaRow>> overlaps = new ArrayList<>();
private final List<Pair<MetaRow, ServerName>> unknownServers = new ArrayList<>();
private final List<byte []> emptyRegionInfo = new ArrayList<>();
@VisibleForTesting @VisibleForTesting
Report() {} Report() {}
/**
 * @return the time this Report was constructed, as read from
 *   EnvironmentEdgeManager.currentTime() when the instance was created. The HBCK
 *   Report page passes this value to Instant.ofEpochMilli, i.e. treats it as
 *   epoch milliseconds.
 */
public long getCreateTime() {
return this.now;
}
/**
 * @return the holes recorded in this report. Each Pair holds the two MetaRows
 *   the report associates with one hole; the HBCK Report page labels them
 *   'Row before hole' and 'Row after hole'. This is the report's own list,
 *   not a copy.
 */
public List<Pair<MetaRow, MetaRow>> getHoles() {
return this.holes;
}
/**
 * @return the overlaps recorded in this report. Each Pair holds the two MetaRows
 *   the report associates with one overlap; the HBCK Report page labels them
 *   'Row' and 'Other Row'. This is the report's own list, not a copy.
 */
public List<Pair<MetaRow, MetaRow>> getOverlaps() {
return this.overlaps;
}
/**
 * @return the unknown-server entries recorded in this report. Each Pair holds a
 *   MetaRow and the ServerName reported against it. This is the report's own
 *   list, not a copy.
 */
public List<Pair<MetaRow, ServerName>> getUnknownServers() {
return unknownServers;
}
/**
 * @return the hbase:meta row keys recorded in this report under 'empty
 *   regioninfo' (the HBCK Report page lists them under 'Empty info:regioninfo').
 *   This is the report's own list, not a copy.
 */
public List<byte[]> getEmptyRegionInfo() {
return emptyRegionInfo;
}
/** /**
* @return True if an 'empty' lastReport -- no problems found. * @return True if an 'empty' lastReport -- no problems found.
*/ */
boolean isEmpty() { public boolean isEmpty() {
return this.holes.isEmpty() && this.overlaps.isEmpty() && this.unknownServers.isEmpty() && return this.holes.isEmpty() && this.overlaps.isEmpty() && this.unknownServers.isEmpty() &&
this.emptyRegionInfo.isEmpty(); this.emptyRegionInfo.isEmpty();
} }
@ -477,28 +500,28 @@ public class CatalogJanitor extends ScheduledChore {
if (sb.length() > 0) { if (sb.length() > 0) {
sb.append(", "); sb.append(", ");
} }
sb.append("hole=" + Bytes.toString(p.getFirst().metaRow) + "/" + sb.append("hole=" + Bytes.toStringBinary(p.getFirst().metaRow) + "/" +
Bytes.toString(p.getSecond().metaRow)); Bytes.toStringBinary(p.getSecond().metaRow));
} }
for (Pair<MetaRow, MetaRow> p: this.overlaps) { for (Pair<MetaRow, MetaRow> p: this.overlaps) {
if (sb.length() > 0) { if (sb.length() > 0) {
sb.append(", "); sb.append(", ");
} }
sb.append("overlap=").append(Bytes.toString(p.getFirst().metaRow)).append("/"). sb.append("overlap=").append(Bytes.toStringBinary(p.getFirst().metaRow)).append("/").
append(Bytes.toString(p.getSecond().metaRow)); append(Bytes.toStringBinary(p.getSecond().metaRow));
} }
for (byte [] r: this.emptyRegionInfo) { for (byte [] r: this.emptyRegionInfo) {
if (sb.length() > 0) { if (sb.length() > 0) {
sb.append(", "); sb.append(", ");
} }
sb.append("empty=").append(Bytes.toString(r)); sb.append("empty=").append(Bytes.toStringBinary(r));
} }
for (Map.Entry<ServerName, RegionInfo> e: this.unknownServers.entrySet()) { for (Pair<MetaRow, ServerName> p: this.unknownServers) {
if (sb.length() > 0) { if (sb.length() > 0) {
sb.append(", "); sb.append(", ");
} }
sb.append("unknown_server=").append(e.getKey()).append("/"). sb.append("unknown_server=").append(p.getSecond()).append("/").
append(e.getValue().getRegionNameAsString()); append(Bytes.toStringBinary(p.getFirst().metaRow));
} }
return sb.toString(); return sb.toString();
} }
@ -507,7 +530,7 @@ public class CatalogJanitor extends ScheduledChore {
/** /**
* Simple datastructure to hold a MetaRow content. * Simple datastructure to hold a MetaRow content.
*/ */
static class MetaRow { public static class MetaRow {
/** /**
* A marker for use in case where there is a hole at the very * A marker for use in case where there is a hole at the very
* first row in hbase:meta. Should never happen. * first row in hbase:meta. Should never happen.
@ -518,17 +541,25 @@ public class CatalogJanitor extends ScheduledChore {
/** /**
* Row from hbase:meta table. * Row from hbase:meta table.
*/ */
final byte [] metaRow; private final byte [] metaRow;
/** /**
* The decoded RegionInfo gotten from hbase:meta. * The decoded RegionInfo gotten from hbase:meta.
*/ */
final RegionInfo regionInfo; private final RegionInfo regionInfo;
MetaRow(byte [] metaRow, RegionInfo regionInfo) { MetaRow(byte [] metaRow, RegionInfo regionInfo) {
this.metaRow = metaRow; this.metaRow = metaRow;
this.regionInfo = regionInfo; this.regionInfo = regionInfo;
} }
/**
 * @return the decoded RegionInfo gotten from hbase:meta for this row, exactly as
 *   it was passed to the constructor.
 */
public RegionInfo getRegionInfo() {
return regionInfo;
}
/**
 * @return the row from the hbase:meta table. The array handed to the constructor
 *   is returned as-is (no copy is made).
 */
public byte[] getMetaRow() {
return metaRow;
}
} }
/** /**
@ -609,6 +640,7 @@ public class CatalogJanitor extends ScheduledChore {
this.report.emptyRegionInfo.add(metaTableRow.getRow()); this.report.emptyRegionInfo.add(metaTableRow.getRow());
return ri; return ri;
} }
MetaRow mrri = new MetaRow(metaTableRow.getRow(), ri); MetaRow mrri = new MetaRow(metaTableRow.getRow(), ri);
// If table is disabled, skip integrity check. // If table is disabled, skip integrity check.
if (!isTableDisabled(ri)) { if (!isTableDisabled(ri)) {
@ -700,20 +732,22 @@ public class CatalogJanitor extends ScheduledChore {
public static void main(String [] args) throws IOException { public static void main(String [] args) throws IOException {
checkLog4jProperties(); checkLog4jProperties();
ReportMakingVisitor visitor = new ReportMakingVisitor(null); ReportMakingVisitor visitor = new ReportMakingVisitor(null);
try (Connection connection = ConnectionFactory.createConnection(HBaseConfiguration.create())) { Configuration configuration = HBaseConfiguration.create();
configuration.setBoolean("hbase.defaults.for.version.skip", true);
try (Connection connection = ConnectionFactory.createConnection(configuration)) {
/* Used to generate an overlap. /* Used to generate an overlap.
Get g = new Get(Bytes.toBytes("t2,40,1563939166317.5a8be963741d27e9649e5c67a34259d9.")); */
Get g = new Get(Bytes.toBytes("t2,40,1564119846424.1db8c57d64e0733e0f027aaeae7a0bf0."));
g.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER); g.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
try (Table t = connection.getTable(TableName.META_TABLE_NAME)) { try (Table t = connection.getTable(TableName.META_TABLE_NAME)) {
Result r = t.get(g); Result r = t.get(g);
byte [] row = g.getRow(); byte [] row = g.getRow();
row[row.length - 3] <<= ((byte)row[row.length -3]); row[row.length - 2] <<= ((byte)row[row.length - 2]);
Put p = new Put(g.getRow()); Put p = new Put(g.getRow());
p.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER, p.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER,
r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER)); r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER));
t.put(p); t.put(p);
} }
*/
MetaTableAccessor.scanMetaForTableRegions(connection, visitor, null); MetaTableAccessor.scanMetaForTableRegions(connection, visitor, null);
Report report = visitor.getReport(); Report report = visitor.getReport();
LOG.info(report != null? report.toString(): "empty"); LOG.info(report != null? report.toString(): "empty");

View File

@ -18,15 +18,23 @@
*/ */
--%> --%>
<%@ page contentType="text/html;charset=UTF-8" <%@ page contentType="text/html;charset=UTF-8"
import="java.time.Instant"
import="java.time.ZoneId"
import="java.util.Date" import="java.util.Date"
import="java.util.List" import="java.util.List"
import="java.util.Map" import="java.util.Map"
import="java.util.stream.Collectors" import="java.util.stream.Collectors"
import="java.time.ZonedDateTime"
import="java.time.format.DateTimeFormatter"
%> %>
<%@ page import="org.apache.hadoop.hbase.master.HbckChecker" %> <%@ page import="org.apache.hadoop.hbase.master.HbckChecker" %>
<%@ page import="org.apache.hadoop.hbase.master.HMaster" %> <%@ page import="org.apache.hadoop.hbase.master.HMaster" %>
<%@ page import="org.apache.hadoop.hbase.ServerName" %> <%@ page import="org.apache.hadoop.hbase.ServerName" %>
<%@ page import="org.apache.hadoop.hbase.util.Bytes" %>
<%@ page import="org.apache.hadoop.hbase.util.Pair" %> <%@ page import="org.apache.hadoop.hbase.util.Pair" %>
<%@ page import="org.apache.hadoop.hbase.master.CatalogJanitor" %>
<%@ page import="org.apache.hadoop.hbase.master.CatalogJanitor.Report" %>
<%@ page import="org.apache.hadoop.hbase.master.CatalogJanitor.MetaRow" %>
<% <%
HMaster master = (HMaster) getServletContext().getAttribute(HMaster.MASTER); HMaster master = (HMaster) getServletContext().getAttribute(HMaster.MASTER);
pageContext.setAttribute("pageTitle", "HBase Master HBCK Report: " + master.getServerName()); pageContext.setAttribute("pageTitle", "HBase Master HBCK Report: " + master.getServerName());
@ -43,6 +51,14 @@
startTimestamp = hbckChecker.getCheckingStartTimestamp(); startTimestamp = hbckChecker.getCheckingStartTimestamp();
endTimestamp = hbckChecker.getCheckingEndTimestamp(); endTimestamp = hbckChecker.getCheckingEndTimestamp();
} }
// Format the chore's start/end timestamps as ISO-8601 offset date-times in the JVM's
// default time zone. A timestamp of zero is rendered as "-1" instead of a date.
ZonedDateTime zdt = ZonedDateTime.ofInstant(Instant.ofEpochMilli(startTimestamp),
    ZoneId.systemDefault());
String iso8601start = startTimestamp == 0? "-1": zdt.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME);
zdt = ZonedDateTime.ofInstant(Instant.ofEpochMilli(endTimestamp),
    ZoneId.systemDefault());
// Guard on endTimestamp (not startTimestamp): a zero end time must not be printed as
// the 1970 epoch just because the start time happens to be set.
String iso8601end = endTimestamp == 0? "-1": zdt.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME);
// The CatalogJanitor's last published report; null when the master has no janitor.
CatalogJanitor cj = master.getCatalogJanitor();
CatalogJanitor.Report report = cj == null? null: cj.getLastReport();
%> %>
<jsp:include page="header.jsp"> <jsp:include page="header.jsp">
<jsp:param name="pageTitle" value="${pageTitle}"/> <jsp:param name="pageTitle" value="${pageTitle}"/>
@ -61,29 +77,32 @@
<div class="row"> <div class="row">
<div class="page-header"> <div class="page-header">
<h1>HBCK Report</h1> <h1>HBCK Chore Report</h1>
<p> <p>
<span>Checking started at <%= new Date(startTimestamp) %> and generated report at <%= new Date(endTimestamp) %></span> <span>Checking started at <%= iso8601start %> and generated report at <%= iso8601end %>. Execute 'hbck_chore_run' in hbase shell to generate a new sub-report.</span>
</p> </p>
</div> </div>
</div> </div>
<div class="row"> <div class="row">
<div class="page-header"> <div class="page-header">
<h2>Inconsistent Regions</h2> <h2>Inconsistent Regions</h2>
<p>
<span>
There are three case: 1. Master thought this region opened, but no regionserver reported it.
2. Master thought this region opened on Server1, but regionserver reported Server2.
3. More than one regionservers reported opened this region.
Notice: the reported online regionservers may be not right when there are regions in transition.
Please check them in regionserver's web UI.
</span>
</p>
</div> </div>
</div> </div>
<% if (inconsistentRegions != null && inconsistentRegions.size() > 0) { %> <% if (inconsistentRegions != null && inconsistentRegions.size() > 0) { %>
<p>
<span>
There are three cases: 1. Master thought this region was opened, but no regionserver reported it (Fix: use the
assigns command); 2. Master thought this region was opened on Server1, but a regionserver reported it on Server2 (Fix:
check whether the server still exists. If not, schedule an SCP for it. If it does, restart Server2 and Server1);
3. More than one regionserver reported this region as opened (Fix: restart the RegionServers).
Notice: the reported online regionservers may not be accurate while there are regions in transition.
Please check them in the regionserver's web UI.
</span>
</p>
<table class="table table-striped"> <table class="table table-striped">
<tr> <tr>
<th>Region</th> <th>Region</th>
@ -147,6 +166,114 @@
</table> </table>
<% } %> <% } %>
<div class="row inner_header">
<div class="page-header">
<h1>CatalogJanitor <em>hbase:meta</em> Consistency Issues</h1>
</div>
</div>
<% if (report != null && !report.isEmpty()) {
zdt = ZonedDateTime.ofInstant(Instant.ofEpochMilli(report.getCreateTime()),
ZoneId.systemDefault());
String iso8601reportTime = zdt.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME);
zdt = ZonedDateTime.ofInstant(Instant.ofEpochMilli(System.currentTimeMillis()),
ZoneId.systemDefault());
String iso8601Now = zdt.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME);
%>
<p>Report created: <%= iso8601reportTime %> (now=<%= iso8601Now %>). Run <i>catalogjanitor_run</i> in hbase shell to generate a new sub-report.</p>
<%-- Holes from the CatalogJanitor report: one table row per hole, showing the two
     hbase:meta rows (and their RegionInfo) recorded for it. Rendered only when the
     report lists at least one hole. --%>
<% if (!report.getHoles().isEmpty()) { %>
  <div class="row inner_header">
    <div class="page-header">
      <h2>Holes</h2>
    </div>
  </div>
  <table class="table table-striped">
    <tr>
      <th>Row before hole</th>
      <th>RegionInfo</th>
      <th>Row after hole</th>
      <th>RegionInfo</th>
    </tr>
    <%-- NOTE(review): cell values are written without HTML escaping; confirm that
         row keys and RegionInfo text can never contain markup. --%>
    <% for (Pair<MetaRow, MetaRow> p : report.getHoles()) { %>
    <tr>
      <td><%= Bytes.toStringBinary(p.getFirst().getMetaRow()) %></td>
      <td><%= p.getFirst().getRegionInfo() %></td>
      <td><%= Bytes.toStringBinary(p.getSecond().getMetaRow()) %></td>
      <td><%= p.getSecond().getRegionInfo() %></td>
    </tr>
    <% } %>
  </table>
  <%-- The count goes after the table: a paragraph is not a permitted child of a
       table element, and browsers would hoist it in front of the table. --%>
  <p><%= report.getHoles().size() %> hole(s).</p>
<% } %>
<%-- Overlaps from the CatalogJanitor report: one table row per overlap, showing the
     two hbase:meta rows (and their RegionInfo) recorded for it. Rendered only when
     the report lists at least one overlap. --%>
<% if (!report.getOverlaps().isEmpty()) { %>
  <div class="row inner_header">
    <div class="page-header">
      <h2>Overlaps</h2>
    </div>
  </div>
  <table class="table table-striped">
    <tr>
      <th>Row</th>
      <th>RegionInfo</th>
      <th>Other Row</th>
      <th>Other RegionInfo</th>
    </tr>
    <%-- NOTE(review): cell values are written without HTML escaping; confirm that
         row keys and RegionInfo text can never contain markup. --%>
    <% for (Pair<MetaRow, MetaRow> p : report.getOverlaps()) { %>
    <tr>
      <td><%= Bytes.toStringBinary(p.getFirst().getMetaRow()) %></td>
      <td><%= p.getFirst().getRegionInfo() %></td>
      <td><%= Bytes.toStringBinary(p.getSecond().getMetaRow()) %></td>
      <td><%= p.getSecond().getRegionInfo() %></td>
    </tr>
    <% } %>
  </table>
  <%-- The count goes after the table: a paragraph is not a permitted child of a
       table element, and browsers would hoist it in front of the table. --%>
  <p><%= report.getOverlaps().size() %> overlap(s).</p>
<% } %>
<%-- Unknown servers from the CatalogJanitor report: one table row per entry, showing
     the hbase:meta row, the ServerName recorded against it, and the row's
     RegionInfo. Rendered only when the report lists at least one entry. --%>
<% if (!report.getUnknownServers().isEmpty()) { %>
  <div class="row inner_header">
    <div class="page-header">
      <h2>Unknown Servers</h2>
    </div>
  </div>
  <table class="table table-striped">
    <tr>
      <th>Row</th>
      <th>ServerName</th>
      <th>RegionInfo</th>
    </tr>
    <%-- NOTE(review): cell values are written without HTML escaping; confirm that
         row keys, server names and RegionInfo text can never contain markup. --%>
    <% for (Pair<MetaRow, ServerName> p: report.getUnknownServers()) { %>
    <tr>
      <td><%= Bytes.toStringBinary(p.getFirst().getMetaRow()) %></td>
      <td><%= p.getSecond() %></td>
      <td><%= p.getFirst().getRegionInfo() %></td>
    </tr>
    <% } %>
  </table>
  <%-- The count goes after the table: a paragraph is not a permitted child of a
       table element, and browsers would hoist it in front of the table. --%>
  <p><%= report.getUnknownServers().size() %> unknown server(s).</p>
<% } %>
<%-- Rows the CatalogJanitor report lists under 'empty regioninfo': one table row per
     hbase:meta row key. Rendered only when the report lists at least one such row. --%>
<% if (!report.getEmptyRegionInfo().isEmpty()) { %>
  <div class="row inner_header">
    <div class="page-header">
      <h2>Empty <em>info:regioninfo</em></h2>
    </div>
  </div>
  <table class="table table-striped">
    <tr>
      <th>Row</th>
    </tr>
    <%-- NOTE(review): row keys are written without HTML escaping; confirm they can
         never contain markup. --%>
    <% for (byte [] row: report.getEmptyRegionInfo()) { %>
    <tr>
      <td><%= Bytes.toStringBinary(row) %></td>
    </tr>
    <% } %>
  </table>
  <%-- The count goes after the table: a paragraph is not a permitted child of a
       table element, and browsers would hoist it in front of the table. --%>
  <p><%= report.getEmptyRegionInfo().size() %> emptyRegionInfo(s).</p>
<% } %>
<% } %>
<% } %> <% } %>
</div> </div>

View File

@ -92,11 +92,11 @@ public class TestCatalogJanitorCluster {
gc = janitor.scan(); gc = janitor.scan();
report = janitor.getLastReport(); report = janitor.getLastReport();
assertFalse(report.isEmpty()); assertFalse(report.isEmpty());
assertEquals(1, report.holes.size()); assertEquals(1, report.getHoles().size());
assertTrue(report.holes.get(0).getFirst().regionInfo.getTable().equals(T1)); assertTrue(report.getHoles().get(0).getFirst().getRegionInfo().getTable().equals(T1));
assertTrue(report.holes.get(0).getFirst().regionInfo.isLast()); assertTrue(report.getHoles().get(0).getFirst().getRegionInfo().isLast());
assertTrue(report.holes.get(0).getSecond().regionInfo.getTable().equals(T2)); assertTrue(report.getHoles().get(0).getSecond().getRegionInfo().getTable().equals(T2));
assertEquals(0, report.overlaps.size()); assertEquals(0, report.getOverlaps().size());
// Next, add overlaps to first row in t3 // Next, add overlaps to first row in t3
List<RegionInfo> t3Ris = MetaTableAccessor.getTableRegions(TEST_UTIL.getConnection(), T3); List<RegionInfo> t3Ris = MetaTableAccessor.getTableRegions(TEST_UTIL.getConnection(), T3);
RegionInfo ri = t3Ris.get(0); RegionInfo ri = t3Ris.get(0);
@ -113,12 +113,12 @@ public class TestCatalogJanitorCluster {
report = janitor.getLastReport(); report = janitor.getLastReport();
assertFalse(report.isEmpty()); assertFalse(report.isEmpty());
// We added two overlaps so total three. // We added two overlaps so total three.
assertEquals(3, report.overlaps.size()); assertEquals(3, report.getOverlaps().size());
// Assert hole is still there. // Assert hole is still there.
assertEquals(1, report.holes.size()); assertEquals(1, report.getHoles().size());
// Assert other attributes are empty still. // Assert other attributes are empty still.
assertTrue(report.emptyRegionInfo.isEmpty()); assertTrue(report.getEmptyRegionInfo().isEmpty());
assertTrue(report.unknownServers.isEmpty()); assertTrue(report.getUnknownServers().isEmpty());
// Now make bad server in t1. // Now make bad server in t1.
List<RegionInfo> t1Ris = MetaTableAccessor.getTableRegions(TEST_UTIL.getConnection(), T1); List<RegionInfo> t1Ris = MetaTableAccessor.getTableRegions(TEST_UTIL.getConnection(), T1);
RegionInfo t1Ri1 = t1Ris.get(1); RegionInfo t1Ri1 = t1Ris.get(1);
@ -129,7 +129,7 @@ public class TestCatalogJanitorCluster {
gc = janitor.scan(); gc = janitor.scan();
report = janitor.getLastReport(); report = janitor.getLastReport();
assertFalse(report.isEmpty()); assertFalse(report.isEmpty());
assertEquals(1, report.unknownServers.size()); assertEquals(1, report.getUnknownServers().size());
// Finally, make an empty regioninfo in t1. // Finally, make an empty regioninfo in t1.
RegionInfo t1Ri2 = t1Ris.get(2); RegionInfo t1Ri2 = t1Ris.get(2);
Put pEmptyRI = new Put(t1Ri2.getRegionName()); Put pEmptyRI = new Put(t1Ri2.getRegionName());
@ -138,7 +138,7 @@ public class TestCatalogJanitorCluster {
MetaTableAccessor.putsToMetaTable(TEST_UTIL.getConnection(), Arrays.asList(pEmptyRI)); MetaTableAccessor.putsToMetaTable(TEST_UTIL.getConnection(), Arrays.asList(pEmptyRI));
gc = janitor.scan(); gc = janitor.scan();
report = janitor.getLastReport(); report = janitor.getLastReport();
assertEquals(1, report.emptyRegionInfo.size()); assertEquals(1, report.getEmptyRegionInfo().size());
} }
/** /**