HBASE-22709 Add a chore thread in master to do hbck checking (#404)
Signed-off-by: stack <stack@apache.org>
This commit is contained in:
parent
f8b592e9f8
commit
e54c4018e7
|
@ -42,84 +42,8 @@ int limit = 100;
|
||||||
<%java>
|
<%java>
|
||||||
SortedSet<RegionState> rit = assignmentManager.getRegionStates()
|
SortedSet<RegionState> rit = assignmentManager.getRegionStates()
|
||||||
.getRegionsInTransitionOrderedByTimestamp();
|
.getRegionsInTransitionOrderedByTimestamp();
|
||||||
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = assignmentManager
|
|
||||||
.getProblematicRegions();
|
|
||||||
</%java>
|
</%java>
|
||||||
|
|
||||||
<%if !problematicRegions.isEmpty() %>
|
|
||||||
<%java>
|
|
||||||
int totalSize = problematicRegions.size();
|
|
||||||
int sizePerPage = Math.min(10, totalSize);
|
|
||||||
int numOfPages = (int) Math.ceil(totalSize * 1.0 / sizePerPage);
|
|
||||||
</%java>
|
|
||||||
<section>
|
|
||||||
<h2><a name="problem-regions">Problematic Regions</a></h2>
|
|
||||||
<p>
|
|
||||||
<span>
|
|
||||||
<% problematicRegions.size() %> problematic region(s). There are three case: 1. Master
|
|
||||||
thought this region opened, but no regionserver reported it. 2. Master thought this
|
|
||||||
region opened on Server1, but regionserver reported Server2. 3. More than one
|
|
||||||
regionservers reported opened this region. Notice: the reported online regionservers
|
|
||||||
may be not right when there are regions in transition. Please check them in
|
|
||||||
regionserver's web UI.
|
|
||||||
</span>
|
|
||||||
</p>
|
|
||||||
<div class="tabbable">
|
|
||||||
<div class="tab-content">
|
|
||||||
<%java int recordItr = 0; %>
|
|
||||||
<%for Map.Entry<String, Pair<ServerName, Set<ServerName>>> entry : problematicRegions.entrySet() %>
|
|
||||||
<%if (recordItr % sizePerPage) == 0 %>
|
|
||||||
<%if recordItr == 0 %>
|
|
||||||
<div class="tab-pane active" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
|
|
||||||
<%else>
|
|
||||||
<div class="tab-pane" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
|
|
||||||
</%if>
|
|
||||||
<table class="table table-striped" style="margin-bottom:0px;">
|
|
||||||
<tr>
|
|
||||||
<th>Region</th>
|
|
||||||
<th>Location in META</th>
|
|
||||||
<th>Reported Online Region Servers</th>
|
|
||||||
</tr>
|
|
||||||
</%if>
|
|
||||||
|
|
||||||
<tr>
|
|
||||||
<td><% entry.getKey() %></td>
|
|
||||||
<td><% entry.getValue().getFirst() %></td>
|
|
||||||
<td><% entry.getValue().getSecond().stream().map(ServerName::getServerName)
|
|
||||||
.collect(Collectors.joining(", ")) %></td>
|
|
||||||
</tr>
|
|
||||||
<%java recordItr++; %>
|
|
||||||
<%if (recordItr % sizePerPage) == 0 %>
|
|
||||||
</table>
|
|
||||||
</div>
|
|
||||||
</%if>
|
|
||||||
</%for>
|
|
||||||
|
|
||||||
<%if (recordItr % sizePerPage) != 0 %>
|
|
||||||
<%for ; (recordItr % sizePerPage) != 0 ; recordItr++ %>
|
|
||||||
<tr><td colspan="3" style="height:61px"></td></tr>
|
|
||||||
</%for>
|
|
||||||
</table>
|
|
||||||
</div>
|
|
||||||
</%if>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
<nav>
|
|
||||||
<ul class="nav nav-pills pagination">
|
|
||||||
<%for int i = 1 ; i <= numOfPages; i++ %>
|
|
||||||
<%if i == 1 %>
|
|
||||||
<li class="active">
|
|
||||||
<%else>
|
|
||||||
<li>
|
|
||||||
</%if>
|
|
||||||
<a href="#tab_prs<% i %>"><% i %></a></li>
|
|
||||||
</%for>
|
|
||||||
</ul>
|
|
||||||
</nav>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
</%if>
|
|
||||||
|
|
||||||
<%if !rit.isEmpty() %>
|
<%if !rit.isEmpty() %>
|
||||||
<%java>
|
<%java>
|
||||||
long currentTime = System.currentTimeMillis();
|
long currentTime = System.currentTimeMillis();
|
||||||
|
|
|
@ -149,7 +149,8 @@ AssignmentManager assignmentManager = master.getAssignmentManager();
|
||||||
<li class="active"><a href="/master-status">Home</a></li>
|
<li class="active"><a href="/master-status">Home</a></li>
|
||||||
<li><a href="/tablesDetailed.jsp">Table Details</a></li>
|
<li><a href="/tablesDetailed.jsp">Table Details</a></li>
|
||||||
<%if master.isActiveMaster() %>
|
<%if master.isActiveMaster() %>
|
||||||
<li><a href="/procedures.jsp">Procedures & Locks</a></li>
|
<li><a href="/procedures.jsp">Procedures & Locks</a></li>
|
||||||
|
<li><a href="/hbck.jsp">HBCK Report</a></li>
|
||||||
</%if>
|
</%if>
|
||||||
<li><a href="/processMaster.jsp">Process Metrics</a></li>
|
<li><a href="/processMaster.jsp">Process Metrics</a></li>
|
||||||
<li><a href="/logs/">Local Logs</a></li>
|
<li><a href="/logs/">Local Logs</a></li>
|
||||||
|
|
|
@ -371,6 +371,7 @@ public class HMaster extends HRegionServer implements MasterServices {
|
||||||
private ClusterStatusChore clusterStatusChore;
|
private ClusterStatusChore clusterStatusChore;
|
||||||
private ClusterStatusPublisher clusterStatusPublisherChore = null;
|
private ClusterStatusPublisher clusterStatusPublisherChore = null;
|
||||||
|
|
||||||
|
private HbckChecker hbckChecker;
|
||||||
CatalogJanitor catalogJanitorChore;
|
CatalogJanitor catalogJanitorChore;
|
||||||
private LogCleaner logCleaner;
|
private LogCleaner logCleaner;
|
||||||
private HFileCleaner hfileCleaner;
|
private HFileCleaner hfileCleaner;
|
||||||
|
@ -1030,6 +1031,8 @@ public class HMaster extends HRegionServer implements MasterServices {
|
||||||
getChoreService().scheduleChore(normalizerChore);
|
getChoreService().scheduleChore(normalizerChore);
|
||||||
this.catalogJanitorChore = new CatalogJanitor(this);
|
this.catalogJanitorChore = new CatalogJanitor(this);
|
||||||
getChoreService().scheduleChore(catalogJanitorChore);
|
getChoreService().scheduleChore(catalogJanitorChore);
|
||||||
|
this.hbckChecker = new HbckChecker(this);
|
||||||
|
getChoreService().scheduleChore(hbckChecker);
|
||||||
|
|
||||||
// NAMESPACE READ!!!!
|
// NAMESPACE READ!!!!
|
||||||
// Here we expect hbase:namespace to be online. See inside initClusterSchemaService.
|
// Here we expect hbase:namespace to be online. See inside initClusterSchemaService.
|
||||||
|
@ -1495,6 +1498,7 @@ public class HMaster extends HRegionServer implements MasterServices {
|
||||||
choreService.cancelChore(this.logCleaner);
|
choreService.cancelChore(this.logCleaner);
|
||||||
choreService.cancelChore(this.hfileCleaner);
|
choreService.cancelChore(this.hfileCleaner);
|
||||||
choreService.cancelChore(this.replicationBarrierCleaner);
|
choreService.cancelChore(this.replicationBarrierCleaner);
|
||||||
|
choreService.cancelChore(this.hbckChecker);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3861,4 +3865,8 @@ public class HMaster extends HRegionServer implements MasterServices {
|
||||||
}
|
}
|
||||||
return super.getWalGroupsReplicationStatus();
|
return super.getWalGroupsReplicationStatus();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public HbckChecker getHbckChecker() {
|
||||||
|
return this.hbckChecker;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,282 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.hbase.master;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.hbase.ScheduledChore;
|
||||||
|
import org.apache.hadoop.hbase.ServerName;
|
||||||
|
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||||
|
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||||
|
import org.apache.hadoop.hbase.util.FSUtils;
|
||||||
|
import org.apache.hadoop.hbase.util.HbckRegionInfo;
|
||||||
|
import org.apache.hadoop.hbase.util.Pair;
|
||||||
|
import org.apache.yetus.audience.InterfaceAudience;
|
||||||
|
import org.apache.yetus.audience.InterfaceStability;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used to do the hbck checking job at master side.
|
||||||
|
*/
|
||||||
|
@InterfaceAudience.Private
|
||||||
|
@InterfaceStability.Evolving
|
||||||
|
public class HbckChecker extends ScheduledChore {
|
||||||
|
private static final Logger LOG = LoggerFactory.getLogger(HbckChecker.class.getName());
|
||||||
|
|
||||||
|
private static final String HBCK_CHECKER_INTERVAL = "hbase.master.hbck.checker.interval";
|
||||||
|
private static final int DEFAULT_HBCK_CHECKER_INTERVAL = 60 * 60 * 1000;
|
||||||
|
|
||||||
|
private final MasterServices master;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This map contains the state of all hbck items. It maps from encoded region
|
||||||
|
* name to HbckRegionInfo structure. The information contained in HbckRegionInfo is used
|
||||||
|
* to detect and correct consistency (hdfs/meta/deployment) problems.
|
||||||
|
*/
|
||||||
|
private final Map<String, HbckRegionInfo> regionInfoMap = new HashMap<>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The regions only opened on RegionServers, but no region info in meta.
|
||||||
|
*/
|
||||||
|
private final Map<String, ServerName> orphanRegionsOnRS = new HashMap<>();
|
||||||
|
/**
|
||||||
|
* The regions have directory on FileSystem, but no region info in meta.
|
||||||
|
*/
|
||||||
|
private final List<String> orphanRegionsOnFS = new LinkedList<>();
|
||||||
|
/**
|
||||||
|
* The inconsistent regions. There are three case:
|
||||||
|
* case 1. Master thought this region opened, but no regionserver reported it.
|
||||||
|
* case 2. Master thought this region opened on Server1, but regionserver reported Server2
|
||||||
|
* case 3. More than one regionservers reported opened this region
|
||||||
|
*/
|
||||||
|
private final Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegions =
|
||||||
|
new HashMap<>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The "snapshot" is used to save the last round's HBCK checking report.
|
||||||
|
*/
|
||||||
|
private final Map<String, ServerName> orphanRegionsOnRSSnapshot = new HashMap<>();
|
||||||
|
private final List<String> orphanRegionsOnFSSnapshot = new LinkedList<>();
|
||||||
|
private final Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegionsSnapshot =
|
||||||
|
new HashMap<>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The "snapshot" may be changed after checking. And this checking report "snapshot" may be
|
||||||
|
* accessed by web ui. Use this rwLock to synchronize.
|
||||||
|
*/
|
||||||
|
ReentrantReadWriteLock rwLock = new ReentrantReadWriteLock();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* When running, the "snapshot" may be changed when this round's checking finish.
|
||||||
|
*/
|
||||||
|
private volatile boolean running = false;
|
||||||
|
private volatile long checkingStartTimestamp = 0;
|
||||||
|
private volatile long checkingEndTimestamp = 0;
|
||||||
|
|
||||||
|
public HbckChecker(MasterServices master) {
|
||||||
|
super("HbckChecker-", master,
|
||||||
|
master.getConfiguration().getInt(HBCK_CHECKER_INTERVAL, DEFAULT_HBCK_CHECKER_INTERVAL));
|
||||||
|
this.master = master;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void chore() {
|
||||||
|
running = true;
|
||||||
|
regionInfoMap.clear();
|
||||||
|
orphanRegionsOnRS.clear();
|
||||||
|
orphanRegionsOnFS.clear();
|
||||||
|
inconsistentRegions.clear();
|
||||||
|
checkingStartTimestamp = EnvironmentEdgeManager.currentTime();
|
||||||
|
loadRegionsFromInMemoryState();
|
||||||
|
loadRegionsFromRSReport();
|
||||||
|
try {
|
||||||
|
loadRegionsFromFS();
|
||||||
|
} catch (IOException e) {
|
||||||
|
LOG.warn("Failed to load the regions from filesystem", e);
|
||||||
|
}
|
||||||
|
saveCheckResultToSnapshot();
|
||||||
|
running = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void saveCheckResultToSnapshot() {
|
||||||
|
// Need synchronized here, as this "snapshot" may be access by web ui.
|
||||||
|
rwLock.writeLock().lock();
|
||||||
|
try {
|
||||||
|
orphanRegionsOnRSSnapshot.clear();
|
||||||
|
orphanRegionsOnRS.entrySet()
|
||||||
|
.forEach(e -> orphanRegionsOnRSSnapshot.put(e.getKey(), e.getValue()));
|
||||||
|
orphanRegionsOnFSSnapshot.clear();
|
||||||
|
orphanRegionsOnFSSnapshot.addAll(orphanRegionsOnFS);
|
||||||
|
inconsistentRegionsSnapshot.clear();
|
||||||
|
inconsistentRegions.entrySet()
|
||||||
|
.forEach(e -> inconsistentRegionsSnapshot.put(e.getKey(), e.getValue()));
|
||||||
|
checkingEndTimestamp = EnvironmentEdgeManager.currentTime();
|
||||||
|
} finally {
|
||||||
|
rwLock.writeLock().unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void loadRegionsFromInMemoryState() {
|
||||||
|
List<RegionState> regionStates =
|
||||||
|
master.getAssignmentManager().getRegionStates().getRegionStates();
|
||||||
|
for (RegionState regionState : regionStates) {
|
||||||
|
RegionInfo regionInfo = regionState.getRegion();
|
||||||
|
HbckRegionInfo.MetaEntry metaEntry =
|
||||||
|
new HbckRegionInfo.MetaEntry(regionInfo, regionState.getServerName(),
|
||||||
|
regionState.getStamp());
|
||||||
|
regionInfoMap.put(regionInfo.getEncodedName(), new HbckRegionInfo(metaEntry));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void loadRegionsFromRSReport() {
|
||||||
|
Map<ServerName, Set<byte[]>> rsReports = master.getAssignmentManager().getRSReports();
|
||||||
|
for (Map.Entry<ServerName, Set<byte[]>> entry : rsReports.entrySet()) {
|
||||||
|
ServerName serverName = entry.getKey();
|
||||||
|
for (byte[] regionName : entry.getValue()) {
|
||||||
|
String encodedRegionName = RegionInfo.encodeRegionName(regionName);
|
||||||
|
HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
|
||||||
|
if (hri == null) {
|
||||||
|
orphanRegionsOnRS.put(encodedRegionName, serverName);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
hri.addServer(hri.getMetaEntry(), serverName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (Map.Entry<String, HbckRegionInfo> entry : regionInfoMap.entrySet()) {
|
||||||
|
String encodedRegionName = entry.getKey();
|
||||||
|
HbckRegionInfo hri = entry.getValue();
|
||||||
|
ServerName locationInMeta = hri.getMetaEntry().getRegionServer();
|
||||||
|
if (hri.getDeployedOn().size() == 0) {
|
||||||
|
// Master thought this region opened, but no regionserver reported it.
|
||||||
|
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, new LinkedList<>()));
|
||||||
|
} else if (hri.getDeployedOn().size() > 1) {
|
||||||
|
// More than one regionserver reported opened this region
|
||||||
|
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
|
||||||
|
} else if (!hri.getDeployedOn().get(0).equals(locationInMeta)) {
|
||||||
|
// Master thought this region opened on Server1, but regionserver reported Server2
|
||||||
|
inconsistentRegions.put(encodedRegionName, new Pair<>(locationInMeta, hri.getDeployedOn()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void loadRegionsFromFS() throws IOException {
|
||||||
|
Path rootDir = master.getMasterFileSystem().getRootDir();
|
||||||
|
FileSystem fs = master.getMasterFileSystem().getFileSystem();
|
||||||
|
|
||||||
|
// list all tables from HDFS
|
||||||
|
List<FileStatus> tableDirs = Lists.newArrayList();
|
||||||
|
List<Path> paths = FSUtils.getTableDirs(fs, rootDir);
|
||||||
|
for (Path path : paths) {
|
||||||
|
tableDirs.add(fs.getFileStatus(path));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (FileStatus tableDir : tableDirs) {
|
||||||
|
FileStatus[] regionDirs = fs.listStatus(tableDir.getPath());
|
||||||
|
for (FileStatus regionDir : regionDirs) {
|
||||||
|
String encodedRegionName = regionDir.getPath().getName();
|
||||||
|
HbckRegionInfo hri = regionInfoMap.get(encodedRegionName);
|
||||||
|
if (hri == null) {
|
||||||
|
orphanRegionsOnFS.add(encodedRegionName);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
HbckRegionInfo.HdfsEntry hdfsEntry =
|
||||||
|
new HbckRegionInfo.HdfsEntry(regionDir.getPath(), regionDir.getModificationTime());
|
||||||
|
hri.setHdfsEntry(hdfsEntry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* When running, the HBCK report may be changed later.
|
||||||
|
*/
|
||||||
|
public boolean isRunning() {
|
||||||
|
return running;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the regions only opened on RegionServers, but no region info in meta.
|
||||||
|
*/
|
||||||
|
public Map<String, ServerName> getOrphanRegionsOnRS() {
|
||||||
|
// Need synchronized here, as this "snapshot" may be changed after checking.
|
||||||
|
rwLock.readLock().lock();
|
||||||
|
try {
|
||||||
|
return this.orphanRegionsOnRSSnapshot;
|
||||||
|
} finally {
|
||||||
|
rwLock.readLock().unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the regions have directory on FileSystem, but no region info in meta.
|
||||||
|
*/
|
||||||
|
public List<String> getOrphanRegionsOnFS() {
|
||||||
|
// Need synchronized here, as this "snapshot" may be changed after checking.
|
||||||
|
rwLock.readLock().lock();
|
||||||
|
try {
|
||||||
|
return this.orphanRegionsOnFSSnapshot;
|
||||||
|
} finally {
|
||||||
|
rwLock.readLock().unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Found the inconsistent regions. There are three case:
|
||||||
|
* case 1. Master thought this region opened, but no regionserver reported it.
|
||||||
|
* case 2. Master thought this region opened on Server1, but regionserver reported Server2
|
||||||
|
* case 3. More than one regionservers reported opened this region
|
||||||
|
*
|
||||||
|
* @return the map of inconsistent regions. Key is the region name. Value is a pair of location in
|
||||||
|
* meta and the regionservers which reported opened this region.
|
||||||
|
*/
|
||||||
|
public Map<String, Pair<ServerName, List<ServerName>>> getInconsistentRegions() {
|
||||||
|
// Need synchronized here, as this "snapshot" may be changed after checking.
|
||||||
|
rwLock.readLock().lock();
|
||||||
|
try {
|
||||||
|
return this.inconsistentRegionsSnapshot;
|
||||||
|
} finally {
|
||||||
|
rwLock.readLock().unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used for web ui to show when the HBCK checking started.
|
||||||
|
*/
|
||||||
|
public long getCheckingStartTimestamp() {
|
||||||
|
return this.checkingStartTimestamp;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used for web ui to show when the HBCK checking report generated.
|
||||||
|
*/
|
||||||
|
public long getCheckingEndTimestamp() {
|
||||||
|
return this.checkingStartTimestamp;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1339,6 +1339,12 @@ public class AssignmentManager implements ServerListener {
|
||||||
|
|
||||||
public long submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) {
|
public long submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) {
|
||||||
boolean carryingMeta = isCarryingMeta(serverName);
|
boolean carryingMeta = isCarryingMeta(serverName);
|
||||||
|
|
||||||
|
// Remove the in-memory rsReports result
|
||||||
|
synchronized (rsReports) {
|
||||||
|
rsReports.remove(serverName);
|
||||||
|
}
|
||||||
|
|
||||||
ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
|
ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor();
|
||||||
long pid = procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(),
|
long pid = procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(),
|
||||||
serverName, shouldSplitWal, carryingMeta));
|
serverName, shouldSplitWal, carryingMeta));
|
||||||
|
@ -1892,51 +1898,13 @@ public class AssignmentManager implements ServerListener {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Found the potentially problematic opened regions. There are three case:
|
* @return a snapshot of rsReports
|
||||||
* case 1. Master thought this region opened, but no regionserver reported it.
|
|
||||||
* case 2. Master thought this region opened on Server1, but regionserver reported Server2
|
|
||||||
* case 3. More than one regionservers reported opened this region
|
|
||||||
*
|
|
||||||
* @return the map of potentially problematic opened regions. Key is the region name. Value is
|
|
||||||
* a pair of location in meta and the regionservers which reported opened this region.
|
|
||||||
*/
|
*/
|
||||||
public Map<String, Pair<ServerName, Set<ServerName>>> getProblematicRegions() {
|
public Map<ServerName, Set<byte[]>> getRSReports() {
|
||||||
Map<String, Set<ServerName>> reportedOnlineRegions = new HashMap<>();
|
Map<ServerName, Set<byte[]>> rsReportsSnapshot = new HashMap<>();
|
||||||
synchronized (rsReports) {
|
synchronized (rsReports) {
|
||||||
for (Map.Entry<ServerName, Set<byte[]>> entry : rsReports.entrySet()) {
|
rsReports.entrySet().forEach(e -> rsReportsSnapshot.put(e.getKey(), e.getValue()));
|
||||||
for (byte[] regionName : entry.getValue()) {
|
|
||||||
reportedOnlineRegions
|
|
||||||
.computeIfAbsent(RegionInfo.getRegionNameAsString(regionName), r -> new HashSet<>())
|
|
||||||
.add(entry.getKey());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
return rsReportsSnapshot;
|
||||||
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = new HashMap<>();
|
|
||||||
List<RegionState> rits = regionStates.getRegionsStateInTransition();
|
|
||||||
for (RegionState regionState : regionStates.getRegionStates()) {
|
|
||||||
// Only consider the opened region and not in transition
|
|
||||||
if (!rits.contains(regionState) && regionState.isOpened()) {
|
|
||||||
String regionName = regionState.getRegion().getRegionNameAsString();
|
|
||||||
ServerName serverName = regionState.getServerName();
|
|
||||||
if (reportedOnlineRegions.containsKey(regionName)) {
|
|
||||||
Set<ServerName> reportedServers = reportedOnlineRegions.get(regionName);
|
|
||||||
if (reportedServers.contains(serverName)) {
|
|
||||||
if (reportedServers.size() > 1) {
|
|
||||||
// More than one regionserver reported opened this region
|
|
||||||
problematicRegions.put(regionName, new Pair<>(serverName, reportedServers));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Master thought this region opened on Server1, but regionserver reported Server2
|
|
||||||
problematicRegions.put(regionName, new Pair<>(serverName, reportedServers));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Master thought this region opened, but no regionserver reported it.
|
|
||||||
problematicRegions.put(regionName, new Pair<>(serverName, new HashSet<>()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return problematicRegions;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,153 @@
|
||||||
|
<%--
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
--%>
|
||||||
|
<%@ page contentType="text/html;charset=UTF-8"
|
||||||
|
import="java.util.Date"
|
||||||
|
import="java.util.List"
|
||||||
|
import="java.util.Map"
|
||||||
|
import="java.util.stream.Collectors"
|
||||||
|
%>
|
||||||
|
<%@ page import="org.apache.hadoop.hbase.master.HbckChecker" %>
|
||||||
|
<%@ page import="org.apache.hadoop.hbase.master.HMaster" %>
|
||||||
|
<%@ page import="org.apache.hadoop.hbase.ServerName" %>
|
||||||
|
<%@ page import="org.apache.hadoop.hbase.util.Pair" %>
|
||||||
|
<%
  // Collect everything the page renders up front. All report values stay at their defaults
  // (null / 0) when the master has no HbckChecker instance to ask.
  HMaster master = (HMaster) getServletContext().getAttribute(HMaster.MASTER);
  pageContext.setAttribute("pageTitle", "HBase Master HBCK Report: " + master.getServerName());
  HbckChecker hbckChecker = master.getHbckChecker();
  Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegions = null;
  Map<String, ServerName> orphanRegionsOnRS = null;
  List<String> orphanRegionsOnFS = null;
  long startTimestamp = 0;
  long endTimestamp = 0;
  if (hbckChecker != null) {
    inconsistentRegions = hbckChecker.getInconsistentRegions();
    orphanRegionsOnRS = hbckChecker.getOrphanRegionsOnRS();
    orphanRegionsOnFS = hbckChecker.getOrphanRegionsOnFS();
    startTimestamp = hbckChecker.getCheckingStartTimestamp();
    endTimestamp = hbckChecker.getCheckingEndTimestamp();
  }
%>
<jsp:include page="header.jsp">
  <jsp:param name="pageTitle" value="${pageTitle}"/>
</jsp:include>

<div class="container-fluid content">

  <% if (!master.isInitialized()) { %>
  <div class="row">
    <div class="page-header">
      <h1>Master is not initialized</h1>
    </div>
  </div>
  <jsp:include page="redirect.jsp" />
  <% } else { %>

  <div class="row">
    <div class="page-header">
      <h1>HBCK Report</h1>
      <p>
        <%-- A timestamp of 0 means no report exists; avoid rendering it as 1 Jan 1970. --%>
        <% if (endTimestamp > 0) { %>
        <span>Checking started at <%= new Date(startTimestamp) %> and generated report at <%= new Date(endTimestamp) %></span>
        <% } else { %>
        <span>No HBCK report has been generated yet.</span>
        <% } %>
      </p>
    </div>
  </div>

  <div class="row">
    <div class="page-header">
      <h2>Inconsistent Regions</h2>
      <p>
        <span>
        There are three case: 1. Master thought this region opened, but no regionserver reported it.
        2. Master thought this region opened on Server1, but regionserver reported Server2.
        3. More than one regionservers reported opened this region.
        Notice: the reported online regionservers may be not right when there are regions in transition.
        Please check them in regionserver's web UI.
        </span>
      </p>
    </div>
  </div>

  <% if (inconsistentRegions != null && inconsistentRegions.size() > 0) { %>
  <table class="table table-striped">
    <tr>
      <th>Region</th>
      <th>Location in META</th>
      <th>Reported Online RegionServers</th>
    </tr>
    <% for (Map.Entry<String, Pair<ServerName, List<ServerName>>> entry : inconsistentRegions.entrySet()) { %>
    <tr>
      <td><%= entry.getKey() %></td>
      <td><%= entry.getValue().getFirst() %></td>
      <td><%= entry.getValue().getSecond().stream().map(ServerName::getServerName)
                .collect(Collectors.joining(", ")) %></td>
    </tr>
    <% } %>
  </table>
  <%-- The count sits after the table: a <p> is not valid as a direct child of <table>. --%>
  <p><%= inconsistentRegions.size() %> region(s) in set.</p>
  <% } %>

  <div class="row">
    <div class="page-header">
      <h2>Orphan Regions on RegionServer</h2>
    </div>
  </div>

  <% if (orphanRegionsOnRS != null && orphanRegionsOnRS.size() > 0) { %>
  <table class="table table-striped">
    <tr>
      <th>Region</th>
      <th>Reported Online RegionServer</th>
    </tr>
    <% for (Map.Entry<String, ServerName> entry : orphanRegionsOnRS.entrySet()) { %>
    <tr>
      <td><%= entry.getKey() %></td>
      <td><%= entry.getValue() %></td>
    </tr>
    <% } %>
  </table>
  <p><%= orphanRegionsOnRS.size() %> region(s) in set.</p>
  <% } %>

  <div class="row">
    <div class="page-header">
      <h2>Orphan Regions on FileSystem</h2>
    </div>
  </div>

  <% if (orphanRegionsOnFS != null && orphanRegionsOnFS.size() > 0) { %>
  <table class="table table-striped">
    <tr>
      <th>Region</th>
    </tr>
    <% for (String region : orphanRegionsOnFS) { %>
    <tr>
      <td><%= region %></td>
    </tr>
    <% } %>
  </table>
  <p><%= orphanRegionsOnFS.size() %> region(s) in set.</p>
  <% } %>

  <% } %>
</div>

<jsp:include page="footer.jsp"/>
|
|
@ -24,7 +24,6 @@ import static org.junit.Assert.assertTrue;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
|
|
||||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||||
|
@ -32,9 +31,11 @@ import org.apache.hadoop.hbase.ServerName;
|
||||||
import org.apache.hadoop.hbase.TableName;
|
import org.apache.hadoop.hbase.TableName;
|
||||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||||
|
import org.apache.hadoop.hbase.master.HbckChecker;
|
||||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||||
import org.apache.hadoop.hbase.util.Pair;
|
import org.apache.hadoop.hbase.util.Pair;
|
||||||
|
import org.junit.Before;
|
||||||
import org.junit.ClassRule;
|
import org.junit.ClassRule;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import org.junit.experimental.categories.Category;
|
import org.junit.experimental.categories.Category;
|
||||||
|
@ -42,41 +43,52 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@Category({ MasterTests.class, MediumTests.class })
|
@Category({ MasterTests.class, MediumTests.class })
|
||||||
public class TestAMProblematicRegions extends TestAssignmentManagerBase {
|
public class TestHbckChecker extends TestAssignmentManagerBase {
|
||||||
private static final Logger LOG = LoggerFactory.getLogger(TestAMProblematicRegions.class);
|
private static final Logger LOG = LoggerFactory.getLogger(TestHbckChecker.class);
|
||||||
|
|
||||||
@ClassRule
|
@ClassRule
|
||||||
public static final HBaseClassTestRule CLASS_RULE =
|
public static final HBaseClassTestRule CLASS_RULE =
|
||||||
HBaseClassTestRule.forClass(TestAMProblematicRegions.class);
|
HBaseClassTestRule.forClass(TestHbckChecker.class);
|
||||||
|
|
||||||
|
private HbckChecker hbckChecker;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
hbckChecker = new HbckChecker(master);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testForMeta() throws Exception {
|
public void testForMeta() throws Exception {
|
||||||
byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName();
|
byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName();
|
||||||
String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString();
|
String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getEncodedName();
|
||||||
List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
|
List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
|
||||||
assertEquals(NSERVERS, serverNames.size());
|
assertEquals(NSERVERS, serverNames.size());
|
||||||
|
|
||||||
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = am.getProblematicRegions();
|
hbckChecker.choreForTesting();
|
||||||
|
Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegions =
|
||||||
|
hbckChecker.getInconsistentRegions();
|
||||||
|
|
||||||
// Test for case1: Master thought this region opened, but no regionserver reported it.
|
// Test for case1: Master thought this region opened, but no regionserver reported it.
|
||||||
assertTrue(problematicRegions.containsKey(metaRegionName));
|
assertTrue(inconsistentRegions.containsKey(metaRegionName));
|
||||||
Pair<ServerName, Set<ServerName>> pair = problematicRegions.get(metaRegionName);
|
Pair<ServerName, List<ServerName>> pair = inconsistentRegions.get(metaRegionName);
|
||||||
ServerName locationInMeta = pair.getFirst();
|
ServerName locationInMeta = pair.getFirst();
|
||||||
Set<ServerName> reportedRegionServers = pair.getSecond();
|
List<ServerName> reportedRegionServers = pair.getSecond();
|
||||||
assertTrue(serverNames.contains(locationInMeta));
|
assertTrue(serverNames.contains(locationInMeta));
|
||||||
assertEquals(0, reportedRegionServers.size());
|
assertEquals(0, reportedRegionServers.size());
|
||||||
|
|
||||||
// Reported right region location. Then not in problematic regions.
|
// Reported right region location. Then not in problematic regions.
|
||||||
am.reportOnlineRegions(locationInMeta, Collections.singleton(metaRegionNameAsBytes));
|
am.reportOnlineRegions(locationInMeta, Collections.singleton(metaRegionNameAsBytes));
|
||||||
problematicRegions = am.getProblematicRegions();
|
hbckChecker.choreForTesting();
|
||||||
assertFalse(problematicRegions.containsKey(metaRegionName));
|
inconsistentRegions = hbckChecker.getInconsistentRegions();
|
||||||
|
assertFalse(inconsistentRegions.containsKey(metaRegionName));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testForUserTable() throws Exception {
|
public void testForUserTable() throws Exception {
|
||||||
TableName tableName = TableName.valueOf("testForUserTable");
|
TableName tableName = TableName.valueOf("testForUserTable");
|
||||||
RegionInfo hri = createRegionInfo(tableName, 1);
|
RegionInfo hri = createRegionInfo(tableName, 1);
|
||||||
String regionName = hri.getRegionNameAsString();
|
String regionName = hri.getEncodedName();
|
||||||
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
|
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
|
||||||
Future<byte[]> future = submitProcedure(am.createAssignProcedure(hri));
|
Future<byte[]> future = submitProcedure(am.createAssignProcedure(hri));
|
||||||
waitOnFuture(future);
|
waitOnFuture(future);
|
||||||
|
@ -85,11 +97,13 @@ public class TestAMProblematicRegions extends TestAssignmentManagerBase {
|
||||||
assertEquals(NSERVERS, serverNames.size());
|
assertEquals(NSERVERS, serverNames.size());
|
||||||
|
|
||||||
// Test for case1: Master thought this region opened, but no regionserver reported it.
|
// Test for case1: Master thought this region opened, but no regionserver reported it.
|
||||||
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = am.getProblematicRegions();
|
hbckChecker.choreForTesting();
|
||||||
assertTrue(problematicRegions.containsKey(regionName));
|
Map<String, Pair<ServerName, List<ServerName>>> inconsistentRegions =
|
||||||
Pair<ServerName, Set<ServerName>> pair = problematicRegions.get(regionName);
|
hbckChecker.getInconsistentRegions();
|
||||||
|
assertTrue(inconsistentRegions.containsKey(regionName));
|
||||||
|
Pair<ServerName, List<ServerName>> pair = inconsistentRegions.get(regionName);
|
||||||
ServerName locationInMeta = pair.getFirst();
|
ServerName locationInMeta = pair.getFirst();
|
||||||
Set<ServerName> reportedRegionServers = pair.getSecond();
|
List<ServerName> reportedRegionServers = pair.getSecond();
|
||||||
assertTrue(serverNames.contains(locationInMeta));
|
assertTrue(serverNames.contains(locationInMeta));
|
||||||
assertEquals(0, reportedRegionServers.size());
|
assertEquals(0, reportedRegionServers.size());
|
||||||
|
|
||||||
|
@ -99,9 +113,10 @@ public class TestAMProblematicRegions extends TestAssignmentManagerBase {
|
||||||
final ServerName anotherServer =
|
final ServerName anotherServer =
|
||||||
serverNames.stream().filter(s -> !s.equals(tempLocationInMeta)).findFirst().get();
|
serverNames.stream().filter(s -> !s.equals(tempLocationInMeta)).findFirst().get();
|
||||||
am.reportOnlineRegions(anotherServer, Collections.singleton(hri.getRegionName()));
|
am.reportOnlineRegions(anotherServer, Collections.singleton(hri.getRegionName()));
|
||||||
problematicRegions = am.getProblematicRegions();
|
hbckChecker.choreForTesting();
|
||||||
assertTrue(problematicRegions.containsKey(regionName));
|
inconsistentRegions = hbckChecker.getInconsistentRegions();
|
||||||
pair = problematicRegions.get(regionName);
|
assertTrue(inconsistentRegions.containsKey(regionName));
|
||||||
|
pair = inconsistentRegions.get(regionName);
|
||||||
locationInMeta = pair.getFirst();
|
locationInMeta = pair.getFirst();
|
||||||
reportedRegionServers = pair.getSecond();
|
reportedRegionServers = pair.getSecond();
|
||||||
assertEquals(1, reportedRegionServers.size());
|
assertEquals(1, reportedRegionServers.size());
|
||||||
|
@ -110,9 +125,10 @@ public class TestAMProblematicRegions extends TestAssignmentManagerBase {
|
||||||
|
|
||||||
// Test for case3: More than one regionservers reported opened this region.
|
// Test for case3: More than one regionservers reported opened this region.
|
||||||
am.reportOnlineRegions(locationInMeta, Collections.singleton(hri.getRegionName()));
|
am.reportOnlineRegions(locationInMeta, Collections.singleton(hri.getRegionName()));
|
||||||
problematicRegions = am.getProblematicRegions();
|
hbckChecker.choreForTesting();
|
||||||
assertTrue(problematicRegions.containsKey(regionName));
|
inconsistentRegions = hbckChecker.getInconsistentRegions();
|
||||||
pair = problematicRegions.get(regionName);
|
assertTrue(inconsistentRegions.containsKey(regionName));
|
||||||
|
pair = inconsistentRegions.get(regionName);
|
||||||
locationInMeta = pair.getFirst();
|
locationInMeta = pair.getFirst();
|
||||||
reportedRegionServers = pair.getSecond();
|
reportedRegionServers = pair.getSecond();
|
||||||
assertEquals(2, reportedRegionServers.size());
|
assertEquals(2, reportedRegionServers.size());
|
||||||
|
@ -121,7 +137,8 @@ public class TestAMProblematicRegions extends TestAssignmentManagerBase {
|
||||||
|
|
||||||
// Reported right region location. Then not in problematic regions.
|
// Reported right region location. Then not in problematic regions.
|
||||||
am.reportOnlineRegions(anotherServer, Collections.EMPTY_SET);
|
am.reportOnlineRegions(anotherServer, Collections.EMPTY_SET);
|
||||||
problematicRegions = am.getProblematicRegions();
|
hbckChecker.choreForTesting();
|
||||||
assertFalse(problematicRegions.containsKey(regionName));
|
inconsistentRegions = hbckChecker.getInconsistentRegions();
|
||||||
|
assertFalse(inconsistentRegions.containsKey(regionName));
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue