HBASE-22527 [hbck2] Add a master web ui to show the problematic regions

This commit is contained in:
Guanghao Zhang 2019-07-11 15:20:34 +08:00
parent 2b7e33fe52
commit f0cbdf8beb
3 changed files with 273 additions and 12 deletions

View File

@ -17,27 +17,105 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
</%doc> </%doc>
<%import> <%import>
org.apache.hadoop.hbase.master.assignment.AssignmentManager; java.util.Map;
org.apache.hadoop.hbase.master.assignment.AssignmentManager.RegionInTransitionStat; java.util.Set;
org.apache.hadoop.hbase.master.assignment.RegionStates.RegionFailedOpen; java.util.SortedSet;
org.apache.hadoop.hbase.master.RegionState; java.util.concurrent.atomic.AtomicInteger;
java.util.stream.Collectors;
org.apache.hadoop.conf.Configuration; org.apache.hadoop.conf.Configuration;
org.apache.hadoop.hbase.HBaseConfiguration; org.apache.hadoop.hbase.HBaseConfiguration;
org.apache.hadoop.hbase.HConstants; org.apache.hadoop.hbase.HConstants;
org.apache.hadoop.hbase.ServerName;
org.apache.hadoop.hbase.client.RegionInfo;
org.apache.hadoop.hbase.client.RegionInfoDisplay; org.apache.hadoop.hbase.client.RegionInfoDisplay;
java.util.HashSet; org.apache.hadoop.hbase.master.RegionState;
java.util.SortedSet; org.apache.hadoop.hbase.master.assignment.AssignmentManager;
java.util.Map; org.apache.hadoop.hbase.master.assignment.AssignmentManager.RegionInTransitionStat;
java.util.concurrent.atomic.AtomicInteger; org.apache.hadoop.hbase.master.assignment.RegionStates.RegionFailedOpen;
org.apache.hadoop.hbase.util.Pair;
</%import> </%import>
<%args> <%args>
AssignmentManager assignmentManager; AssignmentManager assignmentManager;
int limit = 100; int limit = 100;
</%args> </%args>
<%java SortedSet<RegionState> rit = assignmentManager <%java>
.getRegionStates().getRegionsInTransitionOrderedByTimestamp(); SortedSet<RegionState> rit = assignmentManager.getRegionStates()
%> .getRegionsInTransitionOrderedByTimestamp();
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = assignmentManager
.getProblematicRegions();
</%java>
<%if !problematicRegions.isEmpty() %>
<%java>
// Paging setup for the "Problematic Regions" table rendered below: at most 10 rows per tab.
// totalSize is >= 1 here because of the enclosing isEmpty() check, so sizePerPage is never 0
// and the division below is safe.
int totalSize = problematicRegions.size();
int sizePerPage = Math.min(10, totalSize);
// Round up so that a partially filled last page still gets its own tab. The markup below pads
// that last page with blank rows until it holds sizePerPage rows (presumably to keep all tabs
// the same height).
int numOfPages = (int) Math.ceil(totalSize * 1.0 / sizePerPage);
</%java>
<section>
<h2><a name="problem-regions">Problematic Regions</a></h2>
<p>
<span>
<% problematicRegions.size() %> problematic region(s). Notice: the reported online
regionservers may be not right when there are regions in transition. Please check them
in regionserver's web UI.
</span>
</p>
<div class="tabbable">
<div class="tab-content">
<%java int recordItr = 0; %>
<%for Map.Entry<String, Pair<ServerName, Set<ServerName>>> entry : problematicRegions.entrySet() %>
<%if (recordItr % sizePerPage) == 0 %>
<%if recordItr == 0 %>
<div class="tab-pane active" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
<%else>
<div class="tab-pane" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
</%if>
<table class="table table-striped" style="margin-bottom:0px;">
<tr>
<th>Region</th>
<th>Location in META</th>
<th>Reported Online Region Servers</th>
</tr>
</%if>
<tr>
<td><% entry.getKey() %></td>
<td><% entry.getValue().getFirst() %></td>
<td><% entry.getValue().getSecond().stream().map(ServerName::getServerName)
.collect(Collectors.joining(", ")) %></td>
</tr>
<%java recordItr++; %>
<%if (recordItr % sizePerPage) == 0 %>
</table>
</div>
</%if>
</%for>
<%if (recordItr % sizePerPage) != 0 %>
<%for ; (recordItr % sizePerPage) != 0 ; recordItr++ %>
<tr><td colspan="3" style="height:61px"></td></tr>
</%for>
</table>
</div>
</%if>
</div>
<nav>
<ul class="nav nav-pills pagination">
<%for int i = 1 ; i <= numOfPages; i++ %>
<%if i == 1 %>
<li class="active">
<%else>
<li>
</%if>
<a href="#tab_prs<% i %>"><% i %></a></li>
</%for>
</ul>
</nav>
</div>
</section>
</%if>
<%if !rit.isEmpty() %> <%if !rit.isEmpty() %>
<%java> <%java>

View File

@ -158,6 +158,8 @@ public class AssignmentManager {
private final RegionStates regionStates = new RegionStates(); private final RegionStates regionStates = new RegionStates();
private final RegionStateStore regionStateStore; private final RegionStateStore regionStateStore;
private final Map<ServerName, Set<byte[]>> rsReports = new HashMap<>();
private final boolean shouldAssignRegionsWithFavoredNodes; private final boolean shouldAssignRegionsWithFavoredNodes;
private final int assignDispatchWaitQueueMaxSize; private final int assignDispatchWaitQueueMaxSize;
private final int assignDispatchWaitMillis; private final int assignDispatchWaitMillis;
@ -1065,13 +1067,18 @@ public class AssignmentManager {
} }
ServerStateNode serverNode = regionStates.getOrCreateServer(serverName); ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);
synchronized (serverNode) { synchronized (serverNode) {
if (!serverNode.isInState(ServerState.ONLINE)) { if (!serverNode.isInState(ServerState.ONLINE)) {
LOG.warn("Got a report from a server result in state " + serverNode.getState()); LOG.warn("Got a report from a server result in state " + serverNode.getState());
return; return;
} }
} }
// Track the regionserver reported online regions in memory.
synchronized (rsReports) {
rsReports.put(serverName, regionNames);
}
if (regionNames.isEmpty()) { if (regionNames.isEmpty()) {
// nothing to do if we don't have regions // nothing to do if we don't have regions
LOG.trace("no online region found on {}", serverName); LOG.trace("no online region found on {}", serverName);
@ -2028,4 +2035,53 @@ public class AssignmentManager {
MasterServices getMaster() { MasterServices getMaster() {
return master; return master;
} }
/**
 * Finds opened regions whose location known to the master does not match what the
 * regionservers reported. A region that is opened and not in transition is considered
 * problematic in any of these cases:
 * <ol>
 * <li>The master thinks the region is opened, but no regionserver reported it.</li>
 * <li>The master thinks the region is opened on one server, but only other server(s)
 * reported it.</li>
 * <li>More than one regionserver reported the region as opened.</li>
 * </ol>
 *
 * @return a map keyed by region name; each value pairs the server the master has for the
 *         region (shown as the location in meta) with the set of regionservers that reported
 *         the region online. The set is empty when no server reported it.
 */
public Map<String, Pair<ServerName, Set<ServerName>>> getProblematicRegions() {
  // Invert the per-server reports into: region name -> servers that reported it online.
  Map<String, Set<ServerName>> reportersByRegion = new HashMap<>();
  synchronized (rsReports) {
    for (Map.Entry<ServerName, Set<byte[]>> report : rsReports.entrySet()) {
      ServerName reporter = report.getKey();
      for (byte[] rawRegionName : report.getValue()) {
        String reportedName = RegionInfo.getRegionNameAsString(rawRegionName);
        reportersByRegion.computeIfAbsent(reportedName, r -> new HashSet<>()).add(reporter);
      }
    }
  }

  Map<String, Pair<ServerName, Set<ServerName>>> result = new HashMap<>();
  List<RegionState> inTransition = regionStates.getRegionsStateInTransition();
  for (RegionState state : regionStates.getRegionStates()) {
    // Only opened regions that are not in transition are examined.
    if (inTransition.contains(state) || !state.isOpened()) {
      continue;
    }
    String regionName = state.getRegion().getRegionNameAsString();
    ServerName masterLocation = state.getServerName();
    Set<ServerName> reporters = reportersByRegion.get(regionName);
    if (reporters == null) {
      // Case 1: the master thinks it is opened, but no regionserver reported it.
      result.put(regionName, new Pair<>(masterLocation, new HashSet<>()));
    } else if (!reporters.contains(masterLocation) || reporters.size() > 1) {
      // Case 2: reported only by server(s) other than the master's location, or
      // case 3: reported by more than one regionserver.
      result.put(regionName, new Pair<>(masterLocation, reporters));
    }
  }
  return result;
}
} }

View File

@ -0,0 +1,127 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master.assignment;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Future;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Pair;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Exercises {@code getProblematicRegions()} on the assignment manager under test by feeding
 * it regionserver reports through {@code reportOnlineRegions} and checking which regions are
 * flagged. The {@code am}, {@code master}, {@code rsDispatcher} and {@code NSERVERS} members
 * and the procedure helpers are not declared here; presumably they are inherited from
 * {@link TestAssignmentManagerBase}.
 */
@Category({ MasterTests.class, MediumTests.class })
public class TestAMProblematicRegions extends TestAssignmentManagerBase {
  private static final Logger LOG = LoggerFactory.getLogger(TestAMProblematicRegions.class);

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestAMProblematicRegions.class);

  /**
   * The meta region is flagged while no regionserver has reported it, and is no longer
   * flagged once the server recorded as its location reports it.
   */
  @Test
  public void testForMeta() {
    byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName();
    String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString();
    List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
    assertEquals(NSERVERS, serverNames.size());

    Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = am.getProblematicRegions();

    // Test for case1: Master thought this region opened, but no regionserver reported it.
    assertTrue(problematicRegions.containsKey(metaRegionName));
    Pair<ServerName, Set<ServerName>> pair = problematicRegions.get(metaRegionName);
    ServerName locationInMeta = pair.getFirst();
    Set<ServerName> reportedRegionServers = pair.getSecond();
    assertTrue(serverNames.contains(locationInMeta));
    assertEquals(0, reportedRegionServers.size());

    // The expected server reports the region, so it must no longer be flagged.
    am.reportOnlineRegions(locationInMeta, Collections.singleton(metaRegionNameAsBytes));
    problematicRegions = am.getProblematicRegions();
    assertFalse(problematicRegions.containsKey(metaRegionName));
  }

  /**
   * Walks a user-table region through all three problematic cases (nobody reported it, only
   * a different server reported it, two servers reported it) and finally back to a consistent
   * state.
   */
  @Test
  public void testForUserTable() throws Exception {
    TableName tableName = TableName.valueOf("testForUserTable");
    RegionInfo hri = createRegionInfo(tableName, 1);
    String regionName = hri.getRegionNameAsString();
    rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
    Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
    waitOnFuture(future);

    List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
    assertEquals(NSERVERS, serverNames.size());

    // Test for case1: Master thought this region opened, but no regionserver reported it.
    Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = am.getProblematicRegions();
    assertTrue(problematicRegions.containsKey(regionName));
    Pair<ServerName, Set<ServerName>> pair = problematicRegions.get(regionName);
    ServerName locationInMeta = pair.getFirst();
    Set<ServerName> reportedRegionServers = pair.getSecond();
    assertTrue(serverNames.contains(locationInMeta));
    assertEquals(0, reportedRegionServers.size());

    // Test for case2: Master thought this region opened on Server1, but regionserver reported
    // Server2
    final ServerName tempLocationInMeta = locationInMeta;
    // Fail with a readable message instead of a bare NoSuchElementException when the cluster
    // has no second server to report from.
    final ServerName anotherServer = serverNames.stream()
      .filter(s -> !s.equals(tempLocationInMeta))
      .findFirst()
      .orElseThrow(() -> new AssertionError(
        "Need an online server other than " + tempLocationInMeta));
    am.reportOnlineRegions(anotherServer, Collections.singleton(hri.getRegionName()));
    problematicRegions = am.getProblematicRegions();
    assertTrue(problematicRegions.containsKey(regionName));
    pair = problematicRegions.get(regionName);
    locationInMeta = pair.getFirst();
    reportedRegionServers = pair.getSecond();
    assertEquals(1, reportedRegionServers.size());
    assertFalse(reportedRegionServers.contains(locationInMeta));
    assertTrue(reportedRegionServers.contains(anotherServer));

    // Test for case3: More than one regionservers reported opened this region.
    am.reportOnlineRegions(locationInMeta, Collections.singleton(hri.getRegionName()));
    problematicRegions = am.getProblematicRegions();
    assertTrue(problematicRegions.containsKey(regionName));
    pair = problematicRegions.get(regionName);
    locationInMeta = pair.getFirst();
    reportedRegionServers = pair.getSecond();
    assertEquals(2, reportedRegionServers.size());
    assertTrue(reportedRegionServers.contains(locationInMeta));
    assertTrue(reportedRegionServers.contains(anotherServer));

    // The other server now reports no regions, leaving only the expected server reporting
    // this region, so it must no longer be flagged. emptySet() is used instead of the raw
    // EMPTY_SET constant to avoid an unchecked conversion to Set<byte[]>.
    am.reportOnlineRegions(anotherServer, Collections.emptySet());
    problematicRegions = am.getProblematicRegions();
    assertFalse(problematicRegions.containsKey(regionName));
  }
}