From 9e59b7ff5ec426d011f8f0b0ac23b3a997c89bf4 Mon Sep 17 00:00:00 2001 From: Guanghao Zhang Date: Thu, 11 Jul 2019 15:20:34 +0800 Subject: [PATCH] HBASE-22527 [hbck2] Add a master web ui to show the problematic regions --- .../master/AssignmentManagerStatusTmpl.jamon | 100 ++++++++++++-- .../master/assignment/AssignmentManager.java | 58 +++++++- .../assignment/TestAMProblematicRegions.java | 127 ++++++++++++++++++ 3 files changed, 273 insertions(+), 12 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAMProblematicRegions.java diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon index 9c6916e768d..90351aadd5a 100644 --- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon +++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon @@ -17,27 +17,105 @@ See the License for the specific language governing permissions and limitations under the License. 
<%import> -org.apache.hadoop.hbase.master.assignment.AssignmentManager; -org.apache.hadoop.hbase.master.assignment.AssignmentManager.RegionInTransitionStat; -org.apache.hadoop.hbase.master.assignment.RegionStates.RegionFailedOpen; -org.apache.hadoop.hbase.master.RegionState; +java.util.Map; +java.util.Set; +java.util.SortedSet; +java.util.concurrent.atomic.AtomicInteger; +java.util.stream.Collectors; org.apache.hadoop.conf.Configuration; org.apache.hadoop.hbase.HBaseConfiguration; org.apache.hadoop.hbase.HConstants; +org.apache.hadoop.hbase.ServerName; +org.apache.hadoop.hbase.client.RegionInfo; org.apache.hadoop.hbase.client.RegionInfoDisplay; -java.util.HashSet; -java.util.SortedSet; -java.util.Map; -java.util.concurrent.atomic.AtomicInteger; +org.apache.hadoop.hbase.master.RegionState; +org.apache.hadoop.hbase.master.assignment.AssignmentManager; +org.apache.hadoop.hbase.master.assignment.AssignmentManager.RegionInTransitionStat; +org.apache.hadoop.hbase.master.assignment.RegionStates.RegionFailedOpen; +org.apache.hadoop.hbase.util.Pair; <%args> AssignmentManager assignmentManager; int limit = 100; -<%java SortedSet rit = assignmentManager - .getRegionStates().getRegionsInTransitionOrderedByTimestamp(); -%> +<%java> +SortedSet rit = assignmentManager.getRegionStates() + .getRegionsInTransitionOrderedByTimestamp(); +Map>> problematicRegions = assignmentManager + .getProblematicRegions(); + + +<%if !problematicRegions.isEmpty() %> +<%java> +int totalSize = problematicRegions.size(); +int sizePerPage = Math.min(10, totalSize); +int numOfPages = (int) Math.ceil(totalSize * 1.0 / sizePerPage); + +
+

Problematic Regions

+

+ + <% problematicRegions.size() %> problematic region(s). Notice: the reported online + regionservers may be not right when there are regions in transition. Please check them + in regionserver's web UI. + +

+
+
+ <%java int recordItr = 0; %> + <%for Map.Entry>> entry : problematicRegions.entrySet() %> + <%if (recordItr % sizePerPage) == 0 %> + <%if recordItr == 0 %> +
+ <%else> +
+ + + + + + + + + + + + + + + <%java recordItr++; %> + <%if (recordItr % sizePerPage) == 0 %> +
RegionLocation in METAReported Online Region Servers
<% entry.getKey() %><% entry.getValue().getFirst() %><% entry.getValue().getSecond().stream().map(ServerName::getServerName) + .collect(Collectors.joining(", ")) %>
+
+ + + + <%if (recordItr % sizePerPage) != 0 %> + <%for ; (recordItr % sizePerPage) != 0 ; recordItr++ %> + + + +
+ + +
+ +
+
+ <%if !rit.isEmpty() %> <%java> diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 5ad3ba43ee3..ea4a99f5dbd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -158,6 +158,8 @@ public class AssignmentManager { private final RegionStates regionStates = new RegionStates(); private final RegionStateStore regionStateStore; + private final Map> rsReports = new HashMap<>(); + private final boolean shouldAssignRegionsWithFavoredNodes; private final int assignDispatchWaitQueueMaxSize; private final int assignDispatchWaitMillis; @@ -1065,13 +1067,18 @@ public class AssignmentManager { } ServerStateNode serverNode = regionStates.getOrCreateServer(serverName); - synchronized (serverNode) { if (!serverNode.isInState(ServerState.ONLINE)) { LOG.warn("Got a report from a server result in state " + serverNode.getState()); return; } } + + // Track the regionserver reported online regions in memory. + synchronized (rsReports) { + rsReports.put(serverName, regionNames); + } + if (regionNames.isEmpty()) { // nothing to do if we don't have regions LOG.trace("no online region found on {}", serverName); @@ -2022,4 +2029,53 @@ public class AssignmentManager { MasterServices getMaster() { return master; } + + /** + * Find the potentially problematic opened regions. There are three cases: + * case 1. Master thought this region opened, but no regionserver reported it. + * case 2. Master thought this region opened on Server1, but regionserver reported Server2 + * case 3. More than one regionserver reported this region as opened + * + * @return the map of potentially problematic opened regions. Key is the region name. 
Value is + * a pair of location in meta and the regionservers which reported opened this region. + */ + public Map>> getProblematicRegions() { + Map> reportedOnlineRegions = new HashMap<>(); + synchronized (rsReports) { + for (Map.Entry> entry : rsReports.entrySet()) { + for (byte[] regionName : entry.getValue()) { + reportedOnlineRegions + .computeIfAbsent(RegionInfo.getRegionNameAsString(regionName), r -> new HashSet<>()) + .add(entry.getKey()); + } + } + } + + Map>> problematicRegions = new HashMap<>(); + List rits = regionStates.getRegionsStateInTransition(); + for (RegionState regionState : regionStates.getRegionStates()) { + // Only consider the opened region and not in transition + if (!rits.contains(regionState) && regionState.isOpened()) { + String regionName = regionState.getRegion().getRegionNameAsString(); + ServerName serverName = regionState.getServerName(); + if (reportedOnlineRegions.containsKey(regionName)) { + Set reportedServers = reportedOnlineRegions.get(regionName); + if (reportedServers.contains(serverName)) { + if (reportedServers.size() > 1) { + // More than one regionserver reported opened this region + problematicRegions.put(regionName, new Pair<>(serverName, reportedServers)); + } + } else { + // Master thought this region opened on Server1, but regionserver reported Server2 + problematicRegions.put(regionName, new Pair<>(serverName, reportedServers)); + } + } else { + // Master thought this region opened, but no regionserver reported it. 
+ problematicRegions.put(regionName, new Pair<>(serverName, new HashSet<>())); + } + } + } + + return problematicRegions; + } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAMProblematicRegions.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAMProblematicRegions.java new file mode 100644 index 00000000000..2c86a09bfcc --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAMProblematicRegions.java @@ -0,0 +1,127 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase.master.assignment; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.Future; + +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionInfoBuilder; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.Pair; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Category({ MasterTests.class, MediumTests.class }) +public class TestAMProblematicRegions extends TestAssignmentManagerBase { + private static final Logger LOG = LoggerFactory.getLogger(TestAMProblematicRegions.class); + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestAMProblematicRegions.class); + + @Test + public void testForMeta() { + byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName(); + String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString(); + List serverNames = master.getServerManager().getOnlineServersList(); + assertEquals(NSERVERS, serverNames.size()); + + Map>> problematicRegions = am.getProblematicRegions(); + + // Test for case1: Master thought this region opened, but no regionserver reported it. 
+ assertTrue(problematicRegions.containsKey(metaRegionName)); + Pair> pair = problematicRegions.get(metaRegionName); + ServerName locationInMeta = pair.getFirst(); + Set reportedRegionServers = pair.getSecond(); + assertTrue(serverNames.contains(locationInMeta)); + assertEquals(0, reportedRegionServers.size()); + + // Reported right region location. Then not in problematic regions. + am.reportOnlineRegions(locationInMeta, Collections.singleton(metaRegionNameAsBytes)); + problematicRegions = am.getProblematicRegions(); + assertFalse(problematicRegions.containsKey(metaRegionName)); + } + + @Test + public void testForUserTable() throws Exception { + TableName tableName = TableName.valueOf("testForUserTable"); + RegionInfo hri = createRegionInfo(tableName, 1); + String regionName = hri.getRegionNameAsString(); + rsDispatcher.setMockRsExecutor(new GoodRsExecutor()); + Future future = submitProcedure(createAssignProcedure(hri)); + waitOnFuture(future); + + List serverNames = master.getServerManager().getOnlineServersList(); + assertEquals(NSERVERS, serverNames.size()); + + // Test for case1: Master thought this region opened, but no regionserver reported it. 
+ Map>> problematicRegions = am.getProblematicRegions(); + assertTrue(problematicRegions.containsKey(regionName)); + Pair> pair = problematicRegions.get(regionName); + ServerName locationInMeta = pair.getFirst(); + Set reportedRegionServers = pair.getSecond(); + assertTrue(serverNames.contains(locationInMeta)); + assertEquals(0, reportedRegionServers.size()); + + // Test for case2: Master thought this region opened on Server1, but regionserver reported + // Server2 + final ServerName tempLocationInMeta = locationInMeta; + final ServerName anotherServer = + serverNames.stream().filter(s -> !s.equals(tempLocationInMeta)).findFirst().get(); + am.reportOnlineRegions(anotherServer, Collections.singleton(hri.getRegionName())); + problematicRegions = am.getProblematicRegions(); + assertTrue(problematicRegions.containsKey(regionName)); + pair = problematicRegions.get(regionName); + locationInMeta = pair.getFirst(); + reportedRegionServers = pair.getSecond(); + assertEquals(1, reportedRegionServers.size()); + assertFalse(reportedRegionServers.contains(locationInMeta)); + assertTrue(reportedRegionServers.contains(anotherServer)); + + // Test for case3: More than one regionservers reported opened this region. + am.reportOnlineRegions(locationInMeta, Collections.singleton(hri.getRegionName())); + problematicRegions = am.getProblematicRegions(); + assertTrue(problematicRegions.containsKey(regionName)); + pair = problematicRegions.get(regionName); + locationInMeta = pair.getFirst(); + reportedRegionServers = pair.getSecond(); + assertEquals(2, reportedRegionServers.size()); + assertTrue(reportedRegionServers.contains(locationInMeta)); + assertTrue(reportedRegionServers.contains(anotherServer)); + + // Reported right region location. Then not in problematic regions. + am.reportOnlineRegions(anotherServer, Collections.EMPTY_SET); + problematicRegions = am.getProblematicRegions(); + assertFalse(problematicRegions.containsKey(regionName)); + } +} \ No newline at end of file