From ada772a1d31b2569f349d5bea385c77ea8a40d38 Mon Sep 17 00:00:00 2001 From: Sergey Shelukhin Date: Mon, 10 Jun 2019 15:43:27 -0700 Subject: [PATCH] HBASE-22408 add dead and unknown server open regions metric to AM Signed-off-by: Duo Zhang --- .../MetricsAssignmentManagerSource.java | 6 ++ .../MetricsAssignmentManagerSourceImpl.java | 14 +++ .../master/MetricsAssignmentManager.java | 8 ++ .../hadoop/hbase/master/ServerManager.java | 17 +++- .../master/assignment/AssignmentManager.java | 89 ++++++++++++++++++- .../hbase/master/assignment/RegionStates.java | 8 +- 6 files changed, 137 insertions(+), 5 deletions(-) diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java index 5e768c419a5..3a71c24b2d0 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java @@ -50,6 +50,8 @@ public interface MetricsAssignmentManagerSource extends BaseSource { String RIT_COUNT_OVER_THRESHOLD_NAME = "ritCountOverThreshold"; String RIT_OLDEST_AGE_NAME = "ritOldestAge"; String RIT_DURATION_NAME = "ritDuration"; + String DEAD_SERVER_OPEN_REGIONS = "deadServerOpenRegions"; + String UNKNOWN_SERVER_OPEN_REGIONS = "unknownServerOpenRegions"; String RIT_COUNT_DESC = "Current number of Regions In Transition (Gauge)."; String RIT_COUNT_OVER_THRESHOLD_DESC = @@ -93,6 +95,10 @@ public interface MetricsAssignmentManagerSource extends BaseSource { void updateRitDuration(long duration); + void updateDeadServerOpenRegions(int deadRegions); + + void updateUnknownServerOpenRegions(int unknownRegions); + /** * TODO: Remove. This may not be needed now as assign and unassign counts are tracked separately * Increment the count of operations (assign/unassign). 
diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java index 2532a2d836b..8b28d7eaafe 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java @@ -34,6 +34,8 @@ public class MetricsAssignmentManagerSourceImpl private MutableGaugeLong ritCountOverThresholdGauge; private MutableGaugeLong ritOldestAgeGauge; private MetricHistogram ritDurationHisto; + private MutableGaugeLong deadServerOpenRegions; + private MutableGaugeLong unknownServerOpenRegions; private MutableFastCounter operationCounter; @@ -63,6 +65,8 @@ public class MetricsAssignmentManagerSourceImpl ritOldestAgeGauge = metricsRegistry.newGauge(RIT_OLDEST_AGE_NAME, RIT_OLDEST_AGE_DESC, 0L); ritDurationHisto = metricsRegistry.newTimeHistogram(RIT_DURATION_NAME, RIT_DURATION_DESC); operationCounter = metricsRegistry.getCounter(OPERATION_COUNT_NAME, 0L); + deadServerOpenRegions = metricsRegistry.newGauge(DEAD_SERVER_OPEN_REGIONS, "", 0); + unknownServerOpenRegions = metricsRegistry.newGauge(UNKNOWN_SERVER_OPEN_REGIONS, "", 0); /** * NOTE: Please refer to HBASE-9774 and HBASE-14282. 
Based on these two issues, HBase is @@ -104,6 +108,16 @@ public class MetricsAssignmentManagerSourceImpl ritDurationHisto.add(duration); } + @Override + public void updateDeadServerOpenRegions(int deadRegions) { + deadServerOpenRegions.set(deadRegions); + } + + @Override + public void updateUnknownServerOpenRegions(int unknownRegions) { + unknownServerOpenRegions.set(unknownRegions); + } + @Override public OperationMetrics getAssignMetrics() { return assignMetrics; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java index 8b214f83e63..38aeef218eb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java @@ -96,6 +96,14 @@ public class MetricsAssignmentManager { assignmentManagerSource.incrementOperationCounter(); } + public void updateDeadServerOpenRegions(int deadRegions) { + assignmentManagerSource.updateDeadServerOpenRegions(deadRegions); + } + + public void updateUnknownServerOpenRegions(int unknownRegions) { + assignmentManagerSource.updateUnknownServerOpenRegions(unknownRegions); + } + /** * @return Set of common metrics for assign procedure */ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 88edb79d794..02a27f8dd4c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -620,8 +620,9 @@ public class ServerManager { } } + // Note: this is currently invoked from RPC, not just tests. Locking in this class needs cleanup. 
@VisibleForTesting - public void moveFromOnlineToDeadServers(final ServerName sn) { + public synchronized void moveFromOnlineToDeadServers(final ServerName sn) { synchronized (onlineServers) { if (!this.onlineServers.containsKey(sn)) { LOG.trace("Expiration of {} but server not online", sn); @@ -907,6 +908,20 @@ public class ServerManager { return serverName != null && onlineServers.containsKey(serverName); } + public enum ServerLiveState { + LIVE, + DEAD, + UNKNOWN + } + + /** + * @return whether the server is online, dead, or unknown. + */ + public synchronized ServerLiveState isServerKnownAndOnline(ServerName serverName) { + return onlineServers.containsKey(serverName) ? ServerLiveState.LIVE + : (deadservers.isDeadServer(serverName) ? ServerLiveState.DEAD : ServerLiveState.UNKNOWN); + } + /** * Check if a server is known to be dead. A server can be online, * or known to be dead, or unknown to this manager (i.e, not online, diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 921101bbfc9..ac41fffc4c5 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -53,6 +54,7 @@ import org.apache.hadoop.hbase.master.MetricsAssignmentManager; import org.apache.hadoop.hbase.master.RegionPlan; import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.RegionState.State; +import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.master.TableStateManager; import 
org.apache.hadoop.hbase.master.balancer.FavoredStochasticBalancer; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; @@ -127,6 +129,10 @@ public class AssignmentManager { "hbase.assignment.rit.chore.interval.msec"; private static final int DEFAULT_RIT_CHORE_INTERVAL_MSEC = 60 * 1000; + public static final String DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC_CONF_KEY = + "hbase.assignment.dead.region.metric.chore.interval.msec"; + private static final int DEFAULT_DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC = 120 * 1000; + public static final String ASSIGN_MAX_ATTEMPTS = "hbase.assignment.maximum.attempts"; private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE; @@ -145,6 +151,7 @@ public class AssignmentManager { private final MetricsAssignmentManager metrics; private final RegionInTransitionChore ritChore; + private final DeadServerMetricRegionChore deadMetricChore; private final MasterServices master; private final AtomicBoolean running = new AtomicBoolean(false); @@ -190,6 +197,14 @@ public class AssignmentManager { int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY, DEFAULT_RIT_CHORE_INTERVAL_MSEC); this.ritChore = new RegionInTransitionChore(ritChoreInterval); + + int deadRegionChoreInterval = conf.getInt(DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC_CONF_KEY, + DEFAULT_DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC); + if (deadRegionChoreInterval > 0) { + this.deadMetricChore = new DeadServerMetricRegionChore(deadRegionChoreInterval); + } else { + this.deadMetricChore = null; + } } public void start() throws IOException, KeeperException { @@ -271,6 +286,9 @@ public class AssignmentManager { // Remove the RIT chore if (hasProcExecutor) { master.getMasterProcedureExecutor().removeChore(this.ritChore); + if (this.deadMetricChore != null) { + master.getMasterProcedureExecutor().removeChore(this.deadMetricChore); + } } // Stop the Assignment Thread @@ -1130,6 +1148,69 @@ public class AssignmentManager { } } + private static class 
DeadServerMetricRegionChore + extends ProcedureInMemoryChore { + public DeadServerMetricRegionChore(final int timeoutMsec) { + super(timeoutMsec); + } + + @Override + protected void periodicExecute(final MasterProcedureEnv env) { + final ServerManager sm = env.getMasterServices().getServerManager(); + final AssignmentManager am = env.getAssignmentManager(); + // To minimize inconsistencies we are not going to snapshot live servers in advance in case + // new servers are added; OTOH we don't want to add heavy sync for a consistent view since + // this is for metrics. Instead, we're going to check each region as we go; to avoid making + // too many checks, we maintain a local list of servers, limiting us to false negatives. If + // we miss some recently-dead server, we'll just see it next time. + Set recentlyLiveServers = new HashSet<>(); + int deadRegions = 0, unknownRegions = 0; + for (RegionStateNode rsn : am.getRegionStates().getRegionStateNodes()) { + if (rsn.getState() != State.OPEN) { + continue; // Opportunistic check, should quickly skip RITs, offline tables, etc. + } + ServerName sn; + State state; + rsn.lock(); + try { + sn = rsn.getRegionLocation(); + state = rsn.getState(); + } finally { + rsn.unlock(); + } + if (state != State.OPEN) { + continue; // Mostly skipping RITs that are already being taken care of. + } + if (sn == null) { + ++unknownRegions; // Opened on null? 
+ continue; + } + if (recentlyLiveServers.contains(sn)) { + continue; + } + ServerManager.ServerLiveState sls = sm.isServerKnownAndOnline(sn); + switch (sls) { + case LIVE: + recentlyLiveServers.add(sn); + break; + case DEAD: + ++deadRegions; + break; + case UNKNOWN: + ++unknownRegions; + break; + default: throw new AssertionError("Unexpected " + sls); + } + } + if (deadRegions > 0 || unknownRegions > 0) { + LOG.info("Found {} OPEN regions on dead servers and {} OPEN regions on unknown servers", + deadRegions, unknownRegions); + } + + am.updateDeadServerRegionMetrics(deadRegions, unknownRegions); + } + } + public RegionInTransitionStat computeRegionInTransitionStat() { final RegionInTransitionStat rit = new RegionInTransitionStat(getConfiguration()); rit.update(this); @@ -1236,6 +1317,11 @@ public class AssignmentManager { metrics.updateRITCountOverThreshold(ritStat.getTotalRITsOverThreshold()); } + private void updateDeadServerRegionMetrics(int deadRegions, int unknownRegions) { + metrics.updateDeadServerOpenRegions(deadRegions); + metrics.updateUnknownServerOpenRegions(unknownRegions); + } + private void handleRegionOverStuckWarningThreshold(final RegionInfo regionInfo) { final RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo); //if (regionNode.isStuck()) { @@ -1261,8 +1347,9 @@ public class AssignmentManager { } LOG.info("Number of RegionServers={}", master.getServerManager().countOfRegionServers()); - // Start the RIT chore + // Start the chores master.getMasterProcedureExecutor().addChore(this.ritChore); + master.getMasterProcedureExecutor().addChore(this.deadMetricChore); long costMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime); LOG.info("Joined the cluster in {}", StringUtils.humanTimeDiff(costMs)); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java index 4728d1f4a64..0efeb32e274 
100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java @@ -178,12 +178,14 @@ public class RegionStates { return regions; } - Collection getRegionStateNodes() { - return regionsMap.values(); + /** @return A view of region state nodes for all the regions. */ + public Collection getRegionStateNodes() { + return Collections.unmodifiableCollection(regionsMap.values()); } + /** @return A snapshot of region state nodes for all the regions. */ public ArrayList getRegionStates() { - final ArrayList regions = new ArrayList(regionsMap.size()); + final ArrayList regions = new ArrayList<>(regionsMap.size()); for (RegionStateNode node: regionsMap.values()) { regions.add(node.toRegionState()); }