HBASE-22408 add dead and unknown server open regions metric to AM
Signed-off-by: Duo Zhang <zhangduo@apache.org>
This commit is contained in:
parent
2f9d995163
commit
ada772a1d3
|
@ -50,6 +50,8 @@ public interface MetricsAssignmentManagerSource extends BaseSource {
|
||||||
String RIT_COUNT_OVER_THRESHOLD_NAME = "ritCountOverThreshold";
|
String RIT_COUNT_OVER_THRESHOLD_NAME = "ritCountOverThreshold";
|
||||||
String RIT_OLDEST_AGE_NAME = "ritOldestAge";
|
String RIT_OLDEST_AGE_NAME = "ritOldestAge";
|
||||||
String RIT_DURATION_NAME = "ritDuration";
|
String RIT_DURATION_NAME = "ritDuration";
|
||||||
|
String DEAD_SERVER_OPEN_REGIONS = "deadServerOpenRegions";
|
||||||
|
String UNKNOWN_SERVER_OPEN_REGIONS = "unknownServerOpenRegions";
|
||||||
|
|
||||||
String RIT_COUNT_DESC = "Current number of Regions In Transition (Gauge).";
|
String RIT_COUNT_DESC = "Current number of Regions In Transition (Gauge).";
|
||||||
String RIT_COUNT_OVER_THRESHOLD_DESC =
|
String RIT_COUNT_OVER_THRESHOLD_DESC =
|
||||||
|
@ -93,6 +95,10 @@ public interface MetricsAssignmentManagerSource extends BaseSource {
|
||||||
|
|
||||||
void updateRitDuration(long duration);
|
void updateRitDuration(long duration);
|
||||||
|
|
||||||
|
void updateDeadServerOpenRegions(int deadRegions);
|
||||||
|
|
||||||
|
void updateUnknownServerOpenRegions(int unknownRegions);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TODO: Remove. This may not be needed now as assign and unassign counts are tracked separately
|
* TODO: Remove. This may not be needed now as assign and unassign counts are tracked separately
|
||||||
* Increment the count of operations (assign/unassign).
|
* Increment the count of operations (assign/unassign).
|
||||||
|
|
|
@ -34,6 +34,8 @@ public class MetricsAssignmentManagerSourceImpl
|
||||||
private MutableGaugeLong ritCountOverThresholdGauge;
|
private MutableGaugeLong ritCountOverThresholdGauge;
|
||||||
private MutableGaugeLong ritOldestAgeGauge;
|
private MutableGaugeLong ritOldestAgeGauge;
|
||||||
private MetricHistogram ritDurationHisto;
|
private MetricHistogram ritDurationHisto;
|
||||||
|
private MutableGaugeLong deadServerOpenRegions;
|
||||||
|
private MutableGaugeLong unknownServerOpenRegions;
|
||||||
|
|
||||||
private MutableFastCounter operationCounter;
|
private MutableFastCounter operationCounter;
|
||||||
|
|
||||||
|
@ -63,6 +65,8 @@ public class MetricsAssignmentManagerSourceImpl
|
||||||
ritOldestAgeGauge = metricsRegistry.newGauge(RIT_OLDEST_AGE_NAME, RIT_OLDEST_AGE_DESC, 0L);
|
ritOldestAgeGauge = metricsRegistry.newGauge(RIT_OLDEST_AGE_NAME, RIT_OLDEST_AGE_DESC, 0L);
|
||||||
ritDurationHisto = metricsRegistry.newTimeHistogram(RIT_DURATION_NAME, RIT_DURATION_DESC);
|
ritDurationHisto = metricsRegistry.newTimeHistogram(RIT_DURATION_NAME, RIT_DURATION_DESC);
|
||||||
operationCounter = metricsRegistry.getCounter(OPERATION_COUNT_NAME, 0L);
|
operationCounter = metricsRegistry.getCounter(OPERATION_COUNT_NAME, 0L);
|
||||||
|
deadServerOpenRegions = metricsRegistry.newGauge(DEAD_SERVER_OPEN_REGIONS, "", 0);
|
||||||
|
unknownServerOpenRegions = metricsRegistry.newGauge(UNKNOWN_SERVER_OPEN_REGIONS, "", 0);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* NOTE: Please refer to HBASE-9774 and HBASE-14282. Based on these two issues, HBase is
|
* NOTE: Please refer to HBASE-9774 and HBASE-14282. Based on these two issues, HBase is
|
||||||
|
@ -104,6 +108,16 @@ public class MetricsAssignmentManagerSourceImpl
|
||||||
ritDurationHisto.add(duration);
|
ritDurationHisto.add(duration);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void updateDeadServerOpenRegions(int deadRegions) {
|
||||||
|
deadServerOpenRegions.set(deadRegions);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void updateUnknownServerOpenRegions(int unknownRegions) {
|
||||||
|
unknownServerOpenRegions.set(unknownRegions);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public OperationMetrics getAssignMetrics() {
|
public OperationMetrics getAssignMetrics() {
|
||||||
return assignMetrics;
|
return assignMetrics;
|
||||||
|
|
|
@ -96,6 +96,14 @@ public class MetricsAssignmentManager {
|
||||||
assignmentManagerSource.incrementOperationCounter();
|
assignmentManagerSource.incrementOperationCounter();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void updateDeadServerOpenRegions(int deadRegions) {
|
||||||
|
assignmentManagerSource.updateDeadServerOpenRegions(deadRegions);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void updateUnknownServerOpenRegions(int unknownRegions) {
|
||||||
|
assignmentManagerSource.updateUnknownServerOpenRegions(unknownRegions);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return Set of common metrics for assign procedure
|
* @return Set of common metrics for assign procedure
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -620,8 +620,9 @@ public class ServerManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Note: this is currently invoked from RPC, not just tests. Locking in this class needs cleanup.
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
public void moveFromOnlineToDeadServers(final ServerName sn) {
|
public synchronized void moveFromOnlineToDeadServers(final ServerName sn) {
|
||||||
synchronized (onlineServers) {
|
synchronized (onlineServers) {
|
||||||
if (!this.onlineServers.containsKey(sn)) {
|
if (!this.onlineServers.containsKey(sn)) {
|
||||||
LOG.trace("Expiration of {} but server not online", sn);
|
LOG.trace("Expiration of {} but server not online", sn);
|
||||||
|
@ -907,6 +908,20 @@ public class ServerManager {
|
||||||
return serverName != null && onlineServers.containsKey(serverName);
|
return serverName != null && onlineServers.containsKey(serverName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public enum ServerLiveState {
|
||||||
|
LIVE,
|
||||||
|
DEAD,
|
||||||
|
UNKNOWN
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return whether the server is online, dead, or unknown.
|
||||||
|
*/
|
||||||
|
public synchronized ServerLiveState isServerKnownAndOnline(ServerName serverName) {
|
||||||
|
return onlineServers.containsKey(serverName) ? ServerLiveState.LIVE
|
||||||
|
: (deadservers.isDeadServer(serverName) ? ServerLiveState.DEAD : ServerLiveState.UNKNOWN);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if a server is known to be dead. A server can be online,
|
* Check if a server is known to be dead. A server can be online,
|
||||||
* or known to be dead, or unknown to this manager (i.e, not online,
|
* or known to be dead, or unknown to this manager (i.e, not online,
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -53,6 +54,7 @@ import org.apache.hadoop.hbase.master.MetricsAssignmentManager;
|
||||||
import org.apache.hadoop.hbase.master.RegionPlan;
|
import org.apache.hadoop.hbase.master.RegionPlan;
|
||||||
import org.apache.hadoop.hbase.master.RegionState;
|
import org.apache.hadoop.hbase.master.RegionState;
|
||||||
import org.apache.hadoop.hbase.master.RegionState.State;
|
import org.apache.hadoop.hbase.master.RegionState.State;
|
||||||
|
import org.apache.hadoop.hbase.master.ServerManager;
|
||||||
import org.apache.hadoop.hbase.master.TableStateManager;
|
import org.apache.hadoop.hbase.master.TableStateManager;
|
||||||
import org.apache.hadoop.hbase.master.balancer.FavoredStochasticBalancer;
|
import org.apache.hadoop.hbase.master.balancer.FavoredStochasticBalancer;
|
||||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||||
|
@ -127,6 +129,10 @@ public class AssignmentManager {
|
||||||
"hbase.assignment.rit.chore.interval.msec";
|
"hbase.assignment.rit.chore.interval.msec";
|
||||||
private static final int DEFAULT_RIT_CHORE_INTERVAL_MSEC = 60 * 1000;
|
private static final int DEFAULT_RIT_CHORE_INTERVAL_MSEC = 60 * 1000;
|
||||||
|
|
||||||
|
public static final String DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC_CONF_KEY =
|
||||||
|
"hbase.assignment.dead.region.metric.chore.interval.msec";
|
||||||
|
private static final int DEFAULT_DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC = 120 * 1000;
|
||||||
|
|
||||||
public static final String ASSIGN_MAX_ATTEMPTS =
|
public static final String ASSIGN_MAX_ATTEMPTS =
|
||||||
"hbase.assignment.maximum.attempts";
|
"hbase.assignment.maximum.attempts";
|
||||||
private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE;
|
private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE;
|
||||||
|
@ -145,6 +151,7 @@ public class AssignmentManager {
|
||||||
|
|
||||||
private final MetricsAssignmentManager metrics;
|
private final MetricsAssignmentManager metrics;
|
||||||
private final RegionInTransitionChore ritChore;
|
private final RegionInTransitionChore ritChore;
|
||||||
|
private final DeadServerMetricRegionChore deadMetricChore;
|
||||||
private final MasterServices master;
|
private final MasterServices master;
|
||||||
|
|
||||||
private final AtomicBoolean running = new AtomicBoolean(false);
|
private final AtomicBoolean running = new AtomicBoolean(false);
|
||||||
|
@ -190,6 +197,14 @@ public class AssignmentManager {
|
||||||
int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY,
|
int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY,
|
||||||
DEFAULT_RIT_CHORE_INTERVAL_MSEC);
|
DEFAULT_RIT_CHORE_INTERVAL_MSEC);
|
||||||
this.ritChore = new RegionInTransitionChore(ritChoreInterval);
|
this.ritChore = new RegionInTransitionChore(ritChoreInterval);
|
||||||
|
|
||||||
|
int deadRegionChoreInterval = conf.getInt(DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC_CONF_KEY,
|
||||||
|
DEFAULT_DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC);
|
||||||
|
if (deadRegionChoreInterval > 0) {
|
||||||
|
this.deadMetricChore = new DeadServerMetricRegionChore(deadRegionChoreInterval);
|
||||||
|
} else {
|
||||||
|
this.deadMetricChore = null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void start() throws IOException, KeeperException {
|
public void start() throws IOException, KeeperException {
|
||||||
|
@ -271,6 +286,9 @@ public class AssignmentManager {
|
||||||
// Remove the RIT chore
|
// Remove the RIT chore
|
||||||
if (hasProcExecutor) {
|
if (hasProcExecutor) {
|
||||||
master.getMasterProcedureExecutor().removeChore(this.ritChore);
|
master.getMasterProcedureExecutor().removeChore(this.ritChore);
|
||||||
|
if (this.deadMetricChore != null) {
|
||||||
|
master.getMasterProcedureExecutor().removeChore(this.deadMetricChore);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stop the Assignment Thread
|
// Stop the Assignment Thread
|
||||||
|
@ -1130,6 +1148,69 @@ public class AssignmentManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static class DeadServerMetricRegionChore
|
||||||
|
extends ProcedureInMemoryChore<MasterProcedureEnv> {
|
||||||
|
public DeadServerMetricRegionChore(final int timeoutMsec) {
|
||||||
|
super(timeoutMsec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void periodicExecute(final MasterProcedureEnv env) {
|
||||||
|
final ServerManager sm = env.getMasterServices().getServerManager();
|
||||||
|
final AssignmentManager am = env.getAssignmentManager();
|
||||||
|
// To minimize inconsistencies we are not going to snapshot live servers in advance in case
|
||||||
|
// new servers are added; OTOH we don't want to add heavy sync for a consistent view since
|
||||||
|
// this is for metrics. Instead, we're going to check each region as we go; to avoid making
|
||||||
|
// too many checks, we maintain a local list of live servers, limiting us to false negatives. If
|
||||||
|
// we miss some recently-dead server, we'll just see it next time.
|
||||||
|
Set<ServerName> recentlyLiveServers = new HashSet<>();
|
||||||
|
int deadRegions = 0, unknownRegions = 0;
|
||||||
|
for (RegionStateNode rsn : am.getRegionStates().getRegionStateNodes()) {
|
||||||
|
if (rsn.getState() != State.OPEN) {
|
||||||
|
continue; // Opportunistic check, should quickly skip RITs, offline tables, etc.
|
||||||
|
}
|
||||||
|
ServerName sn;
|
||||||
|
State state;
|
||||||
|
rsn.lock();
|
||||||
|
try {
|
||||||
|
sn = rsn.getRegionLocation();
|
||||||
|
state = rsn.getState();
|
||||||
|
} finally {
|
||||||
|
rsn.unlock();
|
||||||
|
}
|
||||||
|
if (state != State.OPEN) {
|
||||||
|
continue; // Mostly skipping RITs that are already being taken care of.
|
||||||
|
}
|
||||||
|
if (sn == null) {
|
||||||
|
++unknownRegions; // Opened on null?
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (recentlyLiveServers.contains(sn)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
ServerManager.ServerLiveState sls = sm.isServerKnownAndOnline(sn);
|
||||||
|
switch (sls) {
|
||||||
|
case LIVE:
|
||||||
|
recentlyLiveServers.add(sn);
|
||||||
|
break;
|
||||||
|
case DEAD:
|
||||||
|
++deadRegions;
|
||||||
|
break;
|
||||||
|
case UNKNOWN:
|
||||||
|
++unknownRegions;
|
||||||
|
break;
|
||||||
|
default: throw new AssertionError("Unexpected " + sls);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (deadRegions > 0 || unknownRegions > 0) {
|
||||||
|
LOG.info("Found {} OPEN regions on dead servers and {} OPEN regions on unknown servers",
|
||||||
|
deadRegions, unknownRegions);
|
||||||
|
}
|
||||||
|
|
||||||
|
am.updateDeadServerRegionMetrics(deadRegions, unknownRegions);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public RegionInTransitionStat computeRegionInTransitionStat() {
|
public RegionInTransitionStat computeRegionInTransitionStat() {
|
||||||
final RegionInTransitionStat rit = new RegionInTransitionStat(getConfiguration());
|
final RegionInTransitionStat rit = new RegionInTransitionStat(getConfiguration());
|
||||||
rit.update(this);
|
rit.update(this);
|
||||||
|
@ -1236,6 +1317,11 @@ public class AssignmentManager {
|
||||||
metrics.updateRITCountOverThreshold(ritStat.getTotalRITsOverThreshold());
|
metrics.updateRITCountOverThreshold(ritStat.getTotalRITsOverThreshold());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void updateDeadServerRegionMetrics(int deadRegions, int unknownRegions) {
|
||||||
|
metrics.updateDeadServerOpenRegions(deadRegions);
|
||||||
|
metrics.updateUnknownServerOpenRegions(unknownRegions);
|
||||||
|
}
|
||||||
|
|
||||||
private void handleRegionOverStuckWarningThreshold(final RegionInfo regionInfo) {
|
private void handleRegionOverStuckWarningThreshold(final RegionInfo regionInfo) {
|
||||||
final RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo);
|
final RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo);
|
||||||
//if (regionNode.isStuck()) {
|
//if (regionNode.isStuck()) {
|
||||||
|
@ -1261,8 +1347,9 @@ public class AssignmentManager {
|
||||||
}
|
}
|
||||||
LOG.info("Number of RegionServers={}", master.getServerManager().countOfRegionServers());
|
LOG.info("Number of RegionServers={}", master.getServerManager().countOfRegionServers());
|
||||||
|
|
||||||
// Start the RIT chore
|
// Start the chores
|
||||||
master.getMasterProcedureExecutor().addChore(this.ritChore);
|
master.getMasterProcedureExecutor().addChore(this.ritChore);
|
||||||
|
master.getMasterProcedureExecutor().addChore(this.deadMetricChore);
|
||||||
|
|
||||||
long costMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime);
|
long costMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime);
|
||||||
LOG.info("Joined the cluster in {}", StringUtils.humanTimeDiff(costMs));
|
LOG.info("Joined the cluster in {}", StringUtils.humanTimeDiff(costMs));
|
||||||
|
|
|
@ -178,12 +178,14 @@ public class RegionStates {
|
||||||
return regions;
|
return regions;
|
||||||
}
|
}
|
||||||
|
|
||||||
Collection<RegionStateNode> getRegionStateNodes() {
|
/** @return A view of region state nodes for all the regions. */
|
||||||
return regionsMap.values();
|
public Collection<RegionStateNode> getRegionStateNodes() {
|
||||||
|
return Collections.unmodifiableCollection(regionsMap.values());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** @return A snapshot of region states for all the regions. */
|
||||||
public ArrayList<RegionState> getRegionStates() {
|
public ArrayList<RegionState> getRegionStates() {
|
||||||
final ArrayList<RegionState> regions = new ArrayList<RegionState>(regionsMap.size());
|
final ArrayList<RegionState> regions = new ArrayList<>(regionsMap.size());
|
||||||
for (RegionStateNode node: regionsMap.values()) {
|
for (RegionStateNode node: regionsMap.values()) {
|
||||||
regions.add(node.toRegionState());
|
regions.add(node.toRegionState());
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue