diff --git a/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon b/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon index 0dc0691a894..eef895d38b7 100644 --- a/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon +++ b/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon @@ -21,6 +21,9 @@ limitations under the License. org.apache.hadoop.hbase.HRegionInfo; org.apache.hadoop.hbase.master.AssignmentManager; org.apache.hadoop.hbase.master.AssignmentManager.RegionState; +org.apache.hadoop.conf.Configuration; +org.apache.hadoop.hbase.HBaseConfiguration; +org.apache.hadoop.hbase.HConstants; java.util.Iterator; java.util.Map; @@ -30,6 +33,23 @@ int limit = 100; <%java> Map rit = assignmentManager.getRegionsInTransition(); +// process the map to find region in transition details +Configuration conf = HBaseConfiguration.create(); +int ritThreshold = conf.getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000); +int numOfRITOverThreshold = 0; +long maxRITTime = Long.MIN_VALUE; +long currentTime = System.currentTimeMillis(); +String regionIDForOldestRIT = ""; // avoiding null +for (Map.Entry e : rit.entrySet()) { + long ritTime = currentTime - e.getValue().getStamp(); + if(ritTime > ritThreshold) { + numOfRITOverThreshold++; + } + if(maxRITTime < ritTime) { + maxRITTime = ritTime; + regionIDForOldestRIT = e.getKey(); + } +} int toRemove = rit.size() - limit; int removed = 0; @@ -42,8 +62,8 @@ if (toRemove > 0) { if (HRegionInfo.FIRST_META_REGIONINFO.getEncodedName().equals( e.getKey()) || HRegionInfo.ROOT_REGIONINFO.getEncodedName().equals( - e.getKey())) { - // don't remove the meta regions, they're too interesting! + e.getKey()) || regionIDForOldestRIT.equals(e.getKey())) { + // don't remove the meta & the oldest rit regions, they're too interesting! continue; } it.remove(); @@ -58,13 +78,23 @@ if (toRemove > 0) { <%if rit.isEmpty() %> No regions in transition. <%else> - - - <%for Map.Entry entry : rit.entrySet() %> - - -
RegionState
<% entry.getKey() %><% entry.getValue().toDescriptiveString() %>
- <%if removed > 0 %> - (<% removed %> more regions in transition not shown) - - \ No newline at end of file + + + <%for Map.Entry entry : rit.entrySet() %> + <%if regionIDForOldestRIT.equals(entry.getKey()) %> + + <%else> + + + + + + + + +
RegionStateRIT time (ms)
<% entry.getKey() %><% entry.getValue().toDescriptiveString() %><% (currentTime - entry.getValue().getStamp()) %>
Total number of Regions in Transition for more than <% ritThreshold %> milliseconds <% numOfRITOverThreshold %>
Total number of Regions in Transition<% rit.size() %>
+ <%if removed > 0 %> + (<% removed %> more regions in transition not shown) + + + diff --git a/src/main/java/org/apache/hadoop/hbase/HConstants.java b/src/main/java/org/apache/hadoop/hbase/HConstants.java index 21ac4bad171..a4b989eea56 100644 --- a/src/main/java/org/apache/hadoop/hbase/HConstants.java +++ b/src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -651,6 +651,9 @@ public final class HConstants { public static final String ENABLE_WAL_COMPRESSION = "hbase.regionserver.wal.enablecompression"; +/** Region in Transition metrics threshold time */ + public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD="hbase.metrics.rit.stuck.warning.threshold"; + private HConstants() { // Can't be instantiated with this ctor. } diff --git a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index c0c320446bc..ae7103b3860 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -71,6 +71,7 @@ import org.apache.hadoop.hbase.master.handler.EnableTableHandler; import org.apache.hadoop.hbase.master.handler.OpenedRegionHandler; import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler; import org.apache.hadoop.hbase.master.handler.SplitRegionHandler; +import org.apache.hadoop.hbase.master.metrics.MasterMetrics; import org.apache.hadoop.hbase.regionserver.RegionAlreadyInTransitionException; import org.apache.hadoop.hbase.regionserver.RegionOpeningState; import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException; @@ -184,6 +185,9 @@ public class AssignmentManager extends ZooKeeperListener { private Map failoverProcessedRegions = new HashMap(); + // metrics instance to send metrics for RITs + MasterMetrics masterMetrics; + /** * Constructs a new assignment manager. * @@ -195,7 +199,7 @@ public class AssignmentManager extends ZooKeeperListener { * @throws IOException */ public AssignmentManager(Server master, ServerManager serverManager, - CatalogTracker catalogTracker, final ExecutorService service) + CatalogTracker catalogTracker, final ExecutorService service, MasterMetrics metrics) throws KeeperException, IOException { super(master.getZooKeeper()); this.master = master; @@ -216,6 +220,7 @@ public class AssignmentManager extends ZooKeeperListener { this.master.getConfiguration().getInt("hbase.assignment.maximum.attempts", 10); this.balancer = LoadBalancerFactory.getLoadBalancer(conf); this.threadPoolExecutorService = Executors.newCachedThreadPool(); + this.masterMetrics = metrics;// can be null only with tests. } /** @@ -2706,6 +2711,38 @@ public class AssignmentManager extends ZooKeeperListener { } } + /** + * Set Regions in transitions metrics. + * This takes an iterator on the RegionInTransition map (CLSM), and is not synchronized. + * This iterator is not fail fast, wich may lead to stale read; but that's better than + * creating a copy of the map for metrics computation, as this method will be invoked + * on a frequent interval. + */ + public void updateRegionsInTransitionMetrics() { + long currentTime = System.currentTimeMillis(); + int totalRITs = 0; + int totalRITsOverThreshold = 0; + long oldestRITTime = 0; + int ritThreshold = this.master.getConfiguration(). + getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000); + for (Map.Entry e : this.regionsInTransition. + entrySet()) { + totalRITs++; + long ritTime = currentTime - e.getValue().getStamp(); + if (ritTime > ritThreshold) { // more than the threshold + totalRITsOverThreshold++; + } + if (oldestRITTime < ritTime) { + oldestRITTime = ritTime; + } + } + if (this.masterMetrics != null) { + this.masterMetrics.updateRITOldestAge(oldestRITTime); + this.masterMetrics.updateRITCount(totalRITs); + this.masterMetrics.updateRITCountOverThreshold(totalRITsOverThreshold); + } + } + /** * @return True if regions in transition. */ diff --git a/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index a3b4f6f19e6..4ac3e15cc97 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -221,6 +221,8 @@ Server { private long masterStartTime; private long masterActiveTime; + /** time interval for emitting metrics values */ + private final int msgInterval; /** * MX Bean for MasterInfo */ @@ -290,6 +292,8 @@ Server { this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":" + isa.getPort(), this, true); this.rpcServer.startThreads(); this.metrics = new MasterMetrics(getServerName().toString()); + // metrics interval: using the same property as region server. + this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000); } /** @@ -412,7 +416,7 @@ Server { this.catalogTracker.start(); this.assignmentManager = new AssignmentManager(this, serverManager, - this.catalogTracker, this.executorService); + this.catalogTracker, this.executorService, this.metrics); this.balancer = LoadBalancerFactory.getLoadBalancer(conf); zooKeeper.registerListenerFirst(assignmentManager); @@ -456,13 +460,33 @@ Server { // Check if we should stop every 100ms private Sleeper stopSleeper = new Sleeper(100, this); + private void loop() { + long lastMsgTs = 0l; + long now = 0l; while (!this.stopped) { + now = System.currentTimeMillis(); + if ((now - lastMsgTs) >= this.msgInterval) { + doMetrics(); + lastMsgTs = System.currentTimeMillis(); + } stopSleeper.sleep(); } } /** + * Emit the HMaster metrics, such as region in transition metrics. + * Surrounding in a try block just to be sure metrics doesn't abort HMaster. + */ + private void doMetrics() { + try { + this.assignmentManager.updateRegionsInTransitionMetrics(); + } catch (Throwable e) { + LOG.error("Couldn't update metrics: " + e.getMessage()); + } + } + +/** * Finish initialization of HMaster after becoming the primary master. * *
    diff --git a/src/main/java/org/apache/hadoop/hbase/master/metrics/MasterMetrics.java b/src/main/java/org/apache/hadoop/hbase/master/metrics/MasterMetrics.java index 83abc524eba..1ea3368dbac 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/metrics/MasterMetrics.java +++ b/src/main/java/org/apache/hadoop/hbase/master/metrics/MasterMetrics.java @@ -31,6 +31,7 @@ import org.apache.hadoop.metrics.MetricsRecord; import org.apache.hadoop.metrics.MetricsUtil; import org.apache.hadoop.metrics.Updater; import org.apache.hadoop.metrics.jvm.JvmMetrics; +import org.apache.hadoop.metrics.util.MetricsIntValue; import org.apache.hadoop.metrics.util.MetricsLongValue; import org.apache.hadoop.metrics.util.MetricsRegistry; @@ -66,6 +67,18 @@ public class MasterMetrics implements Updater { final PersistentMetricsTimeVaryingRate splitSize = new PersistentMetricsTimeVaryingRate("splitSize", registry); + /** + * Regions in Transition metrics such as number of RIT regions, oldest + * RIT time and number of such regions that are in transition + * for more than a specified threshold. + */ + public final MetricsIntValue ritCount = + new MetricsIntValue("ritCount", registry); + public final MetricsIntValue ritCountOverThreshold = + new MetricsIntValue("ritCountOverThreshold", registry); + public final MetricsLongValue ritOldestAge = + new MetricsLongValue("ritOldestAge", registry); + public MasterMetrics(final String name) { MetricsContext context = MetricsUtil.getContext("hbase"); metricsRecord = MetricsUtil.createRecord(context, "master"); @@ -117,6 +130,9 @@ public class MasterMetrics implements Updater { this.cluster_requests.pushMetric(metricsRecord); this.splitTime.pushMetric(metricsRecord); this.splitSize.pushMetric(metricsRecord); + this.ritCount.pushMetric(metricsRecord); + this.ritCountOverThreshold.pushMetric(metricsRecord); + this.ritOldestAge.pushMetric(metricsRecord); } this.metricsRecord.update(); } @@ -148,4 +164,28 @@ public class MasterMetrics implements Updater { public void incrementRequests(final int inc) { this.cluster_requests.inc(inc); } + + /** + * set new value for number of regions in transition. + * @param ritCount + */ + public void updateRITCount(int ritCount) { + this.ritCount.set(ritCount); + } + + /** + * update RIT count that are in this state for more than the threshold + * as defined by the property rit.metrics.threshold.time. + * @param ritCountOverThreshold + */ + public void updateRITCountOverThreshold(int ritCountOverThreshold) { + this.ritCountOverThreshold.set(ritCountOverThreshold); + } + /** + * update the timestamp for oldest region in transition metrics. + * @param timestamp + */ + public void updateRITOldestAge(long timestamp) { + this.ritOldestAge.set(timestamp); + } } diff --git a/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java b/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java index 91dce365c60..6ed4ba2c4fc 100644 --- a/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java +++ b/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java @@ -307,7 +307,7 @@ public class TestAssignmentManager { CatalogTracker ct = Mockito.mock(CatalogTracker.class); // Create an AM. AssignmentManager am = - new AssignmentManager(this.server, this.serverManager, ct, executor); + new AssignmentManager(this.server, this.serverManager, ct, executor, null); try { // Make sure our new AM gets callbacks; once registered, can't unregister. // Thats ok because we make a new zk watcher for each test. @@ -372,7 +372,7 @@ public class TestAssignmentManager { CatalogTracker ct = Mockito.mock(CatalogTracker.class); // Create an AM. AssignmentManager am = - new AssignmentManager(this.server, this.serverManager, ct, executor); + new AssignmentManager(this.server, this.serverManager, ct, executor, null); try { // Make sure our new AM gets callbacks; once registered, can't unregister. // Thats ok because we make a new zk watcher for each test. @@ -446,7 +446,7 @@ public class TestAssignmentManager { CatalogTracker ct = Mockito.mock(CatalogTracker.class); // Create an AM. AssignmentManager am = - new AssignmentManager(this.server, this.serverManager, ct, null); + new AssignmentManager(this.server, this.serverManager, ct, null, null); try { // First make sure my mock up basically works. Unassign a region. unassign(am, SERVERNAME_A, hri); @@ -578,7 +578,7 @@ public class TestAssignmentManager { final ServerManager serverManager, final CatalogTracker catalogTracker, final ExecutorService service) throws KeeperException, IOException { - super(master, serverManager, catalogTracker, service); + super(master, serverManager, catalogTracker, service, null); this.es = service; this.ct = catalogTracker; }