HBASE-4348 Add metrics for regions in transition (Himanshu Vashishtha)

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1310159 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jonathan Hsieh 2012-04-06 03:46:04 +00:00
parent be245101ac
commit 9b57215185
6 changed files with 152 additions and 18 deletions

View File

@ -21,6 +21,9 @@ limitations under the License.
org.apache.hadoop.hbase.HRegionInfo;
org.apache.hadoop.hbase.master.AssignmentManager;
org.apache.hadoop.hbase.master.AssignmentManager.RegionState;
org.apache.hadoop.conf.Configuration;
org.apache.hadoop.hbase.HBaseConfiguration;
org.apache.hadoop.hbase.HConstants;
java.util.Iterator;
java.util.Map;
</%import>
@ -30,6 +33,23 @@ int limit = 100;
</%args>
<%java>
Map<String, RegionState> rit = assignmentManager.getRegionsInTransition();
// process the map to find region in transition details
Configuration conf = HBaseConfiguration.create();
int ritThreshold = conf.getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000);
int numOfRITOverThreshold = 0;
long maxRITTime = Long.MIN_VALUE;
long currentTime = System.currentTimeMillis();
String regionIDForOldestRIT = ""; // avoiding null
for (Map.Entry<String, RegionState> e : rit.entrySet()) {
long ritTime = currentTime - e.getValue().getStamp();
if(ritTime > ritThreshold) {
numOfRITOverThreshold++;
}
if(maxRITTime < ritTime) {
maxRITTime = ritTime;
regionIDForOldestRIT = e.getKey();
}
}
int toRemove = rit.size() - limit;
int removed = 0;
@ -42,8 +62,8 @@ if (toRemove > 0) {
if (HRegionInfo.FIRST_META_REGIONINFO.getEncodedName().equals(
e.getKey()) ||
HRegionInfo.ROOT_REGIONINFO.getEncodedName().equals(
e.getKey())) {
// don't remove the meta regions, they're too interesting!
e.getKey()) || regionIDForOldestRIT.equals(e.getKey())) {
// don't remove the meta & the oldest rit regions, they're too interesting!
continue;
}
it.remove();
@ -58,13 +78,23 @@ if (toRemove > 0) {
<%if rit.isEmpty() %>
No regions in transition.
<%else>
<table>
<tr><th>Region</th><th>State</th></tr>
<%for Map.Entry<String, RegionState> entry : rit.entrySet() %>
<tr><td><% entry.getKey() %></td><td><% entry.getValue().toDescriptiveString() %></td>
</%for>
</table>
<%if removed > 0 %>
(<% removed %> more regions in transition not shown)
</%if>
</%if>
<table>
<tr><th>Region</th><th>State</th><th>RIT time (ms)</th></tr>
<%for Map.Entry<String, RegionState> entry : rit.entrySet() %>
<%if regionIDForOldestRIT.equals(entry.getKey()) %>
<tr BGCOLOR="#FE2E2E" >
<%else>
<tr>
</%if>
<td><% entry.getKey() %></td><td><% entry.getValue().toDescriptiveString() %></td>
<td><% (currentTime - entry.getValue().getStamp()) %> </td></tr>
</%for>
<tr BGCOLOR="#D7DF01"> <td>Total number of Regions in Transition for more than <% ritThreshold %> milliseconds</td><td> <% numOfRITOverThreshold %></td><td></td>
</tr>
<tr> <td> Total number of Regions in Transition</td><td><% rit.size() %> </td><td></td>
</table>
<%if removed > 0 %>
(<% removed %> more regions in transition not shown)
</%if>
</%if>

View File

@ -651,6 +651,9 @@ public final class HConstants {
public static final String ENABLE_WAL_COMPRESSION =
"hbase.regionserver.wal.enablecompression";
/** Region in Transition metrics threshold time */
public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD="hbase.metrics.rit.stuck.warning.threshold";
private HConstants() {
// Can't be instantiated with this ctor.
}

View File

@ -71,6 +71,7 @@ import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
import org.apache.hadoop.hbase.master.handler.OpenedRegionHandler;
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
import org.apache.hadoop.hbase.master.handler.SplitRegionHandler;
import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
import org.apache.hadoop.hbase.regionserver.RegionAlreadyInTransitionException;
import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
@ -184,6 +185,9 @@ public class AssignmentManager extends ZooKeeperListener {
private Map<String, HRegionInfo> failoverProcessedRegions =
new HashMap<String, HRegionInfo>();
// metrics instance to send metrics for RITs
MasterMetrics masterMetrics;
/**
* Constructs a new assignment manager.
*
@ -195,7 +199,7 @@ public class AssignmentManager extends ZooKeeperListener {
* @throws IOException
*/
public AssignmentManager(Server master, ServerManager serverManager,
CatalogTracker catalogTracker, final ExecutorService service)
CatalogTracker catalogTracker, final ExecutorService service, MasterMetrics metrics)
throws KeeperException, IOException {
super(master.getZooKeeper());
this.master = master;
@ -216,6 +220,7 @@ public class AssignmentManager extends ZooKeeperListener {
this.master.getConfiguration().getInt("hbase.assignment.maximum.attempts", 10);
this.balancer = LoadBalancerFactory.getLoadBalancer(conf);
this.threadPoolExecutorService = Executors.newCachedThreadPool();
this.masterMetrics = metrics;// can be null only with tests.
}
/**
@ -2706,6 +2711,38 @@ public class AssignmentManager extends ZooKeeperListener {
}
}
/**
* Set Regions in transitions metrics.
* This takes an iterator on the RegionInTransition map (CLSM), and is not synchronized.
* This iterator is not fail fast, wich may lead to stale read; but that's better than
* creating a copy of the map for metrics computation, as this method will be invoked
* on a frequent interval.
*/
public void updateRegionsInTransitionMetrics() {
long currentTime = System.currentTimeMillis();
int totalRITs = 0;
int totalRITsOverThreshold = 0;
long oldestRITTime = 0;
int ritThreshold = this.master.getConfiguration().
getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000);
for (Map.Entry<String, RegionState> e : this.regionsInTransition.
entrySet()) {
totalRITs++;
long ritTime = currentTime - e.getValue().getStamp();
if (ritTime > ritThreshold) { // more than the threshold
totalRITsOverThreshold++;
}
if (oldestRITTime < ritTime) {
oldestRITTime = ritTime;
}
}
if (this.masterMetrics != null) {
this.masterMetrics.updateRITOldestAge(oldestRITTime);
this.masterMetrics.updateRITCount(totalRITs);
this.masterMetrics.updateRITCountOverThreshold(totalRITsOverThreshold);
}
}
/**
* @return True if regions in transition.
*/

View File

@ -221,6 +221,8 @@ Server {
private long masterStartTime;
private long masterActiveTime;
/** time interval for emitting metrics values */
private final int msgInterval;
/**
* MX Bean for MasterInfo
*/
@ -290,6 +292,8 @@ Server {
this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":" + isa.getPort(), this, true);
this.rpcServer.startThreads();
this.metrics = new MasterMetrics(getServerName().toString());
// metrics interval: using the same property as region server.
this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000);
}
/**
@ -412,7 +416,7 @@ Server {
this.catalogTracker.start();
this.assignmentManager = new AssignmentManager(this, serverManager,
this.catalogTracker, this.executorService);
this.catalogTracker, this.executorService, this.metrics);
this.balancer = LoadBalancerFactory.getLoadBalancer(conf);
zooKeeper.registerListenerFirst(assignmentManager);
@ -456,13 +460,33 @@ Server {
// Check if we should stop every 100ms
private Sleeper stopSleeper = new Sleeper(100, this);
private void loop() {
long lastMsgTs = 0l;
long now = 0l;
while (!this.stopped) {
now = System.currentTimeMillis();
if ((now - lastMsgTs) >= this.msgInterval) {
doMetrics();
lastMsgTs = System.currentTimeMillis();
}
stopSleeper.sleep();
}
}
/**
* Emit the HMaster metrics, such as region in transition metrics.
* Surrounding in a try block just to be sure metrics doesn't abort HMaster.
*/
private void doMetrics() {
try {
this.assignmentManager.updateRegionsInTransitionMetrics();
} catch (Throwable e) {
LOG.error("Couldn't update metrics: " + e.getMessage());
}
}
/**
* Finish initialization of HMaster after becoming the primary master.
*
* <ol>

View File

@ -31,6 +31,7 @@ import org.apache.hadoop.metrics.MetricsRecord;
import org.apache.hadoop.metrics.MetricsUtil;
import org.apache.hadoop.metrics.Updater;
import org.apache.hadoop.metrics.jvm.JvmMetrics;
import org.apache.hadoop.metrics.util.MetricsIntValue;
import org.apache.hadoop.metrics.util.MetricsLongValue;
import org.apache.hadoop.metrics.util.MetricsRegistry;
@ -66,6 +67,18 @@ public class MasterMetrics implements Updater {
final PersistentMetricsTimeVaryingRate splitSize =
new PersistentMetricsTimeVaryingRate("splitSize", registry);
/**
* Regions in Transition metrics such as number of RIT regions, oldest
* RIT time and number of such regions that are in transition
* for more than a specified threshold.
*/
public final MetricsIntValue ritCount =
new MetricsIntValue("ritCount", registry);
public final MetricsIntValue ritCountOverThreshold =
new MetricsIntValue("ritCountOverThreshold", registry);
public final MetricsLongValue ritOldestAge =
new MetricsLongValue("ritOldestAge", registry);
public MasterMetrics(final String name) {
MetricsContext context = MetricsUtil.getContext("hbase");
metricsRecord = MetricsUtil.createRecord(context, "master");
@ -117,6 +130,9 @@ public class MasterMetrics implements Updater {
this.cluster_requests.pushMetric(metricsRecord);
this.splitTime.pushMetric(metricsRecord);
this.splitSize.pushMetric(metricsRecord);
this.ritCount.pushMetric(metricsRecord);
this.ritCountOverThreshold.pushMetric(metricsRecord);
this.ritOldestAge.pushMetric(metricsRecord);
}
this.metricsRecord.update();
}
@ -148,4 +164,28 @@ public class MasterMetrics implements Updater {
public void incrementRequests(final int inc) {
this.cluster_requests.inc(inc);
}
/**
* set new value for number of regions in transition.
* @param ritCount
*/
public void updateRITCount(int ritCount) {
this.ritCount.set(ritCount);
}
/**
* update RIT count that are in this state for more than the threshold
* as defined by the property rit.metrics.threshold.time.
* @param ritCountOverThreshold
*/
public void updateRITCountOverThreshold(int ritCountOverThreshold) {
this.ritCountOverThreshold.set(ritCountOverThreshold);
}
/**
* update the timestamp for oldest region in transition metrics.
* @param timestamp
*/
public void updateRITOldestAge(long timestamp) {
this.ritOldestAge.set(timestamp);
}
}

View File

@ -307,7 +307,7 @@ public class TestAssignmentManager {
CatalogTracker ct = Mockito.mock(CatalogTracker.class);
// Create an AM.
AssignmentManager am =
new AssignmentManager(this.server, this.serverManager, ct, executor);
new AssignmentManager(this.server, this.serverManager, ct, executor, null);
try {
// Make sure our new AM gets callbacks; once registered, can't unregister.
// Thats ok because we make a new zk watcher for each test.
@ -372,7 +372,7 @@ public class TestAssignmentManager {
CatalogTracker ct = Mockito.mock(CatalogTracker.class);
// Create an AM.
AssignmentManager am =
new AssignmentManager(this.server, this.serverManager, ct, executor);
new AssignmentManager(this.server, this.serverManager, ct, executor, null);
try {
// Make sure our new AM gets callbacks; once registered, can't unregister.
// Thats ok because we make a new zk watcher for each test.
@ -446,7 +446,7 @@ public class TestAssignmentManager {
CatalogTracker ct = Mockito.mock(CatalogTracker.class);
// Create an AM.
AssignmentManager am =
new AssignmentManager(this.server, this.serverManager, ct, null);
new AssignmentManager(this.server, this.serverManager, ct, null, null);
try {
// First make sure my mock up basically works. Unassign a region.
unassign(am, SERVERNAME_A, hri);
@ -578,7 +578,7 @@ public class TestAssignmentManager {
final ServerManager serverManager,
final CatalogTracker catalogTracker, final ExecutorService service)
throws KeeperException, IOException {
super(master, serverManager, catalogTracker, service);
super(master, serverManager, catalogTracker, service, null);
this.es = service;
this.ct = catalogTracker;
}