HBASE-4348 Add metrics for regions in transition (Himanshu Vashishtha)
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1310159 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
be245101ac
commit
9b57215185
|
@ -21,6 +21,9 @@ limitations under the License.
|
|||
org.apache.hadoop.hbase.HRegionInfo;
|
||||
org.apache.hadoop.hbase.master.AssignmentManager;
|
||||
org.apache.hadoop.hbase.master.AssignmentManager.RegionState;
|
||||
org.apache.hadoop.conf.Configuration;
|
||||
org.apache.hadoop.hbase.HBaseConfiguration;
|
||||
org.apache.hadoop.hbase.HConstants;
|
||||
java.util.Iterator;
|
||||
java.util.Map;
|
||||
</%import>
|
||||
|
@ -30,6 +33,23 @@ int limit = 100;
|
|||
</%args>
|
||||
<%java>
|
||||
Map<String, RegionState> rit = assignmentManager.getRegionsInTransition();
|
||||
// process the map to find region in transition details
|
||||
Configuration conf = HBaseConfiguration.create();
|
||||
int ritThreshold = conf.getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000);
|
||||
int numOfRITOverThreshold = 0;
|
||||
long maxRITTime = Long.MIN_VALUE;
|
||||
long currentTime = System.currentTimeMillis();
|
||||
String regionIDForOldestRIT = ""; // avoiding null
|
||||
for (Map.Entry<String, RegionState> e : rit.entrySet()) {
|
||||
long ritTime = currentTime - e.getValue().getStamp();
|
||||
if(ritTime > ritThreshold) {
|
||||
numOfRITOverThreshold++;
|
||||
}
|
||||
if(maxRITTime < ritTime) {
|
||||
maxRITTime = ritTime;
|
||||
regionIDForOldestRIT = e.getKey();
|
||||
}
|
||||
}
|
||||
|
||||
int toRemove = rit.size() - limit;
|
||||
int removed = 0;
|
||||
|
@ -42,8 +62,8 @@ if (toRemove > 0) {
|
|||
if (HRegionInfo.FIRST_META_REGIONINFO.getEncodedName().equals(
|
||||
e.getKey()) ||
|
||||
HRegionInfo.ROOT_REGIONINFO.getEncodedName().equals(
|
||||
e.getKey())) {
|
||||
// don't remove the meta regions, they're too interesting!
|
||||
e.getKey()) || regionIDForOldestRIT.equals(e.getKey())) {
|
||||
// don't remove the meta & the oldest rit regions, they're too interesting!
|
||||
continue;
|
||||
}
|
||||
it.remove();
|
||||
|
@ -58,13 +78,23 @@ if (toRemove > 0) {
|
|||
<%if rit.isEmpty() %>
|
||||
No regions in transition.
|
||||
<%else>
|
||||
<table>
|
||||
<tr><th>Region</th><th>State</th></tr>
|
||||
<%for Map.Entry<String, RegionState> entry : rit.entrySet() %>
|
||||
<tr><td><% entry.getKey() %></td><td><% entry.getValue().toDescriptiveString() %></td>
|
||||
</%for>
|
||||
</table>
|
||||
<%if removed > 0 %>
|
||||
(<% removed %> more regions in transition not shown)
|
||||
</%if>
|
||||
</%if>
|
||||
<table>
|
||||
<tr><th>Region</th><th>State</th><th>RIT time (ms)</th></tr>
|
||||
<%for Map.Entry<String, RegionState> entry : rit.entrySet() %>
|
||||
<%if regionIDForOldestRIT.equals(entry.getKey()) %>
|
||||
<tr BGCOLOR="#FE2E2E" >
|
||||
<%else>
|
||||
<tr>
|
||||
</%if>
|
||||
<td><% entry.getKey() %></td><td><% entry.getValue().toDescriptiveString() %></td>
|
||||
<td><% (currentTime - entry.getValue().getStamp()) %> </td></tr>
|
||||
</%for>
|
||||
<tr BGCOLOR="#D7DF01"> <td>Total number of Regions in Transition for more than <% ritThreshold %> milliseconds</td><td> <% numOfRITOverThreshold %></td><td></td>
|
||||
</tr>
|
||||
<tr> <td> Total number of Regions in Transition</td><td><% rit.size() %> </td><td></td>
|
||||
</table>
|
||||
<%if removed > 0 %>
|
||||
(<% removed %> more regions in transition not shown)
|
||||
</%if>
|
||||
</%if>
|
||||
|
||||
|
|
|
@ -651,6 +651,9 @@ public final class HConstants {
|
|||
public static final String ENABLE_WAL_COMPRESSION =
|
||||
"hbase.regionserver.wal.enablecompression";
|
||||
|
||||
/** Region in Transition metrics threshold time */
|
||||
public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD="hbase.metrics.rit.stuck.warning.threshold";
|
||||
|
||||
private HConstants() {
|
||||
// Can't be instantiated with this ctor.
|
||||
}
|
||||
|
|
|
@ -71,6 +71,7 @@ import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
|
|||
import org.apache.hadoop.hbase.master.handler.OpenedRegionHandler;
|
||||
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
|
||||
import org.apache.hadoop.hbase.master.handler.SplitRegionHandler;
|
||||
import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
|
||||
import org.apache.hadoop.hbase.regionserver.RegionAlreadyInTransitionException;
|
||||
import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
|
||||
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
|
||||
|
@ -184,6 +185,9 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
private Map<String, HRegionInfo> failoverProcessedRegions =
|
||||
new HashMap<String, HRegionInfo>();
|
||||
|
||||
// metrics instance to send metrics for RITs
|
||||
MasterMetrics masterMetrics;
|
||||
|
||||
/**
|
||||
* Constructs a new assignment manager.
|
||||
*
|
||||
|
@ -195,7 +199,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
* @throws IOException
|
||||
*/
|
||||
public AssignmentManager(Server master, ServerManager serverManager,
|
||||
CatalogTracker catalogTracker, final ExecutorService service)
|
||||
CatalogTracker catalogTracker, final ExecutorService service, MasterMetrics metrics)
|
||||
throws KeeperException, IOException {
|
||||
super(master.getZooKeeper());
|
||||
this.master = master;
|
||||
|
@ -216,6 +220,7 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
this.master.getConfiguration().getInt("hbase.assignment.maximum.attempts", 10);
|
||||
this.balancer = LoadBalancerFactory.getLoadBalancer(conf);
|
||||
this.threadPoolExecutorService = Executors.newCachedThreadPool();
|
||||
this.masterMetrics = metrics;// can be null only with tests.
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -2706,6 +2711,38 @@ public class AssignmentManager extends ZooKeeperListener {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set Regions in transitions metrics.
|
||||
* This takes an iterator on the RegionInTransition map (CLSM), and is not synchronized.
|
||||
* This iterator is not fail fast, which may lead to stale reads; but that's better than
|
||||
* creating a copy of the map for metrics computation, as this method will be invoked
|
||||
* on a frequent interval.
|
||||
*/
|
||||
public void updateRegionsInTransitionMetrics() {
|
||||
long currentTime = System.currentTimeMillis();
|
||||
int totalRITs = 0;
|
||||
int totalRITsOverThreshold = 0;
|
||||
long oldestRITTime = 0;
|
||||
int ritThreshold = this.master.getConfiguration().
|
||||
getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000);
|
||||
for (Map.Entry<String, RegionState> e : this.regionsInTransition.
|
||||
entrySet()) {
|
||||
totalRITs++;
|
||||
long ritTime = currentTime - e.getValue().getStamp();
|
||||
if (ritTime > ritThreshold) { // more than the threshold
|
||||
totalRITsOverThreshold++;
|
||||
}
|
||||
if (oldestRITTime < ritTime) {
|
||||
oldestRITTime = ritTime;
|
||||
}
|
||||
}
|
||||
if (this.masterMetrics != null) {
|
||||
this.masterMetrics.updateRITOldestAge(oldestRITTime);
|
||||
this.masterMetrics.updateRITCount(totalRITs);
|
||||
this.masterMetrics.updateRITCountOverThreshold(totalRITsOverThreshold);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return True if regions in transition.
|
||||
*/
|
||||
|
|
|
@ -221,6 +221,8 @@ Server {
|
|||
private long masterStartTime;
|
||||
private long masterActiveTime;
|
||||
|
||||
/** time interval for emitting metrics values */
|
||||
private final int msgInterval;
|
||||
/**
|
||||
* MX Bean for MasterInfo
|
||||
*/
|
||||
|
@ -290,6 +292,8 @@ Server {
|
|||
this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":" + isa.getPort(), this, true);
|
||||
this.rpcServer.startThreads();
|
||||
this.metrics = new MasterMetrics(getServerName().toString());
|
||||
// metrics interval: using the same property as region server.
|
||||
this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -412,7 +416,7 @@ Server {
|
|||
this.catalogTracker.start();
|
||||
|
||||
this.assignmentManager = new AssignmentManager(this, serverManager,
|
||||
this.catalogTracker, this.executorService);
|
||||
this.catalogTracker, this.executorService, this.metrics);
|
||||
this.balancer = LoadBalancerFactory.getLoadBalancer(conf);
|
||||
zooKeeper.registerListenerFirst(assignmentManager);
|
||||
|
||||
|
@ -456,13 +460,33 @@ Server {
|
|||
|
||||
// Check if we should stop every 100ms
|
||||
private Sleeper stopSleeper = new Sleeper(100, this);
|
||||
|
||||
private void loop() {
|
||||
long lastMsgTs = 0l;
|
||||
long now = 0l;
|
||||
while (!this.stopped) {
|
||||
now = System.currentTimeMillis();
|
||||
if ((now - lastMsgTs) >= this.msgInterval) {
|
||||
doMetrics();
|
||||
lastMsgTs = System.currentTimeMillis();
|
||||
}
|
||||
stopSleeper.sleep();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Emit the HMaster metrics, such as region in transition metrics.
|
||||
* Surrounding in a try block just to be sure metrics doesn't abort HMaster.
|
||||
*/
|
||||
private void doMetrics() {
|
||||
try {
|
||||
this.assignmentManager.updateRegionsInTransitionMetrics();
|
||||
} catch (Throwable e) {
|
||||
LOG.error("Couldn't update metrics: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finish initialization of HMaster after becoming the primary master.
|
||||
*
|
||||
* <ol>
|
||||
|
|
|
@ -31,6 +31,7 @@ import org.apache.hadoop.metrics.MetricsRecord;
|
|||
import org.apache.hadoop.metrics.MetricsUtil;
|
||||
import org.apache.hadoop.metrics.Updater;
|
||||
import org.apache.hadoop.metrics.jvm.JvmMetrics;
|
||||
import org.apache.hadoop.metrics.util.MetricsIntValue;
|
||||
import org.apache.hadoop.metrics.util.MetricsLongValue;
|
||||
import org.apache.hadoop.metrics.util.MetricsRegistry;
|
||||
|
||||
|
@ -66,6 +67,18 @@ public class MasterMetrics implements Updater {
|
|||
final PersistentMetricsTimeVaryingRate splitSize =
|
||||
new PersistentMetricsTimeVaryingRate("splitSize", registry);
|
||||
|
||||
/**
|
||||
* Regions in Transition metrics such as number of RIT regions, oldest
|
||||
* RIT time and number of such regions that are in transition
|
||||
* for more than a specified threshold.
|
||||
*/
|
||||
public final MetricsIntValue ritCount =
|
||||
new MetricsIntValue("ritCount", registry);
|
||||
public final MetricsIntValue ritCountOverThreshold =
|
||||
new MetricsIntValue("ritCountOverThreshold", registry);
|
||||
public final MetricsLongValue ritOldestAge =
|
||||
new MetricsLongValue("ritOldestAge", registry);
|
||||
|
||||
public MasterMetrics(final String name) {
|
||||
MetricsContext context = MetricsUtil.getContext("hbase");
|
||||
metricsRecord = MetricsUtil.createRecord(context, "master");
|
||||
|
@ -117,6 +130,9 @@ public class MasterMetrics implements Updater {
|
|||
this.cluster_requests.pushMetric(metricsRecord);
|
||||
this.splitTime.pushMetric(metricsRecord);
|
||||
this.splitSize.pushMetric(metricsRecord);
|
||||
this.ritCount.pushMetric(metricsRecord);
|
||||
this.ritCountOverThreshold.pushMetric(metricsRecord);
|
||||
this.ritOldestAge.pushMetric(metricsRecord);
|
||||
}
|
||||
this.metricsRecord.update();
|
||||
}
|
||||
|
@ -148,4 +164,28 @@ public class MasterMetrics implements Updater {
|
|||
public void incrementRequests(final int inc) {
|
||||
this.cluster_requests.inc(inc);
|
||||
}
|
||||
|
||||
/**
|
||||
* set new value for number of regions in transition.
|
||||
* @param ritCount
|
||||
*/
|
||||
public void updateRITCount(int ritCount) {
|
||||
this.ritCount.set(ritCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* update RIT count that are in this state for more than the threshold
|
||||
* as defined by the property hbase.metrics.rit.stuck.warning.threshold.
|
||||
* @param ritCountOverThreshold
|
||||
*/
|
||||
public void updateRITCountOverThreshold(int ritCountOverThreshold) {
|
||||
this.ritCountOverThreshold.set(ritCountOverThreshold);
|
||||
}
|
||||
/**
|
||||
* update the age, in milliseconds, of the oldest region in transition.
|
||||
* @param timestamp
|
||||
*/
|
||||
public void updateRITOldestAge(long timestamp) {
|
||||
this.ritOldestAge.set(timestamp);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -307,7 +307,7 @@ public class TestAssignmentManager {
|
|||
CatalogTracker ct = Mockito.mock(CatalogTracker.class);
|
||||
// Create an AM.
|
||||
AssignmentManager am =
|
||||
new AssignmentManager(this.server, this.serverManager, ct, executor);
|
||||
new AssignmentManager(this.server, this.serverManager, ct, executor, null);
|
||||
try {
|
||||
// Make sure our new AM gets callbacks; once registered, can't unregister.
|
||||
// That's ok because we make a new zk watcher for each test.
|
||||
|
@ -372,7 +372,7 @@ public class TestAssignmentManager {
|
|||
CatalogTracker ct = Mockito.mock(CatalogTracker.class);
|
||||
// Create an AM.
|
||||
AssignmentManager am =
|
||||
new AssignmentManager(this.server, this.serverManager, ct, executor);
|
||||
new AssignmentManager(this.server, this.serverManager, ct, executor, null);
|
||||
try {
|
||||
// Make sure our new AM gets callbacks; once registered, can't unregister.
|
||||
// That's ok because we make a new zk watcher for each test.
|
||||
|
@ -446,7 +446,7 @@ public class TestAssignmentManager {
|
|||
CatalogTracker ct = Mockito.mock(CatalogTracker.class);
|
||||
// Create an AM.
|
||||
AssignmentManager am =
|
||||
new AssignmentManager(this.server, this.serverManager, ct, null);
|
||||
new AssignmentManager(this.server, this.serverManager, ct, null, null);
|
||||
try {
|
||||
// First make sure my mock up basically works. Unassign a region.
|
||||
unassign(am, SERVERNAME_A, hri);
|
||||
|
@ -578,7 +578,7 @@ public class TestAssignmentManager {
|
|||
final ServerManager serverManager,
|
||||
final CatalogTracker catalogTracker, final ExecutorService service)
|
||||
throws KeeperException, IOException {
|
||||
super(master, serverManager, catalogTracker, service);
|
||||
super(master, serverManager, catalogTracker, service, null);
|
||||
this.es = service;
|
||||
this.ct = catalogTracker;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue