HBASE-4393 Implement a canary monitoring program
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1329574 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e031dc8d34
commit
1d6b501c9b
|
@ -0,0 +1,253 @@
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.hbase.tool;
|
||||||
|
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
|
import org.apache.hadoop.util.Tool;
|
||||||
|
import org.apache.hadoop.util.ToolRunner;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
|
||||||
|
import org.apache.hadoop.hbase.HRegionInfo;
|
||||||
|
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||||
|
import org.apache.hadoop.hbase.HColumnDescriptor;
|
||||||
|
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||||
|
import org.apache.hadoop.hbase.TableNotFoundException;
|
||||||
|
|
||||||
|
import org.apache.hadoop.hbase.client.Get;
|
||||||
|
import org.apache.hadoop.hbase.client.HTable;
|
||||||
|
import org.apache.hadoop.hbase.client.HBaseAdmin;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* HBase Canary Tool that can be used to do
|
||||||
|
* "canary monitoring" of a running HBase cluster.
|
||||||
|
*
|
||||||
|
* For each region, it tries to get one row per column family
|
||||||
|
* and outputs some information about failure or latency.
|
||||||
|
*/
|
||||||
|
public final class Canary implements Tool {
|
||||||
|
// Sink interface used by the canary to outputs information
|
||||||
|
public interface Sink {
|
||||||
|
public void publishReadFailure(HRegionInfo region);
|
||||||
|
public void publishReadFailure(HRegionInfo region, HColumnDescriptor column);
|
||||||
|
public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Simple implementation of canary sink that allows to plot on
|
||||||
|
// file or standard output timings or failures.
|
||||||
|
public static class StdOutSink implements Sink {
|
||||||
|
@Override
|
||||||
|
public void publishReadFailure(HRegionInfo region) {
|
||||||
|
LOG.error(String.format("read from region %s failed", region.getRegionNameAsString()));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void publishReadFailure(HRegionInfo region, HColumnDescriptor column) {
|
||||||
|
LOG.error(String.format("read from region %s column family %s failed",
|
||||||
|
region.getRegionNameAsString(), column.getNameAsString()));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void publishReadTiming(HRegionInfo region, HColumnDescriptor column, long msTime) {
|
||||||
|
LOG.info(String.format("read from region %s column family %s in %dms",
|
||||||
|
region.getRegionNameAsString(), column.getNameAsString(), msTime));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final long DEFAULT_INTERVAL = 6000;
|
||||||
|
|
||||||
|
private static final Log LOG = LogFactory.getLog(Canary.class);
|
||||||
|
|
||||||
|
private Configuration conf = null;
|
||||||
|
private HBaseAdmin admin = null;
|
||||||
|
private long interval = 0;
|
||||||
|
private Sink sink = null;
|
||||||
|
|
||||||
|
public Canary() {
|
||||||
|
this(new StdOutSink());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Canary(Sink sink) {
|
||||||
|
this.sink = sink;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Configuration getConf() {
|
||||||
|
return conf;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setConf(Configuration conf) {
|
||||||
|
this.conf = conf;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int run(String[] args) throws Exception {
|
||||||
|
int tables_index = -1;
|
||||||
|
|
||||||
|
// Process command line args
|
||||||
|
for (int i = 0; i < args.length; i++) {
|
||||||
|
String cmd = args[i];
|
||||||
|
|
||||||
|
if (cmd.startsWith("-")) {
|
||||||
|
if (tables_index >= 0) {
|
||||||
|
// command line args must be in the form: [opts] [table 1 [table 2 ...]]
|
||||||
|
System.err.println("Invalid command line options");
|
||||||
|
printUsageAndExit();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cmd.equals("-help")) {
|
||||||
|
// user asked for help, print the help and quit.
|
||||||
|
printUsageAndExit();
|
||||||
|
} else if (cmd.equals("-daemon") && interval == 0) {
|
||||||
|
// user asked for daemon mode, set a default interval between checks
|
||||||
|
interval = DEFAULT_INTERVAL;
|
||||||
|
} else if (cmd.equals("-interval")) {
|
||||||
|
// user has specified an interval for canary breaths (-interval N)
|
||||||
|
i++;
|
||||||
|
|
||||||
|
if (i == args.length) {
|
||||||
|
System.err.println("-interval needs a numeric value argument.");
|
||||||
|
printUsageAndExit();
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
interval = Long.parseLong(args[i]) * 1000;
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
System.err.println("-interval needs a numeric value argument.");
|
||||||
|
printUsageAndExit();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// no options match
|
||||||
|
System.err.println(cmd + " options is invalid.");
|
||||||
|
printUsageAndExit();
|
||||||
|
}
|
||||||
|
} else if (tables_index < 0) {
|
||||||
|
// keep track of first table name specified by the user
|
||||||
|
tables_index = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize HBase conf and admin
|
||||||
|
if (conf == null) conf = HBaseConfiguration.create();
|
||||||
|
admin = new HBaseAdmin(conf);
|
||||||
|
|
||||||
|
// lets the canary monitor the cluster
|
||||||
|
do {
|
||||||
|
if (admin.isAborted()) {
|
||||||
|
LOG.error("HBaseAdmin aborted");
|
||||||
|
return(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tables_index >= 0) {
|
||||||
|
for (int i = tables_index; i < args.length; i++) {
|
||||||
|
sniff(args[i]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
sniff();
|
||||||
|
}
|
||||||
|
|
||||||
|
Thread.sleep(interval);
|
||||||
|
} while (interval > 0);
|
||||||
|
|
||||||
|
return(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void printUsageAndExit() {
|
||||||
|
System.err.printf("Usage: bin/hbase %s [opts] [table 1 [table 2...]]\n", getClass().getName());
|
||||||
|
System.err.println(" where [opts] are:");
|
||||||
|
System.err.println(" -help Show this help and exit.");
|
||||||
|
System.err.println(" -daemon Continuous check at defined intervals.");
|
||||||
|
System.err.println(" -interval <N> Interval between checks (sec)");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* canary entry point to monitor all the tables.
|
||||||
|
*/
|
||||||
|
private void sniff() throws Exception {
|
||||||
|
for (HTableDescriptor table : admin.listTables()) {
|
||||||
|
sniff(table);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* canary entry point to monitor specified table.
|
||||||
|
*/
|
||||||
|
private void sniff(String tableName) throws Exception {
|
||||||
|
if (admin.isTableAvailable(tableName)) {
|
||||||
|
sniff(admin.getTableDescriptor(tableName.getBytes()));
|
||||||
|
} else {
|
||||||
|
LOG.warn(String.format("Table %s is not available", tableName));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Loops over regions that owns this table,
|
||||||
|
* and output some information abouts the state.
|
||||||
|
*/
|
||||||
|
private void sniff(HTableDescriptor tableDesc) throws Exception {
|
||||||
|
HTable table = null;
|
||||||
|
|
||||||
|
try {
|
||||||
|
table = new HTable(admin.getConfiguration(), tableDesc.getName());
|
||||||
|
} catch (TableNotFoundException e) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (HRegionInfo region : admin.getTableRegions(tableDesc.getName())) {
|
||||||
|
try {
|
||||||
|
sniffRegion(region, table);
|
||||||
|
} catch (Exception e) {
|
||||||
|
sink.publishReadFailure(region);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For each column family of the region tries to get one row
|
||||||
|
* and outputs the latency, or the failure.
|
||||||
|
*/
|
||||||
|
private void sniffRegion(HRegionInfo region, HTable table) throws Exception {
|
||||||
|
HTableDescriptor tableDesc = table.getTableDescriptor();
|
||||||
|
for (HColumnDescriptor column : tableDesc.getColumnFamilies()) {
|
||||||
|
Get get = new Get(region.getStartKey());
|
||||||
|
get.addFamily(column.getName());
|
||||||
|
|
||||||
|
try {
|
||||||
|
long startTime = System.currentTimeMillis();
|
||||||
|
table.get(get);
|
||||||
|
long time = System.currentTimeMillis() - startTime;
|
||||||
|
|
||||||
|
sink.publishReadTiming(region, column, time);
|
||||||
|
} catch (Exception e) {
|
||||||
|
sink.publishReadFailure(region, column);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
int exitCode = ToolRunner.run(new Canary(), args);
|
||||||
|
System.exit(exitCode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue