HBASE-15773 Improvements to CellCounter job

Gary Helmling 2016-05-05 12:40:47 -07:00
parent d90f0571e6
commit 86ca09e0e5
2 changed files with 131 additions and 86 deletions

CellCounter.java

@@ -92,7 +92,30 @@ public class CellCounter extends Configured implements Tool {
      * Counter enumeration to count the actual rows.
      */
     public static enum Counters {
-      ROWS
+      ROWS,
+      CELLS
+    }
+
+    private Configuration conf;
+    private String separator;
+
+    // state of current row, family, column needs to persist across map() invocations
+    // in order to properly handle scanner batching, where a single qualifier may have too
+    // many versions for a single map() call
+    private byte[] lastRow;
+    private String currentRowKey;
+    byte[] currentFamily = null;
+    String currentFamilyName = null;
+    byte[] currentQualifier = null;
+    // family + qualifier
+    String currentQualifierName = null;
+    // rowkey + family + qualifier
+    String currentRowQualifierName = null;
+
+    @Override
+    protected void setup(Context context) throws IOException, InterruptedException {
+      conf = context.getConfiguration();
+      separator = conf.get("ReportSeparator",":");
     }
 
     /**
@@ -112,49 +135,45 @@ public class CellCounter extends Configured implements Tool {
         throws IOException {
       Preconditions.checkState(values != null,
           "values passed to the map is null");
-      String currentFamilyName = null;
-      String currentQualifierName = null;
-      String currentRowKey = null;
-      Configuration config = context.getConfiguration();
-      String separator = config.get("ReportSeparator",":");
 
       try {
-        context.getCounter(Counters.ROWS).increment(1);
-        context.write(new Text("Total ROWS"), new IntWritable(1));
-        if (values != null && !values.isEmpty()) {
+        byte[] currentRow = values.getRow();
+        if (lastRow == null || !Bytes.equals(lastRow, currentRow)) {
+          lastRow = currentRow;
+          currentRowKey = Bytes.toStringBinary(currentRow);
+          currentFamily = null;
+          currentQualifier = null;
+          context.getCounter(Counters.ROWS).increment(1);
+          context.write(new Text("Total ROWS"), new IntWritable(1));
+        }
+
+        if (!values.isEmpty()) {
+          int cellCount = 0;
           for (Cell value : values.listCells()) {
-            currentRowKey = Bytes.toStringBinary(CellUtil.cloneRow(value));
-            String thisRowFamilyName = Bytes.toStringBinary(CellUtil.cloneFamily(value));
-            if (!thisRowFamilyName.equals(currentFamilyName)) {
-              currentFamilyName = thisRowFamilyName;
-              context.getCounter("CF", thisRowFamilyName).increment(1);
-              if (1 == context.getCounter("CF", thisRowFamilyName).getValue()) {
+            cellCount++;
+            if (currentFamily == null || !CellUtil.matchingFamily(value, currentFamily)) {
+              currentFamily = CellUtil.cloneFamily(value);
+              currentFamilyName = Bytes.toStringBinary(currentFamily);
+              currentQualifier = null;
+              context.getCounter("CF", currentFamilyName).increment(1);
+              if (1 == context.getCounter("CF", currentFamilyName).getValue()) {
                 context.write(new Text("Total Families Across all Rows"), new IntWritable(1));
-                context.write(new Text(thisRowFamilyName), new IntWritable(1));
+                context.write(new Text(currentFamily), new IntWritable(1));
               }
             }
-            String thisRowQualifierName = thisRowFamilyName + separator
-                + Bytes.toStringBinary(CellUtil.cloneQualifier(value));
-            if (!thisRowQualifierName.equals(currentQualifierName)) {
-              currentQualifierName = thisRowQualifierName;
-              context.getCounter("CFQL", thisRowQualifierName).increment(1);
+            if (currentQualifier == null || !CellUtil.matchingQualifier(value, currentQualifier)) {
+              currentQualifier = CellUtil.cloneQualifier(value);
+              currentQualifierName = currentFamilyName + separator +
+                  Bytes.toStringBinary(currentQualifier);
+              currentRowQualifierName = currentRowKey + separator + currentQualifierName;
               context.write(new Text("Total Qualifiers across all Rows"),
                   new IntWritable(1));
-              context.write(new Text(thisRowQualifierName), new IntWritable(1));
-              // Intialize versions
-              context.getCounter("QL_VERSIONS", currentRowKey + separator +
-                  thisRowQualifierName).increment(1);
-              context.write(new Text(currentRowKey + separator
-                  + thisRowQualifierName + "_Versions"), new IntWritable(1));
-            } else {
-              // Increment versions
-              currentQualifierName = thisRowQualifierName;
-              context.getCounter("QL_VERSIONS", currentRowKey + separator +
-                  thisRowQualifierName).increment(1);
-              context.write(new Text(currentRowKey + separator
-                  + thisRowQualifierName + "_Versions"), new IntWritable(1));
+              context.write(new Text(currentQualifierName), new IntWritable(1));
             }
+            // Increment versions
+            context.write(new Text(currentRowQualifierName + "_Versions"), new IntWritable(1));
           }
+          context.getCounter(Counters.CELLS).increment(cellCount);
         }
       } catch (InterruptedException e) {
         e.printStackTrace();
@@ -208,15 +227,16 @@ public class CellCounter extends Configured implements Tool {
     return job;
   }
 
-  private static Scan getConfiguredScanForJob(Configuration conf, String[] args) throws IOException {
-    Scan s = new Scan();
+  private static Scan getConfiguredScanForJob(Configuration conf, String[] args)
+      throws IOException {
+    // create scan with any properties set from TableInputFormat
+    Scan s = TableInputFormat.createScanFromConfiguration(conf);
     // Set Scan Versions
-    s.setMaxVersions(Integer.MAX_VALUE);
-    s.setCacheBlocks(false);
-    // Set Scan Column Family
-    if (conf.get(TableInputFormat.SCAN_COLUMN_FAMILY) != null) {
-      s.addFamily(Bytes.toBytes(conf.get(TableInputFormat.SCAN_COLUMN_FAMILY)));
+    if (conf.get(TableInputFormat.SCAN_MAXVERSIONS) == null) {
+      // default to all versions unless explicitly set
+      s.setMaxVersions(Integer.MAX_VALUE);
     }
+    s.setCacheBlocks(false);
     // Set RowFilter or Prefix Filter if applicable.
     Filter rowFilter = getRowFilter(args);
     if (rowFilter!= null) {
@@ -277,9 +297,18 @@ public class CellCounter extends Configured implements Tool {
     System.err.println(" <tablename> <outputDir> <reportSeparator> [^[regex pattern] or " +
       "[Prefix] for row filter]] --starttime=[starttime] --endtime=[endtime]");
     System.err.println(" Note: -D properties will be applied to the conf used. ");
-    System.err.println(" Additionally, the following SCAN properties can be specified");
-    System.err.println(" to get fine grained control on what is counted..");
+    System.err.println(" Additionally, all of the SCAN properties from TableInputFormat");
+    System.err.println(" can be specified to get fine grained control on what is counted..");
+    System.err.println(" -D " + TableInputFormat.SCAN_ROW_START + "=<rowkey>");
+    System.err.println(" -D " + TableInputFormat.SCAN_ROW_STOP + "=<rowkey>");
+    System.err.println(" -D " + TableInputFormat.SCAN_COLUMNS + "=\"<col1> <col2>...\"");
     System.err.println(" -D " + TableInputFormat.SCAN_COLUMN_FAMILY + "=<familyName>");
+    System.err.println(" -D " + TableInputFormat.SCAN_TIMESTAMP + "=<timestamp>");
+    System.err.println(" -D " + TableInputFormat.SCAN_TIMERANGE_START + "=<timestamp>");
+    System.err.println(" -D " + TableInputFormat.SCAN_TIMERANGE_END + "=<timestamp>");
+    System.err.println(" -D " + TableInputFormat.SCAN_MAXVERSIONS + "=<count>");
+    System.err.println(" -D " + TableInputFormat.SCAN_CACHEDROWS + "=<count>");
+    System.err.println(" -D " + TableInputFormat.SCAN_BATCHSIZE + "=<count>");
     System.err.println(" <reportSeparator> parameter can be used to override the default report separator " +
       "string : used to separate the rowId/column family name and qualifier name.");
     System.err.println(" [^[regex pattern] or [Prefix] parameter can be used to limit the cell counter count " +

TableInputFormat.java

@@ -126,48 +126,7 @@ implements Configurable {
       }
     } else {
       try {
-        scan = new Scan();
-
-        if (conf.get(SCAN_ROW_START) != null) {
-          scan.setStartRow(Bytes.toBytesBinary(conf.get(SCAN_ROW_START)));
-        }
-
-        if (conf.get(SCAN_ROW_STOP) != null) {
-          scan.setStopRow(Bytes.toBytesBinary(conf.get(SCAN_ROW_STOP)));
-        }
-
-        if (conf.get(SCAN_COLUMNS) != null) {
-          addColumns(scan, conf.get(SCAN_COLUMNS));
-        }
-
-        if (conf.get(SCAN_COLUMN_FAMILY) != null) {
-          scan.addFamily(Bytes.toBytes(conf.get(SCAN_COLUMN_FAMILY)));
-        }
-
-        if (conf.get(SCAN_TIMESTAMP) != null) {
-          scan.setTimeStamp(Long.parseLong(conf.get(SCAN_TIMESTAMP)));
-        }
-
-        if (conf.get(SCAN_TIMERANGE_START) != null && conf.get(SCAN_TIMERANGE_END) != null) {
-          scan.setTimeRange(
-              Long.parseLong(conf.get(SCAN_TIMERANGE_START)),
-              Long.parseLong(conf.get(SCAN_TIMERANGE_END)));
-        }
-
-        if (conf.get(SCAN_MAXVERSIONS) != null) {
-          scan.setMaxVersions(Integer.parseInt(conf.get(SCAN_MAXVERSIONS)));
-        }
-
-        if (conf.get(SCAN_CACHEDROWS) != null) {
-          scan.setCaching(Integer.parseInt(conf.get(SCAN_CACHEDROWS)));
-        }
-
-        if (conf.get(SCAN_BATCHSIZE) != null) {
-          scan.setBatch(Integer.parseInt(conf.get(SCAN_BATCHSIZE)));
-        }
-
-        // false by default, full table scans generate too much BC churn
-        scan.setCacheBlocks((conf.getBoolean(SCAN_CACHEBLOCKS, false)));
+        scan = createScanFromConfiguration(conf);
       } catch (Exception e) {
         LOG.error(StringUtils.stringifyException(e));
       }
@@ -176,6 +135,63 @@ implements Configurable {
     setScan(scan);
   }
 
+  /**
+   * Sets up a {@link Scan} instance, applying settings from the configuration property
+   * constants defined in {@code TableInputFormat}.  This allows specifying things such as:
+   * <ul>
+   *   <li>start and stop rows</li>
+   *   <li>column qualifiers or families</li>
+   *   <li>timestamps or timerange</li>
+   *   <li>scanner caching and batch size</li>
+   * </ul>
+   */
+  public static Scan createScanFromConfiguration(Configuration conf) throws IOException {
+    Scan scan = new Scan();
+
+    if (conf.get(SCAN_ROW_START) != null) {
+      scan.setStartRow(Bytes.toBytesBinary(conf.get(SCAN_ROW_START)));
+    }
+
+    if (conf.get(SCAN_ROW_STOP) != null) {
+      scan.setStopRow(Bytes.toBytesBinary(conf.get(SCAN_ROW_STOP)));
+    }
+
+    if (conf.get(SCAN_COLUMNS) != null) {
+      addColumns(scan, conf.get(SCAN_COLUMNS));
+    }
+
+    if (conf.get(SCAN_COLUMN_FAMILY) != null) {
+      scan.addFamily(Bytes.toBytes(conf.get(SCAN_COLUMN_FAMILY)));
+    }
+
+    if (conf.get(SCAN_TIMESTAMP) != null) {
+      scan.setTimeStamp(Long.parseLong(conf.get(SCAN_TIMESTAMP)));
+    }
+
+    if (conf.get(SCAN_TIMERANGE_START) != null && conf.get(SCAN_TIMERANGE_END) != null) {
+      scan.setTimeRange(
+          Long.parseLong(conf.get(SCAN_TIMERANGE_START)),
+          Long.parseLong(conf.get(SCAN_TIMERANGE_END)));
+    }
+
+    if (conf.get(SCAN_MAXVERSIONS) != null) {
+      scan.setMaxVersions(Integer.parseInt(conf.get(SCAN_MAXVERSIONS)));
+    }
+
+    if (conf.get(SCAN_CACHEDROWS) != null) {
+      scan.setCaching(Integer.parseInt(conf.get(SCAN_CACHEDROWS)));
+    }
+
+    if (conf.get(SCAN_BATCHSIZE) != null) {
+      scan.setBatch(Integer.parseInt(conf.get(SCAN_BATCHSIZE)));
+    }
+
+    // false by default, full table scans generate too much BC churn
+    scan.setCacheBlocks((conf.getBoolean(SCAN_CACHEBLOCKS, false)));
+
+    return scan;
+  }
+
   @Override
   protected void initialize(JobContext context) throws IOException {
     // Do we have to worry about mis-matches between the Configuration from setConf and the one
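
As a standalone illustration of the new factory method, a sketch of building a Scan directly from configuration properties follows; the class name and property values are made up for the example, while the SCAN_* keys and createScanFromConfiguration come from the diff above.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;

public class ScanFromConfigurationSketch {
  public static Scan buildScan() throws IOException {
    Configuration conf = HBaseConfiguration.create();
    // Illustrative values only; any combination of the SCAN_* keys may be set.
    conf.set(TableInputFormat.SCAN_ROW_START, "row-0000");
    conf.set(TableInputFormat.SCAN_ROW_STOP, "row-9999");
    conf.set(TableInputFormat.SCAN_MAXVERSIONS, "3");
    // Unset keys are skipped; block caching stays off unless SCAN_CACHEBLOCKS is true.
    return TableInputFormat.createScanFromConfiguration(conf);
  }
}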