HBASE-27224 HFile tool statistic sampling produces misleading results (#4638)
Signed-off-by: Duo Zhang <zhangduo@apache.org> Reviewed-by: Clay Baenziger <cwb@clayb.net>
This commit is contained in:
parent
890d6dedfe
commit
6f6857b83f
|
@ -20,21 +20,17 @@ package org.apache.hadoop.hbase.io.hfile;
|
|||
import static com.codahale.metrics.MetricRegistry.name;
|
||||
|
||||
import com.codahale.metrics.ConsoleReporter;
|
||||
import com.codahale.metrics.Counter;
|
||||
import com.codahale.metrics.Gauge;
|
||||
import com.codahale.metrics.Histogram;
|
||||
import com.codahale.metrics.Meter;
|
||||
import com.codahale.metrics.MetricFilter;
|
||||
import com.codahale.metrics.MetricRegistry;
|
||||
import com.codahale.metrics.ScheduledReporter;
|
||||
import com.codahale.metrics.Snapshot;
|
||||
import com.codahale.metrics.Timer;
|
||||
import com.codahale.metrics.UniformReservoir;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.DataInput;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.text.DateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashSet;
|
||||
|
@ -43,10 +39,8 @@ import java.util.Locale;
|
|||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TimeZone;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import java.util.concurrent.atomic.LongAdder;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.conf.Configured;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
@ -107,6 +101,7 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
private boolean printBlockIndex;
|
||||
private boolean printBlockHeaders;
|
||||
private boolean printStats;
|
||||
private boolean printStatRanges;
|
||||
private boolean checkRow;
|
||||
private boolean checkFamily;
|
||||
private boolean isSeekToRow = false;
|
||||
|
@ -150,6 +145,8 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
options.addOption("w", "seekToRow", true,
|
||||
"Seek to this row and print all the kvs for this row only");
|
||||
options.addOption("s", "stats", false, "Print statistics");
|
||||
options.addOption("d", "details", false,
|
||||
"Print detailed statistics, including counts by range");
|
||||
options.addOption("i", "checkMobIntegrity", false,
|
||||
"Print all cells whose mob files are missing");
|
||||
|
||||
|
@ -181,7 +178,8 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
shouldPrintMeta = cmd.hasOption("m");
|
||||
printBlockIndex = cmd.hasOption("b");
|
||||
printBlockHeaders = cmd.hasOption("h");
|
||||
printStats = cmd.hasOption("s");
|
||||
printStatRanges = cmd.hasOption("d");
|
||||
printStats = cmd.hasOption("s") || printStatRanges;
|
||||
checkRow = cmd.hasOption("k");
|
||||
checkFamily = cmd.hasOption("a");
|
||||
checkMobIntegrity = cmd.hasOption("i");
|
||||
|
@ -356,7 +354,7 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
}
|
||||
|
||||
if (printStats) {
|
||||
fileStats.finish();
|
||||
fileStats.finish(printStatRanges);
|
||||
out.println("Stats:\n" + fileStats);
|
||||
}
|
||||
|
||||
|
@ -382,7 +380,7 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
}
|
||||
// collect stats
|
||||
if (printStats) {
|
||||
fileStats.collect(cell);
|
||||
fileStats.collect(cell, printStatRanges);
|
||||
}
|
||||
// dump key value
|
||||
if (printKey) {
|
||||
|
@ -581,18 +579,101 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
}
|
||||
}
|
||||
|
||||
// Default reservoir is exponentially decaying, but we're doing a point-in-time analysis
|
||||
// of a store file. It doesn't make sense to prefer keys later in the store file.
|
||||
private static final MetricRegistry.MetricSupplier<Histogram> UNIFORM_RESERVOIR =
|
||||
() -> new Histogram(new UniformReservoir());
|
||||
|
||||
// Useful ranges for viewing distribution of small to large keys, values, and rows.
|
||||
// we only print ranges which actually have values, so more here doesn't add much overhead
|
||||
private static final long[] RANGES = new long[] { 1, 3, 10, 50, 100, 500, 1_000, 5_000, 10_000,
|
||||
50_000, 100_000, 500_000, 750_000, 1_000_000, 5_000_000, 10_000_000, 50_000_000, 100_000_000 };
|
||||
|
||||
/**
|
||||
* Holds a Histogram and supporting min/max and range buckets for analyzing distribution of key
|
||||
* bytes, value bytes, row bytes, and row columns. Supports adding values, getting the histogram,
|
||||
* and getting counts per range.
|
||||
*/
|
||||
static class KeyValueStats {
|
||||
private final Histogram histogram;
|
||||
private final String name;
|
||||
private long max = Long.MIN_VALUE;
|
||||
private long min = Long.MAX_VALUE;
|
||||
private boolean collectRanges = false;
|
||||
private final LongAdder[] rangeCounts;
|
||||
|
||||
KeyValueStats(MetricRegistry metricRegistry, String statName) {
|
||||
this.histogram =
|
||||
metricRegistry.histogram(name(HFilePrettyPrinter.class, statName), UNIFORM_RESERVOIR);
|
||||
this.name = statName;
|
||||
this.rangeCounts = new LongAdder[RANGES.length];
|
||||
for (int i = 0; i < rangeCounts.length; i++) {
|
||||
rangeCounts[i] = new LongAdder();
|
||||
}
|
||||
}
|
||||
|
||||
void update(long value, boolean collectRanges) {
|
||||
histogram.update(value);
|
||||
min = Math.min(value, min);
|
||||
max = Math.max(value, max);
|
||||
|
||||
if (collectRanges) {
|
||||
this.collectRanges = true;
|
||||
int result = Arrays.binarySearch(RANGES, value);
|
||||
int idx = result >= 0 ? result : Math.abs(result) - 1;
|
||||
rangeCounts[idx].increment();
|
||||
}
|
||||
}
|
||||
|
||||
Histogram getHistogram() {
|
||||
return histogram;
|
||||
}
|
||||
|
||||
String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
long getMax() {
|
||||
return max;
|
||||
}
|
||||
|
||||
long getMin() {
|
||||
return min;
|
||||
}
|
||||
|
||||
long[] getRanges() {
|
||||
return RANGES;
|
||||
}
|
||||
|
||||
long getCountAtOrBelow(long range) {
|
||||
long count = 0;
|
||||
for (int i = 0; i < RANGES.length; i++) {
|
||||
if (RANGES[i] <= range) {
|
||||
count += rangeCounts[i].sum();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
boolean hasRangeCounts() {
|
||||
return collectRanges;
|
||||
}
|
||||
}
|
||||
|
||||
private static class KeyValueStatsCollector {
|
||||
private final MetricRegistry metricsRegistry = new MetricRegistry();
|
||||
private final ByteArrayOutputStream metricsOutput = new ByteArrayOutputStream();
|
||||
private final SimpleReporter simpleReporter = SimpleReporter.forRegistry(metricsRegistry)
|
||||
.outputTo(new PrintStream(metricsOutput)).filter(MetricFilter.ALL).build();
|
||||
|
||||
Histogram keyLen = metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Key length"));
|
||||
Histogram valLen = metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Val length"));
|
||||
Histogram rowSizeBytes =
|
||||
metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Row size (bytes)"));
|
||||
Histogram rowSizeCols =
|
||||
metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Row size (columns)"));
|
||||
KeyValueStats keyLen = new KeyValueStats(metricsRegistry, "Key length");
|
||||
KeyValueStats valLen = new KeyValueStats(metricsRegistry, "Val length");
|
||||
KeyValueStats rowSizeBytes = new KeyValueStats(metricsRegistry, "Row size (bytes)");
|
||||
KeyValueStats rowSizeCols = new KeyValueStats(metricsRegistry, "Row size (columns)");
|
||||
|
||||
private final SimpleReporter simpleReporter =
|
||||
SimpleReporter.newBuilder().outputTo(new PrintStream(metricsOutput)).addStats(keyLen)
|
||||
.addStats(valLen).addStats(rowSizeBytes).addStats(rowSizeCols).build();
|
||||
|
||||
long curRowBytes = 0;
|
||||
long curRowCols = 0;
|
||||
|
@ -603,11 +684,11 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
private long maxRowBytes = 0;
|
||||
private long curRowKeyLength;
|
||||
|
||||
public void collect(Cell cell) {
|
||||
valLen.update(cell.getValueLength());
|
||||
public void collect(Cell cell, boolean printStatRanges) {
|
||||
valLen.update(cell.getValueLength(), printStatRanges);
|
||||
if (prevCell != null && CellComparator.getInstance().compareRows(prevCell, cell) != 0) {
|
||||
// new row
|
||||
collectRow();
|
||||
collectRow(printStatRanges);
|
||||
}
|
||||
curRowBytes += cell.getSerializedSize();
|
||||
curRowKeyLength = KeyValueUtil.keyLength(cell);
|
||||
|
@ -615,10 +696,10 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
prevCell = cell;
|
||||
}
|
||||
|
||||
private void collectRow() {
|
||||
rowSizeBytes.update(curRowBytes);
|
||||
rowSizeCols.update(curRowCols);
|
||||
keyLen.update(curRowKeyLength);
|
||||
private void collectRow(boolean printStatRanges) {
|
||||
rowSizeBytes.update(curRowBytes, printStatRanges);
|
||||
rowSizeCols.update(curRowCols, printStatRanges);
|
||||
keyLen.update(curRowKeyLength, printStatRanges);
|
||||
|
||||
if (curRowBytes > maxRowBytes && prevCell != null) {
|
||||
biggestRow = CellUtil.cloneRow(prevCell);
|
||||
|
@ -629,9 +710,9 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
curRowCols = 0;
|
||||
}
|
||||
|
||||
public void finish() {
|
||||
public void finish(boolean printStatRanges) {
|
||||
if (curRowCols > 0) {
|
||||
collectRow();
|
||||
collectRow(printStatRanges);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -640,7 +721,6 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
if (prevCell == null) return "no data available for statistics";
|
||||
|
||||
// Dump the metrics to the output stream
|
||||
simpleReporter.stop();
|
||||
simpleReporter.report();
|
||||
|
||||
return metricsOutput.toString() + "Key of biggest row: " + Bytes.toStringBinary(biggestRow);
|
||||
|
@ -648,41 +728,32 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
}
|
||||
|
||||
/**
|
||||
* Almost identical to ConsoleReporter, but extending ScheduledReporter, as extending
|
||||
* ConsoleReporter in this version of dropwizard is now too much trouble.
|
||||
* Simple reporter which collects registered histograms for printing to an output stream in
|
||||
* {@link #report()}.
|
||||
*/
|
||||
private static class SimpleReporter extends ScheduledReporter {
|
||||
private static final class SimpleReporter {
|
||||
/**
|
||||
* Returns a new {@link Builder} for {@link ConsoleReporter}.
|
||||
* @param registry the registry to report
|
||||
* @return a {@link Builder} instance for a {@link ConsoleReporter}
|
||||
* Returns a new {@link Builder} for {@link SimpleReporter}.
|
||||
* @return a {@link Builder} instance for a {@link SimpleReporter}
|
||||
*/
|
||||
public static Builder forRegistry(MetricRegistry registry) {
|
||||
return new Builder(registry);
|
||||
public static Builder newBuilder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
/**
|
||||
* A builder for {@link SimpleReporter} instances. Defaults to using the default locale and time
|
||||
* zone, writing to {@code System.out}, converting rates to events/second, converting durations
|
||||
* to milliseconds, and not filtering metrics.
|
||||
* zone, writing to {@code System.out}.
|
||||
*/
|
||||
public static class Builder {
|
||||
private final MetricRegistry registry;
|
||||
private final List<KeyValueStats> stats = new ArrayList<>();
|
||||
private PrintStream output;
|
||||
private Locale locale;
|
||||
private TimeZone timeZone;
|
||||
private TimeUnit rateUnit;
|
||||
private TimeUnit durationUnit;
|
||||
private MetricFilter filter;
|
||||
|
||||
private Builder(MetricRegistry registry) {
|
||||
this.registry = registry;
|
||||
private Builder() {
|
||||
this.output = System.out;
|
||||
this.locale = Locale.getDefault();
|
||||
this.timeZone = TimeZone.getDefault();
|
||||
this.rateUnit = TimeUnit.SECONDS;
|
||||
this.durationUnit = TimeUnit.MILLISECONDS;
|
||||
this.filter = MetricFilter.ALL;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -696,12 +767,12 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
}
|
||||
|
||||
/**
|
||||
* Only report metrics which match the given filter.
|
||||
* @param filter a {@link MetricFilter}
|
||||
* Add the given {@link KeyValueStats} to be reported
|
||||
* @param stat the stat to be reported
|
||||
* @return {@code this}
|
||||
*/
|
||||
public Builder filter(MetricFilter filter) {
|
||||
this.filter = filter;
|
||||
public Builder addStats(KeyValueStats stat) {
|
||||
this.stats.add(stat);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -710,35 +781,31 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
* @return a {@link ConsoleReporter}
|
||||
*/
|
||||
public SimpleReporter build() {
|
||||
return new SimpleReporter(registry, output, locale, timeZone, rateUnit, durationUnit,
|
||||
filter);
|
||||
return new SimpleReporter(output, stats, locale, timeZone);
|
||||
}
|
||||
}
|
||||
|
||||
private final PrintStream output;
|
||||
private final List<KeyValueStats> stats;
|
||||
private final Locale locale;
|
||||
private final DateFormat dateFormat;
|
||||
|
||||
private SimpleReporter(MetricRegistry registry, PrintStream output, Locale locale,
|
||||
TimeZone timeZone, TimeUnit rateUnit, TimeUnit durationUnit, MetricFilter filter) {
|
||||
super(registry, "simple-reporter", filter, rateUnit, durationUnit);
|
||||
private SimpleReporter(PrintStream output, List<KeyValueStats> stats, Locale locale,
|
||||
TimeZone timeZone) {
|
||||
this.output = output;
|
||||
this.stats = stats;
|
||||
this.locale = locale;
|
||||
|
||||
this.dateFormat = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.MEDIUM, locale);
|
||||
dateFormat.setTimeZone(timeZone);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void report(SortedMap<String, Gauge> gauges, SortedMap<String, Counter> counters,
|
||||
SortedMap<String, Histogram> histograms, SortedMap<String, Meter> meters,
|
||||
SortedMap<String, Timer> timers) {
|
||||
public void report() {
|
||||
// we know we only have histograms
|
||||
if (!histograms.isEmpty()) {
|
||||
for (Map.Entry<String, Histogram> entry : histograms.entrySet()) {
|
||||
output.print(" " + StringUtils.substringAfterLast(entry.getKey(), "."));
|
||||
if (!stats.isEmpty()) {
|
||||
for (KeyValueStats stat : stats) {
|
||||
output.print(" " + stat.getName());
|
||||
output.println(':');
|
||||
printHistogram(entry.getValue());
|
||||
printHistogram(stat);
|
||||
}
|
||||
output.println();
|
||||
}
|
||||
|
@ -747,10 +814,12 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
output.flush();
|
||||
}
|
||||
|
||||
private void printHistogram(Histogram histogram) {
|
||||
private void printHistogram(KeyValueStats stats) {
|
||||
Histogram histogram = stats.getHistogram();
|
||||
Snapshot snapshot = histogram.getSnapshot();
|
||||
output.printf(locale, " min = %d%n", snapshot.getMin());
|
||||
output.printf(locale, " max = %d%n", snapshot.getMax());
|
||||
|
||||
output.printf(locale, " min = %d%n", stats.getMin());
|
||||
output.printf(locale, " max = %d%n", stats.getMax());
|
||||
output.printf(locale, " mean = %2.2f%n", snapshot.getMean());
|
||||
output.printf(locale, " stddev = %2.2f%n", snapshot.getStdDev());
|
||||
output.printf(locale, " median = %2.2f%n", snapshot.getMedian());
|
||||
|
@ -760,6 +829,37 @@ public class HFilePrettyPrinter extends Configured implements Tool {
|
|||
output.printf(locale, " 99%% <= %2.2f%n", snapshot.get99thPercentile());
|
||||
output.printf(locale, " 99.9%% <= %2.2f%n", snapshot.get999thPercentile());
|
||||
output.printf(locale, " count = %d%n", histogram.getCount());
|
||||
|
||||
// if printStatRanges was enabled with -d arg, below we'll create an approximate histogram
|
||||
// of counts based on the configured ranges in RANGES. Each range of sizes (i.e. <= 50, <=
|
||||
// 100, etc) will have a count printed if any values were seen in that range. If no values
|
||||
// were seen for a range, that range will be excluded to keep the output small.
|
||||
if (stats.hasRangeCounts()) {
|
||||
output.printf(locale, " (range <= count):%n");
|
||||
long lastVal = 0;
|
||||
long lastRange = 0;
|
||||
for (long range : stats.getRanges()) {
|
||||
long val = stats.getCountAtOrBelow(range);
|
||||
if (val - lastVal > 0) {
|
||||
// print the last zero value before this one, to give context
|
||||
if (lastVal == 0 && lastRange != 0) {
|
||||
printRangeCount(lastRange, lastVal);
|
||||
}
|
||||
printRangeCount(range, val - lastVal);
|
||||
}
|
||||
lastVal = val;
|
||||
lastRange = range;
|
||||
}
|
||||
if (histogram.getCount() - lastVal > 0) {
|
||||
// print any remaining that might have been outside our buckets
|
||||
printRangeCount(Long.MAX_VALUE, histogram.getCount() - lastVal);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void printRangeCount(long range, long countAtOrBelow) {
|
||||
String rangeString = range == Long.MAX_VALUE ? "inf" : Long.toString(range);
|
||||
output.printf(locale, "%17s <= %d%n", rangeString, countAtOrBelow);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ import static org.junit.Assert.assertEquals;
|
|||
import static org.junit.Assert.assertNotEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import com.codahale.metrics.MetricRegistry;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
@ -126,4 +127,59 @@ public class TestHFilePrettyPrinter {
|
|||
String expectedResult = "Scanning -> " + fileNotInRootDir + "\n" + "Scanned kv count -> 1\n";
|
||||
assertEquals(expectedResult, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHistograms() throws Exception {
|
||||
Path fileNotInRootDir = UTIL.getDataTestDir("hfile");
|
||||
TestHRegionServerBulkLoad.createHFile(fs, fileNotInRootDir, cf, fam, value, 1000);
|
||||
assertNotEquals("directory used is not an HBase root dir", UTIL.getDefaultRootDirPath(),
|
||||
fileNotInRootDir);
|
||||
|
||||
System.setOut(ps);
|
||||
new HFilePrettyPrinter(conf).run(new String[] { "-s", "-d", String.valueOf(fileNotInRootDir) });
|
||||
String result = stream.toString();
|
||||
LOG.info(result);
|
||||
|
||||
// split out the output into sections based on the headers
|
||||
String[] headers =
|
||||
new String[] { "Key length", "Val length", "Row size (bytes)", "Row size (columns)" };
|
||||
// for each section, there is a corresponding expected (count, range) pairs
|
||||
int[][] expectations = new int[][] { new int[] { 0, 10, 1000, 50 }, new int[] { 0, 1, 1000, 3 },
|
||||
new int[] { 0, 10, 1000, 50 }, new int[] { 1000, 1 }, };
|
||||
|
||||
for (int i = 0; i < headers.length - 1; i++) {
|
||||
int idx = result.indexOf(headers[i]);
|
||||
int nextIdx = result.indexOf(headers[i + 1]);
|
||||
|
||||
assertContainsRanges(result.substring(idx, nextIdx), expectations[i]);
|
||||
}
|
||||
}
|
||||
|
||||
private void assertContainsRanges(String result, int... rangeCountPairs) {
|
||||
for (int i = 0; i < rangeCountPairs.length - 1; i += 2) {
|
||||
String expected = rangeCountPairs[i + 1] + " <= " + rangeCountPairs[i];
|
||||
assertTrue("expected:\n" + result + "\nto contain: '" + expected + "'",
|
||||
result.contains(expected));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testKeyValueStats() {
|
||||
HFilePrettyPrinter.KeyValueStats stats =
|
||||
new HFilePrettyPrinter.KeyValueStats(new MetricRegistry(), "test");
|
||||
long[] ranges = stats.getRanges();
|
||||
for (long range : ranges) {
|
||||
stats.update(range - 1, true);
|
||||
}
|
||||
|
||||
assertEquals(ranges[ranges.length - 1] - 1, stats.getMax());
|
||||
assertEquals(ranges[0] - 1, stats.getMin());
|
||||
|
||||
int total = 1;
|
||||
for (long range : ranges) {
|
||||
long val = stats.getCountAtOrBelow(range);
|
||||
assertEquals(total++, val);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue