HBASE-27224 HFile tool statistic sampling produces misleading results (#4638)

Signed-off-by: Duo Zhang <zhangduo@apache.org>
Reviewed-by:  Clay Baenziger <cwb@clayb.net>
This commit is contained in:
Bryan Beaudreault 2022-09-06 15:04:32 -04:00 committed by GitHub
parent 1b5403cf7d
commit ee58f177f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 226 additions and 70 deletions

View File

@ -20,21 +20,17 @@ package org.apache.hadoop.hbase.io.hfile;
import static com.codahale.metrics.MetricRegistry.name;
import com.codahale.metrics.ConsoleReporter;
import com.codahale.metrics.Counter;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.Histogram;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricFilter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.ScheduledReporter;
import com.codahale.metrics.Snapshot;
import com.codahale.metrics.Timer;
import com.codahale.metrics.UniformReservoir;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.IOException;
import java.io.PrintStream;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
@ -43,10 +39,8 @@ import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.SortedMap;
import java.util.TimeZone;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.StringUtils;
import java.util.concurrent.atomic.LongAdder;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
@ -107,6 +101,7 @@ public class HFilePrettyPrinter extends Configured implements Tool {
private boolean printBlockIndex;
private boolean printBlockHeaders;
private boolean printStats;
private boolean printStatRanges;
private boolean checkRow;
private boolean checkFamily;
private boolean isSeekToRow = false;
@ -150,6 +145,8 @@ public class HFilePrettyPrinter extends Configured implements Tool {
options.addOption("w", "seekToRow", true,
"Seek to this row and print all the kvs for this row only");
options.addOption("s", "stats", false, "Print statistics");
options.addOption("d", "details", false,
"Print detailed statistics, including counts by range");
options.addOption("i", "checkMobIntegrity", false,
"Print all cells whose mob files are missing");
@ -181,7 +178,8 @@ public class HFilePrettyPrinter extends Configured implements Tool {
shouldPrintMeta = cmd.hasOption("m");
printBlockIndex = cmd.hasOption("b");
printBlockHeaders = cmd.hasOption("h");
printStats = cmd.hasOption("s");
printStatRanges = cmd.hasOption("d");
printStats = cmd.hasOption("s") || printStatRanges;
checkRow = cmd.hasOption("k");
checkFamily = cmd.hasOption("a");
checkMobIntegrity = cmd.hasOption("i");
@ -356,7 +354,7 @@ public class HFilePrettyPrinter extends Configured implements Tool {
}
if (printStats) {
fileStats.finish();
fileStats.finish(printStatRanges);
out.println("Stats:\n" + fileStats);
}
@ -382,7 +380,7 @@ public class HFilePrettyPrinter extends Configured implements Tool {
}
// collect stats
if (printStats) {
fileStats.collect(cell);
fileStats.collect(cell, printStatRanges);
}
// dump key value
if (printKey) {
@ -581,18 +579,101 @@ public class HFilePrettyPrinter extends Configured implements Tool {
}
}
// Default reservoir is exponentially decaying, but we're doing a point-in-time analysis
// of a store file. It doesn't make sense to prefer keys later in the store file.
private static final MetricRegistry.MetricSupplier<Histogram> UNIFORM_RESERVOIR =
() -> new Histogram(new UniformReservoir());
// Useful ranges for viewing distribution of small to large keys, values, and rows.
// we only print ranges which actually have values, so more here doesn't add much overhead
private static final long[] RANGES = new long[] { 1, 3, 10, 50, 100, 500, 1_000, 5_000, 10_000,
50_000, 100_000, 500_000, 750_000, 1_000_000, 5_000_000, 10_000_000, 50_000_000, 100_000_000 };
/**
* Holds a Histogram and supporting min/max and range buckets for analyzing distribution of key
* bytes, value bytes, row bytes, and row columns. Supports adding values, getting the histogram,
* and getting counts per range.
*/
static class KeyValueStats {
private final Histogram histogram;
private final String name;
private long max = Long.MIN_VALUE;
private long min = Long.MAX_VALUE;
private boolean collectRanges = false;
private final LongAdder[] rangeCounts;
KeyValueStats(MetricRegistry metricRegistry, String statName) {
this.histogram =
metricRegistry.histogram(name(HFilePrettyPrinter.class, statName), UNIFORM_RESERVOIR);
this.name = statName;
this.rangeCounts = new LongAdder[RANGES.length];
for (int i = 0; i < rangeCounts.length; i++) {
rangeCounts[i] = new LongAdder();
}
}
void update(long value, boolean collectRanges) {
histogram.update(value);
min = Math.min(value, min);
max = Math.max(value, max);
if (collectRanges) {
this.collectRanges = true;
int result = Arrays.binarySearch(RANGES, value);
int idx = result >= 0 ? result : Math.abs(result) - 1;
rangeCounts[idx].increment();
}
}
Histogram getHistogram() {
return histogram;
}
String getName() {
return name;
}
long getMax() {
return max;
}
long getMin() {
return min;
}
long[] getRanges() {
return RANGES;
}
long getCountAtOrBelow(long range) {
long count = 0;
for (int i = 0; i < RANGES.length; i++) {
if (RANGES[i] <= range) {
count += rangeCounts[i].sum();
} else {
break;
}
}
return count;
}
boolean hasRangeCounts() {
return collectRanges;
}
}
private static class KeyValueStatsCollector {
private final MetricRegistry metricsRegistry = new MetricRegistry();
private final ByteArrayOutputStream metricsOutput = new ByteArrayOutputStream();
private final SimpleReporter simpleReporter = SimpleReporter.forRegistry(metricsRegistry)
.outputTo(new PrintStream(metricsOutput)).filter(MetricFilter.ALL).build();
Histogram keyLen = metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Key length"));
Histogram valLen = metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Val length"));
Histogram rowSizeBytes =
metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Row size (bytes)"));
Histogram rowSizeCols =
metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Row size (columns)"));
KeyValueStats keyLen = new KeyValueStats(metricsRegistry, "Key length");
KeyValueStats valLen = new KeyValueStats(metricsRegistry, "Val length");
KeyValueStats rowSizeBytes = new KeyValueStats(metricsRegistry, "Row size (bytes)");
KeyValueStats rowSizeCols = new KeyValueStats(metricsRegistry, "Row size (columns)");
private final SimpleReporter simpleReporter =
SimpleReporter.newBuilder().outputTo(new PrintStream(metricsOutput)).addStats(keyLen)
.addStats(valLen).addStats(rowSizeBytes).addStats(rowSizeCols).build();
long curRowBytes = 0;
long curRowCols = 0;
@ -603,11 +684,11 @@ public class HFilePrettyPrinter extends Configured implements Tool {
private long maxRowBytes = 0;
private long curRowKeyLength;
public void collect(Cell cell) {
valLen.update(cell.getValueLength());
public void collect(Cell cell, boolean printStatRanges) {
valLen.update(cell.getValueLength(), printStatRanges);
if (prevCell != null && CellComparator.getInstance().compareRows(prevCell, cell) != 0) {
// new row
collectRow();
collectRow(printStatRanges);
}
curRowBytes += cell.getSerializedSize();
curRowKeyLength = KeyValueUtil.keyLength(cell);
@ -615,10 +696,10 @@ public class HFilePrettyPrinter extends Configured implements Tool {
prevCell = cell;
}
private void collectRow() {
rowSizeBytes.update(curRowBytes);
rowSizeCols.update(curRowCols);
keyLen.update(curRowKeyLength);
private void collectRow(boolean printStatRanges) {
rowSizeBytes.update(curRowBytes, printStatRanges);
rowSizeCols.update(curRowCols, printStatRanges);
keyLen.update(curRowKeyLength, printStatRanges);
if (curRowBytes > maxRowBytes && prevCell != null) {
biggestRow = CellUtil.cloneRow(prevCell);
@ -629,9 +710,9 @@ public class HFilePrettyPrinter extends Configured implements Tool {
curRowCols = 0;
}
public void finish() {
public void finish(boolean printStatRanges) {
if (curRowCols > 0) {
collectRow();
collectRow(printStatRanges);
}
}
@ -640,7 +721,6 @@ public class HFilePrettyPrinter extends Configured implements Tool {
if (prevCell == null) return "no data available for statistics";
// Dump the metrics to the output stream
simpleReporter.stop();
simpleReporter.report();
return metricsOutput.toString() + "Key of biggest row: " + Bytes.toStringBinary(biggestRow);
@ -648,41 +728,32 @@ public class HFilePrettyPrinter extends Configured implements Tool {
}
/**
* Almost identical to ConsoleReporter, but extending ScheduledReporter, as extending
* ConsoleReporter in this version of dropwizard is now too much trouble.
* Simple reporter which collects registered histograms for printing to an output stream in
* {@link #report()}.
*/
private static class SimpleReporter extends ScheduledReporter {
private static final class SimpleReporter {
/**
* Returns a new {@link Builder} for {@link ConsoleReporter}.
* @param registry the registry to report
* @return a {@link Builder} instance for a {@link ConsoleReporter}
* Returns a new {@link Builder} for {@link SimpleReporter}.
* @return a {@link Builder} instance for a {@link SimpleReporter}
*/
public static Builder forRegistry(MetricRegistry registry) {
return new Builder(registry);
public static Builder newBuilder() {
return new Builder();
}
/**
* A builder for {@link SimpleReporter} instances. Defaults to using the default locale and time
* zone, writing to {@code System.out}, converting rates to events/second, converting durations
* to milliseconds, and not filtering metrics.
* zone, writing to {@code System.out}.
*/
public static class Builder {
private final MetricRegistry registry;
private final List<KeyValueStats> stats = new ArrayList<>();
private PrintStream output;
private Locale locale;
private TimeZone timeZone;
private TimeUnit rateUnit;
private TimeUnit durationUnit;
private MetricFilter filter;
private Builder(MetricRegistry registry) {
this.registry = registry;
private Builder() {
this.output = System.out;
this.locale = Locale.getDefault();
this.timeZone = TimeZone.getDefault();
this.rateUnit = TimeUnit.SECONDS;
this.durationUnit = TimeUnit.MILLISECONDS;
this.filter = MetricFilter.ALL;
}
/**
@ -696,12 +767,12 @@ public class HFilePrettyPrinter extends Configured implements Tool {
}
/**
* Only report metrics which match the given filter.
* @param filter a {@link MetricFilter}
* Add the given {@link KeyValueStats} to be reported
* @param stat the stat to be reported
* @return {@code this}
*/
public Builder filter(MetricFilter filter) {
this.filter = filter;
public Builder addStats(KeyValueStats stat) {
this.stats.add(stat);
return this;
}
@ -710,35 +781,31 @@ public class HFilePrettyPrinter extends Configured implements Tool {
* @return a {@link ConsoleReporter}
*/
public SimpleReporter build() {
return new SimpleReporter(registry, output, locale, timeZone, rateUnit, durationUnit,
filter);
return new SimpleReporter(output, stats, locale, timeZone);
}
}
private final PrintStream output;
private final List<KeyValueStats> stats;
private final Locale locale;
private final DateFormat dateFormat;
private SimpleReporter(MetricRegistry registry, PrintStream output, Locale locale,
TimeZone timeZone, TimeUnit rateUnit, TimeUnit durationUnit, MetricFilter filter) {
super(registry, "simple-reporter", filter, rateUnit, durationUnit);
private SimpleReporter(PrintStream output, List<KeyValueStats> stats, Locale locale,
TimeZone timeZone) {
this.output = output;
this.stats = stats;
this.locale = locale;
this.dateFormat = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.MEDIUM, locale);
dateFormat.setTimeZone(timeZone);
}
@Override
public void report(SortedMap<String, Gauge> gauges, SortedMap<String, Counter> counters,
SortedMap<String, Histogram> histograms, SortedMap<String, Meter> meters,
SortedMap<String, Timer> timers) {
public void report() {
// we know we only have histograms
if (!histograms.isEmpty()) {
for (Map.Entry<String, Histogram> entry : histograms.entrySet()) {
output.print(" " + StringUtils.substringAfterLast(entry.getKey(), "."));
if (!stats.isEmpty()) {
for (KeyValueStats stat : stats) {
output.print(" " + stat.getName());
output.println(':');
printHistogram(entry.getValue());
printHistogram(stat);
}
output.println();
}
@ -747,10 +814,12 @@ public class HFilePrettyPrinter extends Configured implements Tool {
output.flush();
}
private void printHistogram(Histogram histogram) {
private void printHistogram(KeyValueStats stats) {
Histogram histogram = stats.getHistogram();
Snapshot snapshot = histogram.getSnapshot();
output.printf(locale, " min = %d%n", snapshot.getMin());
output.printf(locale, " max = %d%n", snapshot.getMax());
output.printf(locale, " min = %d%n", stats.getMin());
output.printf(locale, " max = %d%n", stats.getMax());
output.printf(locale, " mean = %2.2f%n", snapshot.getMean());
output.printf(locale, " stddev = %2.2f%n", snapshot.getStdDev());
output.printf(locale, " median = %2.2f%n", snapshot.getMedian());
@ -760,6 +829,37 @@ public class HFilePrettyPrinter extends Configured implements Tool {
output.printf(locale, " 99%% <= %2.2f%n", snapshot.get99thPercentile());
output.printf(locale, " 99.9%% <= %2.2f%n", snapshot.get999thPercentile());
output.printf(locale, " count = %d%n", histogram.getCount());
// if printStatRanges was enabled with -d arg, below we'll create an approximate histogram
// of counts based on the configured ranges in RANGES. Each range of sizes (i.e. <= 50, <=
// 100, etc) will have a count printed if any values were seen in that range. If no values
// were seen for a range, that range will be excluded to keep the output small.
if (stats.hasRangeCounts()) {
output.printf(locale, " (range <= count):%n");
long lastVal = 0;
long lastRange = 0;
for (long range : stats.getRanges()) {
long val = stats.getCountAtOrBelow(range);
if (val - lastVal > 0) {
// print the last zero value before this one, to give context
if (lastVal == 0 && lastRange != 0) {
printRangeCount(lastRange, lastVal);
}
printRangeCount(range, val - lastVal);
}
lastVal = val;
lastRange = range;
}
if (histogram.getCount() - lastVal > 0) {
// print any remaining that might have been outside our buckets
printRangeCount(Long.MAX_VALUE, histogram.getCount() - lastVal);
}
}
}
private void printRangeCount(long range, long countAtOrBelow) {
String rangeString = range == Long.MAX_VALUE ? "inf" : Long.toString(range);
output.printf(locale, "%17s <= %d%n", rangeString, countAtOrBelow);
}
}

View File

@ -21,6 +21,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertTrue;
import com.codahale.metrics.MetricRegistry;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import org.apache.hadoop.conf.Configuration;
@ -126,4 +127,59 @@ public class TestHFilePrettyPrinter {
String expectedResult = "Scanning -> " + fileNotInRootDir + "\n" + "Scanned kv count -> 1\n";
assertEquals(expectedResult, result);
}
@Test
public void testHistograms() throws Exception {
Path fileNotInRootDir = UTIL.getDataTestDir("hfile");
TestHRegionServerBulkLoad.createHFile(fs, fileNotInRootDir, cf, fam, value, 1000);
assertNotEquals("directory used is not an HBase root dir", UTIL.getDefaultRootDirPath(),
fileNotInRootDir);
System.setOut(ps);
new HFilePrettyPrinter(conf).run(new String[] { "-s", "-d", String.valueOf(fileNotInRootDir) });
String result = stream.toString();
LOG.info(result);
// split out the output into sections based on the headers
String[] headers =
new String[] { "Key length", "Val length", "Row size (bytes)", "Row size (columns)" };
// for each section, there is a corresponding expected (count, range) pairs
int[][] expectations = new int[][] { new int[] { 0, 10, 1000, 50 }, new int[] { 0, 1, 1000, 3 },
new int[] { 0, 10, 1000, 50 }, new int[] { 1000, 1 }, };
for (int i = 0; i < headers.length - 1; i++) {
int idx = result.indexOf(headers[i]);
int nextIdx = result.indexOf(headers[i + 1]);
assertContainsRanges(result.substring(idx, nextIdx), expectations[i]);
}
}
private void assertContainsRanges(String result, int... rangeCountPairs) {
for (int i = 0; i < rangeCountPairs.length - 1; i += 2) {
String expected = rangeCountPairs[i + 1] + " <= " + rangeCountPairs[i];
assertTrue("expected:\n" + result + "\nto contain: '" + expected + "'",
result.contains(expected));
}
}
@Test
public void testKeyValueStats() {
HFilePrettyPrinter.KeyValueStats stats =
new HFilePrettyPrinter.KeyValueStats(new MetricRegistry(), "test");
long[] ranges = stats.getRanges();
for (long range : ranges) {
stats.update(range - 1, true);
}
assertEquals(ranges[ranges.length - 1] - 1, stats.getMax());
assertEquals(ranges[0] - 1, stats.getMin());
int total = 1;
for (long range : ranges) {
long val = stats.getCountAtOrBelow(range);
assertEquals(total++, val);
}
}
}