Docs: Add more points to the chart that gives accuracy for the cardinality aggregation.
This also adds instructions on how to regenerate the chart.
This commit is contained in:
parent
9b1a477120
commit
1ed6c5d110
|
@ -69,11 +69,70 @@ about `c * 8` bytes.
|
|||
|
||||
The following chart shows how the error varies before and after the threshold:
|
||||
|
||||
////
|
||||
To generate this chart use this gnuplot script:
|
||||
-------
|
||||
#!/usr/bin/gnuplot
|
||||
reset
|
||||
# Render as a 1000x400 PNG. No 'set output' is given, so gnuplot writes the
# image to stdout; redirect it to the desired file when running the script.
set terminal png size 1000,400
|
||||
|
||||
# X axis: the actual cardinality, on a logarithmic scale.
set xlabel "Actual cardinality"
|
||||
set logscale x
|
||||
|
||||
# Y axis: relative error in percent, clamped to the 0-8 range.
set ylabel "Relative error (%)"
|
||||
set yrange [0:8]
|
||||
|
||||
set title "Cardinality error"
|
||||
set grid
|
||||
|
||||
# Draw every series as a line.
set style data lines
|
||||
|
||||
# Column 1 of test.dat is the cardinality; columns 2, 3 and 4 are the errors
# for thresholds 100, 1000 and 10000 respectively.
plot "test.dat" using 1:2 title "threshold=100", \
|
||||
"" using 1:3 title "threshold=1000", \
|
||||
"" using 1:4 title "threshold=10000"
|
||||
#
|
||||
-------
|
||||
|
||||
and generate the data in a 'test.dat' file using the Java code below:
|
||||
|
||||
-------
|
||||
// Returns the relative error of the sketch's cardinality estimate against the
// expected count, as a fraction (main multiplies it by 100 to get a percentage).
private static double error(HyperLogLogPlusPlus h, long expected) {
|
||||
// Estimated cardinality; the argument 0 is presumably the bucket ordinal -- TODO confirm.
double actual = h.cardinality(0);
|
||||
// 'actual' is a double, so the subtraction and the division are floating-point.
return Math.abs(expected - actual) / expected;
|
||||
}
|
||||
|
||||
// Prints one line per sample point: the number of values collected so far,
// followed by the relative error (in percent) of three sketches whose
// precision is derived from thresholds 100, 1000 and 10000.
public static void main(String[] args) {
|
||||
// One sketch per threshold; the trailing 1 is presumably the initial bucket count -- TODO confirm.
HyperLogLogPlusPlus h100 = new HyperLogLogPlusPlus(precisionFromThreshold(100), BigArrays.NON_RECYCLING_INSTANCE, 1);
|
||||
HyperLogLogPlusPlus h1000 = new HyperLogLogPlusPlus(precisionFromThreshold(1000), BigArrays.NON_RECYCLING_INSTANCE, 1);
|
||||
HyperLogLogPlusPlus h10000 = new HyperLogLogPlusPlus(precisionFromThreshold(10000), BigArrays.NON_RECYCLING_INSTANCE, 1);
|
||||
|
||||
// Next value of i at which a sample is printed, and the current spacing between samples.
int next = 100;
|
||||
int step = 10;
|
||||
|
||||
// Feed the values 1..10,000,000; every i is distinct, so i is also the expected count.
for (int i = 1; i <= 10000000; ++i) {
|
||||
// Hash the value once and feed the same hash to all three sketches.
long h = BitMixer.mix64(i);
|
||||
h100.collect(0, h);
|
||||
h1000.collect(0, h);
|
||||
h10000.collect(0, h);
|
||||
|
||||
if (i == next) {
|
||||
// Output columns: count, then error in percent for thresholds 100, 1000, 10000.
System.out.println(i + " " + error(h100, i)*100 + " " + error(h1000, i)*100 + " " + error(h10000, i)*100);
|
||||
next += step;
|
||||
// Grow the spacing tenfold once 'next' reaches 100 * step, so each
// power-of-ten range gets roughly the same number of samples.
if (next >= 100 * step) {
|
||||
step *= 10;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
-------
|
||||
|
||||
////
|
||||
|
||||
image:images/cardinality_error.png[]
|
||||
|
||||
For all 3 thresholds, counts have been accurate up to the configured threshold
|
||||
(although not guaranteed, this is likely to be the case). Please also note that
|
||||
even with a threshold as low as 100, the error remains under 5%, even when
|
||||
even with a threshold as low as 100, the error remains very low, even when
|
||||
counting millions of items.
|
||||
|
||||
==== Pre-computed hashes
|
||||
|
|
Binary file not shown.
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 12 KiB |
Loading…
Reference in New Issue