Docs: Add more points to the chart that gives accuracy for the cardinality aggregation.
This also adds instructions on how to regenerate the chart.
This commit is contained in:
parent
9b1a477120
commit
1ed6c5d110
|
@ -69,11 +69,70 @@ about `c * 8` bytes.
|
||||||
|
|
||||||
The following chart shows how the error varies before and after the threshold:
|
The following chart shows how the error varies before and after the threshold:
|
||||||
|
|
||||||
|
////
|
||||||
|
To generate this chart, use this gnuplot script:
|
||||||
|
-------
|
||||||
|
#!/usr/bin/gnuplot
# Plots the data file "test.dat" as three line series on a logarithmic x axis:
# column 1 is the x value ("Actual cardinality"); columns 2, 3 and 4 are the
# y values ("Relative error (%)") labelled threshold=100, threshold=1000 and
# threshold=10000 respectively.
# NOTE(review): the terminal is PNG but no "set output" appears in this script,
# so the image presumably has to be redirected to a file by the caller - confirm.
|
||||||
|
reset
|
||||||
|
set terminal png size 1000,400
|
||||||
|
|
||||||
|
set xlabel "Actual cardinality"
|
||||||
|
set logscale x
|
||||||
|
|
||||||
|
set ylabel "Relative error (%)"
|
||||||
|
set yrange [0:8]
|
||||||
|
|
||||||
|
set title "Cardinality error"
|
||||||
|
set grid
|
||||||
|
|
||||||
|
set style data lines
|
||||||
|
|
||||||
|
plot "test.dat" using 1:2 title "threshold=100", \
|
||||||
|
"" using 1:3 title "threshold=1000", \
|
||||||
|
"" using 1:4 title "threshold=10000"
|
||||||
|
#
|
||||||
|
-------
|
||||||
|
|
||||||
|
and generate data in a 'test.dat' file using the Java code below:
|
||||||
|
|
||||||
|
-------
|
||||||
|
/**
 * Computes the relative error of a sketch's estimate against a known count:
 * {@code |expected - estimate| / expected}, returned as a fraction (not a percentage).
 *
 * @param h        sketch whose estimate is read with {@code h.cardinality(0)};
 *                 the argument 0 is presumably a bucket ordinal - TODO confirm
 * @param expected the true count to compare against; it is the divisor, so a value
 *                 of 0 would make the result NaN or infinite
 * @return the absolute difference between {@code expected} and the estimate, divided by {@code expected}
 */
private static double error(HyperLogLogPlusPlus h, long expected) {
|
||||||
|
double actual = h.cardinality(0);
|
||||||
|
return Math.abs(expected - actual) / expected;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Prints the data rows for the error chart to standard output.
 *
 * Three sketches are built with precisions derived from thresholds 100, 1000 and 10000.
 * The values 1..10,000,000 are hashed with {@code BitMixer.mix64} and collected into all
 * three sketches. At each sample point one line is printed with four space-separated
 * columns: the number of values collected so far, then the error of each sketch
 * multiplied by 100 (i.e. as a percentage), in threshold order 100, 1000, 10000.
 *
 * @param args unused
 */
public static void main(String[] args) {
|
||||||
|
HyperLogLogPlusPlus h100 = new HyperLogLogPlusPlus(precisionFromThreshold(100), BigArrays.NON_RECYCLING_INSTANCE, 1);
|
||||||
|
HyperLogLogPlusPlus h1000 = new HyperLogLogPlusPlus(precisionFromThreshold(1000), BigArrays.NON_RECYCLING_INSTANCE, 1);
|
||||||
|
HyperLogLogPlusPlus h10000 = new HyperLogLogPlusPlus(precisionFromThreshold(10000), BigArrays.NON_RECYCLING_INSTANCE, 1);
|
||||||
|
|
||||||
|
// Sampling schedule: the first sample is taken at i == 100, then every `step` values.
int next = 100;
|
||||||
|
int step = 10;
|
||||||
|
|
||||||
|
for (int i = 1; i <= 10000000; ++i) {
|
||||||
|
long h = BitMixer.mix64(i);
|
||||||
|
h100.collect(0, h);
|
||||||
|
h1000.collect(0, h);
|
||||||
|
h10000.collect(0, h);
|
||||||
|
|
||||||
|
if (i == next) {
|
||||||
|
// i distinct inputs have been fed so far, so i is passed as the expected count.
System.out.println(i + " " + error(h100, i)*100 + " " + error(h1000, i)*100 + " " + error(h10000, i)*100);
|
||||||
|
next += step;
|
||||||
|
// Once `next` reaches 100 * step, grow the step tenfold: samples fall at
// 100, 110, ..., 1000, 1100, ..., 10000, 11000, ... (90 samples per decade).
if (next >= 100 * step) {
|
||||||
|
step *= 10;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
-------
|
||||||
|
|
||||||
|
////
|
||||||
|
|
||||||
image:images/cardinality_error.png[]
|
image:images/cardinality_error.png[]
|
||||||
|
|
||||||
For all 3 thresholds, counts have been accurate up to the configured threshold
|
For all 3 thresholds, counts have been accurate up to the configured threshold
|
||||||
(although not guaranteed, this is likely to be the case). Please also note that
|
(although not guaranteed, this is likely to be the case). Please also note that
|
||||||
even with a threshold as low as 100, the error remains under 5%, even when
|
even with a threshold as low as 100, the error remains very low, even when
|
||||||
counting millions of items.
|
counting millions of items.
|
||||||
|
|
||||||
==== Pre-computed hashes
|
==== Pre-computed hashes
|
||||||
|
|
Binary file not shown.
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 12 KiB |
Loading…
Reference in New Issue